feat: add metadata scorer, photo hash dedup, and trust aggregator
This commit is contained in:
parent
1672e215b2
commit
ee3c85bfb0
8 changed files with 359 additions and 0 deletions
41
app/trust/__init__.py
Normal file
41
app/trust/__init__.py
Normal file
|
|
@@ -0,0 +1,41 @@
|
|||
from .metadata import MetadataScorer
|
||||
from .photo import PhotoScorer
|
||||
from .aggregator import Aggregator
|
||||
from app.db.models import Seller, Listing, TrustScore
|
||||
from app.db.store import Store
|
||||
import hashlib
|
||||
|
||||
|
||||
class TrustScorer:
    """Orchestrates metadata + photo scoring for a batch of listings."""

    # Signal names MetadataScorer produces; used as a None-filled placeholder
    # when the seller record is missing.
    _SIGNAL_KEYS = (
        "account_age",
        "feedback_count",
        "feedback_ratio",
        "price_vs_market",
        "category_history",
    )

    def __init__(self, store: Store):
        self._store = store
        self._meta = MetadataScorer()
        self._photo = PhotoScorer()
        self._agg = Aggregator()

    def score_batch(
        self,
        listings: list[Listing],
        query: str,
    ) -> list[TrustScore]:
        """Score every listing against market data looked up for *query*.

        Returns one TrustScore per listing, in the same order as *listings*.
        """
        # Market comps are keyed by an MD5 of the query string (cache key, not
        # security-sensitive).
        digest = hashlib.md5(query.encode()).hexdigest()
        comp = self._store.get_market_comp("ebay", digest)
        median = comp.median_price if comp else None

        # Photo dedup works on the whole batch at once.
        dup_flags = self._photo.check_duplicates(
            [item.photo_urls for item in listings]
        )

        return [
            self._score_one(item, flagged, median)
            for item, flagged in zip(listings, dup_flags)
        ]

    def _score_one(self, listing, is_dup, market_median):
        """Score a single listing; a missing seller yields all-None signals."""
        seller = self._store.get_seller("ebay", listing.seller_platform_id)
        if seller:
            signal_scores = self._meta.score(seller, market_median, listing.price)
        else:
            # dict.fromkeys defaults every signal to None (data unavailable).
            signal_scores = dict.fromkeys(self._SIGNAL_KEYS)
        return self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0)
|
||||
56
app/trust/aggregator.py
Normal file
56
app/trust/aggregator.py
Normal file
|
|
@@ -0,0 +1,56 @@
|
|||
"""Composite score and red flag extraction."""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from typing import Optional
|
||||
from app.db.models import Seller, TrustScore
|
||||
|
||||
# Hard-filter thresholds: conditions severe enough to red-flag outright.
HARD_FILTER_AGE_DAYS = 7
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80


class Aggregator:
    """Combines per-signal scores into a composite TrustScore with red flags."""

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
    ) -> TrustScore:
        """Build a TrustScore from the five metadata signals plus photo dedup.

        Args:
            signal_scores: 0-20 score per signal; None means the signal's
                underlying data was unavailable.
            photo_hash_duplicate: True if the listing's primary photo duplicates
                another listing in the batch.
            seller: seller record, or None when it could not be fetched.
            listing_id: listing primary key (0 when not yet persisted).
        """
        is_partial = any(v is None for v in signal_scores.values())
        # Missing signals contribute 0 to the composite; score_is_partial
        # records that the total is an underestimate, not a verdict.
        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
        composite = sum(clean.values())

        red_flags: list[str] = []

        # Hard filters
        if seller and seller.account_age_days < HARD_FILTER_AGE_DAYS:
            red_flags.append("new_account")
        if seller and (
            seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
            and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
        ):
            red_flags.append("established_bad_actor")

        # Soft flags
        if seller and seller.account_age_days < 30:
            red_flags.append("account_under_30_days")
        if seller and seller.feedback_count < 10:
            red_flags.append("low_feedback_count")
        # BUG FIX: check the raw signal, not the None->0 cleaned value.
        # Previously a listing with *missing* market data (price signal None)
        # was flagged "suspicious_price"; None now only sets score_is_partial.
        if signal_scores["price_vs_market"] == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )
|
||||
67
app/trust/metadata.py
Normal file
67
app/trust/metadata.py
Normal file
|
|
@@ -0,0 +1,67 @@
|
|||
"""Five metadata trust signals, each scored 0–20."""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from typing import Optional
|
||||
from app.db.models import Seller
|
||||
|
||||
# Category ids that count as "electronics experience" for the history signal.
ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}


class MetadataScorer:
    """Scores five seller/listing metadata trust signals, each 0-20."""

    def score(
        self,
        seller: Seller,
        market_median: Optional[float],
        listing_price: float,
    ) -> dict[str, Optional[int]]:
        """Return {signal_name: 0-20 score}.

        ``price_vs_market`` is None when no market median is available —
        the aggregator then marks the composite score as partial.
        """
        return {
            "account_age": self._account_age(seller.account_age_days),
            "feedback_count": self._feedback_count(seller.feedback_count),
            "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count),
            "price_vs_market": self._price_vs_market(listing_price, market_median),
            "category_history": self._category_history(seller.category_history_json),
        }

    def _account_age(self, days: int) -> int:
        """Older accounts earn more trust; under a week is the scam-typical band."""
        if days < 7:
            return 0
        if days < 30:
            return 5
        if days < 90:
            return 10
        if days < 365:
            return 15
        return 20

    def _feedback_count(self, count: int) -> int:
        """More completed-transaction feedback → more trust."""
        if count < 3:
            return 0
        if count < 10:
            return 5
        if count < 50:
            return 10
        if count < 200:
            return 15
        return 20

    def _feedback_ratio(self, ratio: float, count: int) -> int:
        """Positive-feedback ratio, weighted by sample size.

        A bad ratio on an *established* account (count > 20) is the strongest
        negative signal; the same ratio on a tiny sample is merely weak.
        """
        if ratio < 0.80 and count > 20:
            return 0
        if ratio < 0.90:
            return 5
        if ratio < 0.95:
            return 10
        if ratio < 0.98:
            return 15
        return 20

    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
        """Score listing price relative to the market median.

        Returns None when no usable median exists (missing or non-positive) —
        the aggregator sets score_is_partial in that case.
        """
        if median is None:
            return None
        if median <= 0:
            return None
        ratio = price / median
        if ratio < 0.50:
            return 0   # >50% below market = scam territory
        if ratio < 0.70:
            return 5   # >30% below market = suspicious
        if ratio < 0.85:
            return 10
        if ratio <= 1.20:
            return 20
        return 15      # above market = still ok, just expensive

    def _category_history(self, category_history_json: str) -> int:
        """Score electronics sales volume from a JSON ``{category: count}`` map.

        Returns 0 for malformed JSON *or* a JSON value that is not an object.
        (BUG FIX: the previous version raised AttributeError when the JSON
        parsed to a non-dict such as ``[1, 2]`` or ``3`` — only decode errors
        were caught.)
        """
        try:
            history = json.loads(category_history_json)
        except (ValueError, TypeError):
            return 0
        if not isinstance(history, dict):
            return 0
        electronics_sales = sum(
            v for k, v in history.items() if k in ELECTRONICS_CATEGORIES
        )
        if electronics_sales == 0:
            return 0
        if electronics_sales < 5:
            return 8
        if electronics_sales < 20:
            return 14
        return 20
|
||||
74
app/trust/photo.py
Normal file
74
app/trust/photo.py
Normal file
|
|
@@ -0,0 +1,74 @@
|
|||
"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
import io
|
||||
import requests
|
||||
|
||||
# imagehash/PIL are optional; without them we degrade to URL-equality dedup.
try:
    import imagehash
    from PIL import Image
    _IMAGEHASH_AVAILABLE = True
except ImportError:
    _IMAGEHASH_AVAILABLE = False


class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Returns a list of booleans parallel to photo_urls_per_listing.
        True = this listing's primary photo is a duplicate of another listing in the set.
        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # Fast path: URL equality is a trivial duplicate signal (no fetch
        # needed). NOTE: it flags only the *later* occurrence; the hash path
        # below flags both ends of a pair.
        results = self._url_dedup([[u] for u in primary_urls])

        # Perceptual-hash pass: flag both listings of any matching pair.
        seen: dict[str, int] = {}  # hash digest -> index of first listing with it
        for i, url in enumerate(primary_urls):
            digest = self._fetch_hash(url)
            if digest is None:
                # Fetch/decode failed — keep the URL-equality verdict.
                continue
            if digest in seen:
                results[i] = True
                results[seen[digest]] = True
            else:
                seen[digest] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Download *url* and return its perceptual hash, or None on any failure.

        BUG FIX: the response is now used as a context manager so the
        underlying connection is released. The previous version combined
        ``stream=True`` with ``.content`` (which reads the full body anyway)
        and never closed the response, leaking the connection.
        """
        if not url:
            return None
        try:
            with requests.get(url, timeout=5) as resp:
                resp.raise_for_status()
                img = Image.open(io.BytesIO(resp.content))
                return str(imagehash.phash(img))
        except Exception:
            # Best-effort: a failed fetch must never break scoring.
            return None

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """URL-equality fallback: flag listings whose primary URL was seen earlier."""
        seen: set[str] = set()
        results = []
        for urls in photo_urls_per_listing:
            primary = urls[0] if urls else ""
            if primary and primary in seen:
                results.append(True)
            else:
                if primary:
                    seen.add(primary)
                results.append(False)
        return results
|
||||
0
tests/trust/__init__.py
Normal file
0
tests/trust/__init__.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
|
|
@@ -0,0 +1,52 @@
|
|||
from app.db.models import Seller
|
||||
from app.trust.aggregator import Aggregator
|
||||
|
||||
|
||||
def test_composite_sum_of_five_signals():
    """Composite is the plain sum of the five 0-20 signal scores."""
    signal_scores = {
        "account_age": 18,
        "feedback_count": 16,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    result = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert result.composite_score == 83
|
||||
|
||||
|
||||
def test_hard_filter_new_account():
    """An account younger than the hard-filter threshold gets the new_account flag."""
    signal_names = [
        "account_age", "feedback_count",
        "feedback_ratio", "price_vs_market", "category_history",
    ]
    perfect_scores = dict.fromkeys(signal_names, 20)
    young_seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=3, feedback_count=0,
        feedback_ratio=1.0, category_history_json="{}",
    )
    result = Aggregator().aggregate(
        perfect_scores, photo_hash_duplicate=False, seller=young_seller
    )
    assert "new_account" in result.red_flags_json
|
||||
|
||||
|
||||
def test_hard_filter_bad_actor_established_account():
    """Established account (count > 20) with very bad ratio → hard filter."""
    signal_names = [
        "account_age", "feedback_count",
        "feedback_ratio", "price_vs_market", "category_history",
    ]
    middling_scores = dict.fromkeys(signal_names, 10)
    bad_seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730,
        feedback_count=25,        # count > 20
        feedback_ratio=0.70,      # ratio < 80% → hard filter
        category_history_json="{}",
    )
    result = Aggregator().aggregate(
        middling_scores, photo_hash_duplicate=False, seller=bad_seller
    )
    assert "established_bad_actor" in result.red_flags_json
|
||||
|
||||
|
||||
def test_partial_score_flagged_when_signals_missing():
    """A None signal (data unavailable) must mark the composite as partial."""
    signal_scores = {
        "account_age": 18,
        "feedback_count": None,  # None = unavailable
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    result = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert result.score_is_partial is True
|
||||
45
tests/trust/test_metadata.py
Normal file
45
tests/trust/test_metadata.py
Normal file
|
|
@@ -0,0 +1,45 @@
|
|||
from app.db.models import Seller
|
||||
from app.trust.metadata import MetadataScorer
|
||||
|
||||
|
||||
def _seller(**kwargs) -> Seller:
    """Factory: an established, reputable seller; override fields per test."""
    base = dict(
        platform="ebay",
        platform_seller_id="u",
        username="u",
        account_age_days=730,
        feedback_count=450,
        feedback_ratio=0.991,
        category_history_json='{"ELECTRONICS": 30}',
    )
    base.update(kwargs)
    return Seller(**base)
|
||||
|
||||
|
||||
def test_established_seller_scores_high():
    """A strong seller with a fair price lands in the 80+ composite band."""
    scores = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=950.0
    )
    assert sum(scores.values()) >= 80
|
||||
|
||||
|
||||
def test_new_account_scores_zero_on_age():
    """Accounts under a week old score 0 on the age signal."""
    scores = MetadataScorer().score(
        _seller(account_age_days=3), market_median=1000.0, listing_price=950.0
    )
    assert scores["account_age"] == 0
|
||||
|
||||
|
||||
def test_low_feedback_count_scores_low():
    """Fewer than 3 feedbacks keeps the count signal in the bottom bands."""
    scores = MetadataScorer().score(
        _seller(feedback_count=2), market_median=1000.0, listing_price=950.0
    )
    assert scores["feedback_count"] < 10
|
||||
|
||||
|
||||
def test_suspicious_price_scores_zero():
    """A price more than 50% below the market median scores 0 (scam signal)."""
    scores = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=400.0  # 60% below market
    )
    assert scores["price_vs_market"] == 0
|
||||
|
||||
|
||||
def test_no_market_data_returns_none():
    """Missing market comp → price signal is None, not a numeric score.

    None signals "data unavailable"; the aggregator turns it into
    score_is_partial=True rather than penalizing the listing.
    """
    scores = MetadataScorer().score(
        _seller(), market_median=None, listing_price=950.0
    )
    assert scores["price_vs_market"] is None
|
||||
24
tests/trust/test_photo.py
Normal file
24
tests/trust/test_photo.py
Normal file
|
|
@@ -0,0 +1,24 @@
|
|||
from app.trust.photo import PhotoScorer
|
||||
|
||||
|
||||
def test_no_duplicates_in_single_listing_result():
    """Distinct primary photos across listings produce no duplicate flags."""
    # NOTE(review): if imagehash/PIL are installed this will attempt real
    # fetches of these fake URLs before falling back to URL equality —
    # confirm that is acceptable for this test suite.
    batch = [
        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
        ["https://img.com/c.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batch)
    assert not any(flags)
|
||||
|
||||
|
||||
def test_duplicate_photo_flagged():
    """Identical primary URL in two listings → at least one is flagged.

    (The URL-equality fallback flags only the later occurrence, while the
    perceptual-hash path flags both, so the assertion accepts either.)
    """
    batch = [
        ["https://img.com/same.jpg"],
        ["https://img.com/same.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batch)
    assert flags[0] or flags[1]
|
||||
Loading…
Reference in a new issue