feat: add metadata scorer, photo hash dedup, and trust aggregator

This commit is contained in:
pyr0ball 2026-03-25 12:57:56 -07:00
parent 1672e215b2
commit ee3c85bfb0
8 changed files with 359 additions and 0 deletions

41
app/trust/__init__.py Normal file
View file

@ -0,0 +1,41 @@
from .metadata import MetadataScorer
from .photo import PhotoScorer
from .aggregator import Aggregator
from app.db.models import Seller, Listing, TrustScore
from app.db.store import Store
import hashlib
class TrustScorer:
    """Run metadata and photo scoring over a batch of listings and aggregate the results."""

    def __init__(self, store: Store):
        self._store = store
        self._meta = MetadataScorer()
        self._photo = PhotoScorer()
        self._agg = Aggregator()

    def score_batch(
        self,
        listings: list[Listing],
        query: str,
    ) -> list[TrustScore]:
        """Return one ``TrustScore`` per listing, in the same order as ``listings``.

        The market comparison is looked up in the store under the MD5 hex
        digest of ``query``; when none is found the median price passed to the
        metadata scorer is ``None``. Listings whose seller is not in the store
        get ``None`` for every metadata signal.
        """
        digest = hashlib.md5(query.encode()).hexdigest()
        comp = self._store.get_market_comp("ebay", digest)
        median_price = None if not comp else comp.median_price

        dup_flags = self._photo.check_duplicates(
            [item.photo_urls for item in listings]
        )

        results = []
        for item, dup in zip(listings, dup_flags):
            seller = self._store.get_seller("ebay", item.seller_platform_id)
            if not seller:
                # Unknown seller: no signal can be computed.
                signals = dict.fromkeys(
                    [
                        "account_age",
                        "feedback_count",
                        "feedback_ratio",
                        "price_vs_market",
                        "category_history",
                    ],
                    None,
                )
            else:
                signals = self._meta.score(seller, median_price, item.price)
            results.append(
                self._agg.aggregate(signals, dup, seller, item.id or 0)
            )
        return results

56
app/trust/aggregator.py Normal file
View file

@ -0,0 +1,56 @@
"""Composite score and red flag extraction."""
from __future__ import annotations
import json
from typing import Optional
from app.db.models import Seller, TrustScore
# Sellers whose account is younger than this many days get the "new_account" red flag.
HARD_FILTER_AGE_DAYS = 7
# A bad feedback ratio only counts as "established" above this many feedback entries.
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20
# Feedback ratio below this (with enough feedback) raises "established_bad_actor".
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80
class Aggregator:
    """Combine per-signal scores into a composite ``TrustScore`` and collect red flags."""

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
    ) -> TrustScore:
        """Build the ``TrustScore`` for one listing.

        Args:
            signal_scores: Score per signal name; ``None`` means the signal
                could not be computed. Must contain the keys ``account_age``,
                ``feedback_count``, ``feedback_ratio``, ``price_vs_market``
                and ``category_history``.
            photo_hash_duplicate: Whether the listing's photo was found to be
                a duplicate; adds the ``duplicate_photo`` flag.
            seller: Seller record used for the seller-based flags, or ``None``
                when unknown (those flags are then skipped).
            listing_id: Identifier stored on the resulting score.

        Returns:
            A ``TrustScore`` whose composite is the sum of all signals with
            missing ones counted as 0, and with ``score_is_partial`` set when
            any signal was ``None``.

        Raises:
            KeyError: If one of the five expected signal keys is absent.
        """
        is_partial = any(v is None for v in signal_scores.values())
        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
        composite = sum(clean.values())

        red_flags: list[str] = []
        if seller:
            # Hard filters
            if seller.account_age_days < HARD_FILTER_AGE_DAYS:
                red_flags.append("new_account")
            if (
                seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
                and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
            ):
                red_flags.append("established_bad_actor")
            # Soft flags
            if seller.account_age_days < 30:
                red_flags.append("account_under_30_days")
            if seller.feedback_count < 10:
                red_flags.append("low_feedback_count")

        # Check the raw signal, not the cleaned one: a missing price signal
        # (None) is coerced to 0 in `clean` and must not be reported as a
        # suspicious price -- it only makes the score partial.
        if signal_scores.get("price_vs_market") == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )

67
app/trust/metadata.py Normal file
View file

@ -0,0 +1,67 @@
"""Five metadata trust signals, each scored 020."""
from __future__ import annotations
import json
from typing import Optional
from app.db.models import Seller
# Category keys whose sale counts are summed when scoring a seller's category history.
ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}
class MetadataScorer:
    """Score a seller/listing pair on five metadata signals, each in the range 0-20."""

    def score(
        self,
        seller: Seller,
        market_median: Optional[float],
        listing_price: float,
    ) -> dict[str, Optional[int]]:
        """Return the five signal scores keyed by signal name.

        ``price_vs_market`` is ``None`` when no usable market median is
        available; every other signal is always an int.
        """
        return {
            "account_age": self._account_age(seller.account_age_days),
            "feedback_count": self._feedback_count(seller.feedback_count),
            "feedback_ratio": self._feedback_ratio(
                seller.feedback_ratio, seller.feedback_count
            ),
            "price_vs_market": self._price_vs_market(listing_price, market_median),
            "category_history": self._category_history(seller.category_history_json),
        }

    def _account_age(self, days: int) -> int:
        """Score account age in days: older accounts score higher."""
        if days < 7:
            return 0
        if days < 30:
            return 5
        if days < 90:
            return 10
        if days < 365:
            return 15
        return 20

    def _feedback_count(self, count: int) -> int:
        """Score the number of feedback entries: more feedback scores higher."""
        if count < 3:
            return 0
        if count < 10:
            return 5
        if count < 50:
            return 10
        if count < 200:
            return 15
        return 20

    def _feedback_ratio(self, ratio: float, count: int) -> int:
        """Score the positive-feedback ratio.

        A ratio below 0.80 only scores 0 when it is backed by more than 20
        feedback entries; with fewer entries it falls through to 5.
        """
        if ratio < 0.80 and count > 20:
            return 0
        if ratio < 0.90:
            return 5
        if ratio < 0.95:
            return 10
        if ratio < 0.98:
            return 15
        return 20

    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
        """Score the listing price relative to the market median.

        Returns ``None`` when the median is missing or not positive, so the
        aggregator can mark the score as partial.
        """
        if median is None:
            return None  # data unavailable -> aggregator sets score_is_partial
        if median <= 0:
            return None
        ratio = price / median
        if ratio < 0.50:
            return 0  # >50% below = scam
        if ratio < 0.70:
            return 5  # >30% below = suspicious
        if ratio < 0.85:
            return 10
        if ratio <= 1.20:
            return 20
        return 15  # above market = still ok, just expensive

    def _category_history(self, category_history_json: str) -> int:
        """Score the seller's electronics sales found in the JSON history.

        Anything that is not a JSON object mapping category to a numeric
        count (invalid JSON, ``null``, a list, a bare string, ...) scores 0
        instead of raising.
        """
        try:
            history = json.loads(category_history_json)
        except (ValueError, TypeError):
            return 0
        # json.loads can legitimately return None, a list, a str or a number.
        if not isinstance(history, dict):
            return 0
        electronics_sales = sum(
            count
            for category, count in history.items()
            if category in ELECTRONICS_CATEGORIES and isinstance(count, (int, float))
        )
        if electronics_sales == 0:
            return 0
        if electronics_sales < 5:
            return 8
        if electronics_sales < 20:
            return 14
        return 20

74
app/trust/photo.py Normal file
View file

@ -0,0 +1,74 @@
"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
from __future__ import annotations
from typing import Optional
import io
import requests
try:
import imagehash
from PIL import Image
_IMAGEHASH_AVAILABLE = True
except ImportError:
_IMAGEHASH_AVAILABLE = False
class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Returns a list of booleans parallel to photo_urls_per_listing.
        True = this listing's primary photo is a duplicate of another listing in the set.
        Every listing that shares a primary photo is flagged, including the first one seen.
        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
        Listings without photos are never flagged.
        """
        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]
        # URL equality is a trivial duplicate signal (no fetch needed).
        results = self._url_dedup([[u] for u in primary_urls])
        if not _IMAGEHASH_AVAILABLE:
            return results

        hash_by_url: dict[str, Optional[str]] = {}  # fetch each distinct URL once
        first_index_by_hash: dict[str, int] = {}
        for i, url in enumerate(primary_urls):
            if url not in hash_by_url:
                hash_by_url[url] = self._fetch_hash(url)
            h = hash_by_url[url]
            if h is None:
                continue
            if h in first_index_by_hash:
                results[i] = True
                results[first_index_by_hash[h]] = True
            else:
                first_index_by_hash[h] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Download ``url`` and return its perceptual hash as a string.

        Returns ``None`` for an empty URL or on any download/decoding failure;
        hashing is best-effort and the caller falls back to URL equality.
        """
        if not url:
            return None
        try:
            # The context manager releases the connection even when
            # raise_for_status() raises before the body has been read.
            with requests.get(url, timeout=5) as resp:
                resp.raise_for_status()
                payload = resp.content
            with Image.open(io.BytesIO(payload)) as img:
                return str(imagehash.phash(img))
        except Exception:
            # Deliberately broad: network, HTTP and image-decoding errors all
            # mean "no hash available" for this listing.
            return None

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """Flag listings whose primary photo URL also appears on another listing.

        Both the first and every later occurrence of a URL are flagged, which
        matches what the hash-based comparison does. Listings with no photos
        or an empty primary URL are never flagged.
        """
        first_index_by_url: dict[str, int] = {}
        results = [False] * len(photo_urls_per_listing)
        for i, urls in enumerate(photo_urls_per_listing):
            primary = urls[0] if urls else ""
            if not primary:
                continue
            if primary in first_index_by_url:
                results[i] = True
                results[first_index_by_url[primary]] = True
            else:
                first_index_by_url[primary] = i
        return results

0
tests/trust/__init__.py Normal file
View file

View file

@ -0,0 +1,52 @@
from app.db.models import Seller
from app.trust.aggregator import Aggregator
def test_composite_sum_of_five_signals():
    """The composite score is the plain sum of the five signal scores."""
    signal_scores = dict(
        account_age=18,
        feedback_count=16,
        feedback_ratio=20,
        price_vs_market=15,
        category_history=14,
    )
    trust = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert trust.composite_score == 83
def test_hard_filter_new_account():
    """A three-day-old account carries the ``new_account`` red flag."""
    signal_names = [
        "account_age",
        "feedback_count",
        "feedback_ratio",
        "price_vs_market",
        "category_history",
    ]
    full_marks = dict.fromkeys(signal_names, 20)
    young_seller = Seller(
        platform="ebay",
        platform_seller_id="u",
        username="u",
        account_age_days=3,
        feedback_count=0,
        feedback_ratio=1.0,
        category_history_json="{}",
    )
    trust = Aggregator().aggregate(
        full_marks, photo_hash_duplicate=False, seller=young_seller
    )
    assert "new_account" in trust.red_flags_json
def test_hard_filter_bad_actor_established_account():
    """More than 20 feedback entries with a ratio under 80% triggers the hard filter."""
    signal_names = [
        "account_age",
        "feedback_count",
        "feedback_ratio",
        "price_vs_market",
        "category_history",
    ]
    middling = dict.fromkeys(signal_names, 10)
    bad_seller = Seller(
        platform="ebay",
        platform_seller_id="u",
        username="u",
        account_age_days=730,
        feedback_count=25,  # above the 20-entry minimum
        feedback_ratio=0.70,  # below the 80% threshold
        category_history_json="{}",
    )
    trust = Aggregator().aggregate(
        middling, photo_hash_duplicate=False, seller=bad_seller
    )
    assert "established_bad_actor" in trust.red_flags_json
def test_partial_score_flagged_when_signals_missing():
    """A ``None`` signal marks the resulting score as partial."""
    signal_scores = dict(
        account_age=18,
        feedback_count=None,  # unavailable signal
        feedback_ratio=20,
        price_vs_market=15,
        category_history=14,
    )
    trust = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert trust.score_is_partial is True

View file

@ -0,0 +1,45 @@
from app.db.models import Seller
from app.trust.metadata import MetadataScorer
def _seller(**kwargs) -> Seller:
    """Build a ``Seller`` with established-account defaults, overridden by ``kwargs``."""
    fields = {
        "platform": "ebay",
        "platform_seller_id": "u",
        "username": "u",
        "account_age_days": 730,
        "feedback_count": 450,
        "feedback_ratio": 0.991,
        "category_history_json": '{"ELECTRONICS": 30}',
        **kwargs,
    }
    return Seller(**fields)
def test_established_seller_scores_high():
    """The default established seller at a near-market price totals at least 80."""
    signals = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=950.0
    )
    assert sum(signals.values()) >= 80
def test_new_account_scores_zero_on_age():
    """A three-day-old account gets 0 on the account-age signal."""
    newcomer = _seller(account_age_days=3)
    signals = MetadataScorer().score(
        newcomer, market_median=1000.0, listing_price=950.0
    )
    assert signals["account_age"] == 0
def test_low_feedback_count_scores_low():
    """Two feedback entries score below 10 on the feedback-count signal."""
    sparse = _seller(feedback_count=2)
    signals = MetadataScorer().score(
        sparse, market_median=1000.0, listing_price=950.0
    )
    assert signals["feedback_count"] < 10
def test_suspicious_price_scores_zero():
    """A listing priced 60% below the market median scores 0 on price."""
    signals = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=400.0
    )
    assert signals["price_vs_market"] == 0
def test_no_market_data_returns_none():
    """Without a market median the price signal is ``None`` (data unavailable)."""
    signals = MetadataScorer().score(
        _seller(), market_median=None, listing_price=950.0
    )
    # The scorer reports "unavailable" as None rather than as a numeric score.
    assert signals["price_vs_market"] is None

24
tests/trust/test_photo.py Normal file
View file

@ -0,0 +1,24 @@
from app.trust.photo import PhotoScorer
def test_no_duplicates_in_single_listing_result():
    """Listings with distinct primary photo URLs are not flagged."""
    # NOTE(review): with imagehash installed this appears to issue real HTTP
    # requests to img.com -- confirm whether that is intended for a unit test.
    listings_photos = [
        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
        ["https://img.com/c.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(listings_photos)
    assert not any(flags)
def test_duplicate_photo_flagged():
    """Two listings sharing the same primary photo URL yield a duplicate flag."""
    shared_url = "https://img.com/same.jpg"
    listings_photos = [[shared_url], [shared_url]]
    results = PhotoScorer().check_duplicates(listings_photos)
    # At least one of the two listings must be flagged.
    assert results[0] is True or results[1] is True