feat: add metadata scorer, photo hash dedup, and trust aggregator
This commit is contained in:
parent
1672e215b2
commit
ee3c85bfb0
8 changed files with 359 additions and 0 deletions
41
app/trust/__init__.py
Normal file
41
app/trust/__init__.py
Normal file
|
|
@@ -0,0 +1,41 @@
|
|||
from .metadata import MetadataScorer
|
||||
from .photo import PhotoScorer
|
||||
from .aggregator import Aggregator
|
||||
from app.db.models import Seller, Listing, TrustScore
|
||||
from app.db.store import Store
|
||||
import hashlib
|
||||
|
||||
|
||||
class TrustScorer:
    """Orchestrates metadata + photo scoring for a batch of listings."""

    # Signal names MetadataScorer produces; used as a None-filled placeholder
    # when the seller record is missing.
    _SIGNAL_KEYS = (
        "account_age",
        "feedback_count",
        "feedback_ratio",
        "price_vs_market",
        "category_history",
    )

    def __init__(self, store: Store):
        self._store = store
        self._meta = MetadataScorer()
        self._photo = PhotoScorer()
        self._agg = Aggregator()

    def score_batch(
        self,
        listings: list[Listing],
        query: str,
    ) -> list[TrustScore]:
        """Score every listing against market data looked up for *query*.

        Returns one TrustScore per listing, in the same order as *listings*.
        """
        # Market comps are keyed by an MD5 of the query string (cache key, not
        # security-sensitive).
        digest = hashlib.md5(query.encode()).hexdigest()
        comp = self._store.get_market_comp("ebay", digest)
        median = comp.median_price if comp else None

        # Photo dedup works on the whole batch at once.
        dup_flags = self._photo.check_duplicates(
            [item.photo_urls for item in listings]
        )

        return [
            self._score_one(item, flagged, median)
            for item, flagged in zip(listings, dup_flags)
        ]

    def _score_one(self, listing, is_dup, market_median):
        """Score a single listing; a missing seller yields all-None signals."""
        seller = self._store.get_seller("ebay", listing.seller_platform_id)
        if seller:
            signal_scores = self._meta.score(seller, market_median, listing.price)
        else:
            # dict.fromkeys defaults every signal to None (data unavailable).
            signal_scores = dict.fromkeys(self._SIGNAL_KEYS)
        return self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0)
|
||||
56
app/trust/aggregator.py
Normal file
56
app/trust/aggregator.py
Normal file
|
|
@@ -0,0 +1,56 @@
|
|||
"""Composite score and red flag extraction."""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from typing import Optional
|
||||
from app.db.models import Seller, TrustScore
|
||||
|
||||
# Hard-filter thresholds: conditions severe enough to red-flag outright.
HARD_FILTER_AGE_DAYS = 7
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80


class Aggregator:
    """Combines per-signal scores into a composite TrustScore with red flags."""

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
    ) -> TrustScore:
        """Build a TrustScore from the five metadata signals plus photo dedup.

        Args:
            signal_scores: 0-20 score per signal; None means the signal's
                underlying data was unavailable.
            photo_hash_duplicate: True if the listing's primary photo duplicates
                another listing in the batch.
            seller: seller record, or None when it could not be fetched.
            listing_id: listing primary key (0 when not yet persisted).
        """
        is_partial = any(v is None for v in signal_scores.values())
        # Missing signals contribute 0 to the composite; score_is_partial
        # records that the total is an underestimate, not a verdict.
        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
        composite = sum(clean.values())

        red_flags: list[str] = []

        # Hard filters
        if seller and seller.account_age_days < HARD_FILTER_AGE_DAYS:
            red_flags.append("new_account")
        if seller and (
            seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
            and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
        ):
            red_flags.append("established_bad_actor")

        # Soft flags
        if seller and seller.account_age_days < 30:
            red_flags.append("account_under_30_days")
        if seller and seller.feedback_count < 10:
            red_flags.append("low_feedback_count")
        # BUG FIX: check the raw signal, not the None->0 cleaned value.
        # Previously a listing with *missing* market data (price signal None)
        # was flagged "suspicious_price"; None now only sets score_is_partial.
        if signal_scores["price_vs_market"] == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )
|
||||
67
app/trust/metadata.py
Normal file
67
app/trust/metadata.py
Normal file
|
|
@@ -0,0 +1,67 @@
|
|||
"""Five metadata trust signals, each scored 0–20."""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from typing import Optional
|
||||
from app.db.models import Seller
|
||||
|
||||
# Category ids that count as "electronics experience" for the history signal.
ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}


class MetadataScorer:
    """Scores five seller/listing metadata trust signals, each 0-20."""

    def score(
        self,
        seller: Seller,
        market_median: Optional[float],
        listing_price: float,
    ) -> dict[str, Optional[int]]:
        """Return {signal_name: 0-20 score}.

        ``price_vs_market`` is None when no market median is available —
        the aggregator then marks the composite score as partial.
        """
        return {
            "account_age": self._account_age(seller.account_age_days),
            "feedback_count": self._feedback_count(seller.feedback_count),
            "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count),
            "price_vs_market": self._price_vs_market(listing_price, market_median),
            "category_history": self._category_history(seller.category_history_json),
        }

    def _account_age(self, days: int) -> int:
        """Older accounts earn more trust; under a week is the scam-typical band."""
        if days < 7:
            return 0
        if days < 30:
            return 5
        if days < 90:
            return 10
        if days < 365:
            return 15
        return 20

    def _feedback_count(self, count: int) -> int:
        """More completed-transaction feedback → more trust."""
        if count < 3:
            return 0
        if count < 10:
            return 5
        if count < 50:
            return 10
        if count < 200:
            return 15
        return 20

    def _feedback_ratio(self, ratio: float, count: int) -> int:
        """Positive-feedback ratio, weighted by sample size.

        A bad ratio on an *established* account (count > 20) is the strongest
        negative signal; the same ratio on a tiny sample is merely weak.
        """
        if ratio < 0.80 and count > 20:
            return 0
        if ratio < 0.90:
            return 5
        if ratio < 0.95:
            return 10
        if ratio < 0.98:
            return 15
        return 20

    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
        """Score listing price relative to the market median.

        Returns None when no usable median exists (missing or non-positive) —
        the aggregator sets score_is_partial in that case.
        """
        if median is None:
            return None
        if median <= 0:
            return None
        ratio = price / median
        if ratio < 0.50:
            return 0   # >50% below market = scam territory
        if ratio < 0.70:
            return 5   # >30% below market = suspicious
        if ratio < 0.85:
            return 10
        if ratio <= 1.20:
            return 20
        return 15      # above market = still ok, just expensive

    def _category_history(self, category_history_json: str) -> int:
        """Score electronics sales volume from a JSON ``{category: count}`` map.

        Returns 0 for malformed JSON *or* a JSON value that is not an object.
        (BUG FIX: the previous version raised AttributeError when the JSON
        parsed to a non-dict such as ``[1, 2]`` or ``3`` — only decode errors
        were caught.)
        """
        try:
            history = json.loads(category_history_json)
        except (ValueError, TypeError):
            return 0
        if not isinstance(history, dict):
            return 0
        electronics_sales = sum(
            v for k, v in history.items() if k in ELECTRONICS_CATEGORIES
        )
        if electronics_sales == 0:
            return 0
        if electronics_sales < 5:
            return 8
        if electronics_sales < 20:
            return 14
        return 20
|
||||
74
app/trust/photo.py
Normal file
74
app/trust/photo.py
Normal file
|
|
@@ -0,0 +1,74 @@
|
|||
"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
import io
|
||||
import requests
|
||||
|
||||
# imagehash/PIL are optional; without them we degrade to URL-equality dedup.
try:
    import imagehash
    from PIL import Image
    _IMAGEHASH_AVAILABLE = True
except ImportError:
    _IMAGEHASH_AVAILABLE = False


class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Returns a list of booleans parallel to photo_urls_per_listing.
        True = this listing's primary photo is a duplicate of another listing in the set.
        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # Fast path: URL equality is a trivial duplicate signal (no fetch
        # needed). NOTE: it flags only the *later* occurrence; the hash path
        # below flags both ends of a pair.
        results = self._url_dedup([[u] for u in primary_urls])

        # Perceptual-hash pass: flag both listings of any matching pair.
        seen: dict[str, int] = {}  # hash digest -> index of first listing with it
        for i, url in enumerate(primary_urls):
            digest = self._fetch_hash(url)
            if digest is None:
                # Fetch/decode failed — keep the URL-equality verdict.
                continue
            if digest in seen:
                results[i] = True
                results[seen[digest]] = True
            else:
                seen[digest] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Download *url* and return its perceptual hash, or None on any failure.

        BUG FIX: the response is now used as a context manager so the
        underlying connection is released. The previous version combined
        ``stream=True`` with ``.content`` (which reads the full body anyway)
        and never closed the response, leaking the connection.
        """
        if not url:
            return None
        try:
            with requests.get(url, timeout=5) as resp:
                resp.raise_for_status()
                img = Image.open(io.BytesIO(resp.content))
                return str(imagehash.phash(img))
        except Exception:
            # Best-effort: a failed fetch must never break scoring.
            return None

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """URL-equality fallback: flag listings whose primary URL was seen earlier."""
        seen: set[str] = set()
        results = []
        for urls in photo_urls_per_listing:
            primary = urls[0] if urls else ""
            if primary and primary in seen:
                results.append(True)
            else:
                if primary:
                    seen.add(primary)
                results.append(False)
        return results
|
||||
0
tests/trust/__init__.py
Normal file
0
tests/trust/__init__.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
|
|
@@ -0,0 +1,52 @@
|
|||
from app.db.models import Seller
|
||||
from app.trust.aggregator import Aggregator
|
||||
|
||||
|
||||
def test_composite_sum_of_five_signals():
    """Composite is the plain sum of the five 0-20 signal scores."""
    signal_scores = {
        "account_age": 18,
        "feedback_count": 16,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    result = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert result.composite_score == 83
|
||||
|
||||
|
||||
def test_hard_filter_new_account():
    """An account younger than the hard-filter threshold gets the new_account flag."""
    signal_names = [
        "account_age", "feedback_count",
        "feedback_ratio", "price_vs_market", "category_history",
    ]
    perfect_scores = dict.fromkeys(signal_names, 20)
    young_seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=3, feedback_count=0,
        feedback_ratio=1.0, category_history_json="{}",
    )
    result = Aggregator().aggregate(
        perfect_scores, photo_hash_duplicate=False, seller=young_seller
    )
    assert "new_account" in result.red_flags_json
|
||||
|
||||
|
||||
def test_hard_filter_bad_actor_established_account():
    """Established account (count > 20) with very bad ratio → hard filter."""
    signal_names = [
        "account_age", "feedback_count",
        "feedback_ratio", "price_vs_market", "category_history",
    ]
    middling_scores = dict.fromkeys(signal_names, 10)
    bad_seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730,
        feedback_count=25,        # count > 20
        feedback_ratio=0.70,      # ratio < 80% → hard filter
        category_history_json="{}",
    )
    result = Aggregator().aggregate(
        middling_scores, photo_hash_duplicate=False, seller=bad_seller
    )
    assert "established_bad_actor" in result.red_flags_json
|
||||
|
||||
|
||||
def test_partial_score_flagged_when_signals_missing():
    """A None signal (data unavailable) must mark the composite as partial."""
    signal_scores = {
        "account_age": 18,
        "feedback_count": None,  # None = unavailable
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    result = Aggregator().aggregate(
        signal_scores, photo_hash_duplicate=False, seller=None
    )
    assert result.score_is_partial is True
|
||||
45
tests/trust/test_metadata.py
Normal file
45
tests/trust/test_metadata.py
Normal file
|
|
@@ -0,0 +1,45 @@
|
|||
from app.db.models import Seller
|
||||
from app.trust.metadata import MetadataScorer
|
||||
|
||||
|
||||
def _seller(**kwargs) -> Seller:
    """Factory: an established, reputable seller; override fields per test."""
    base = dict(
        platform="ebay",
        platform_seller_id="u",
        username="u",
        account_age_days=730,
        feedback_count=450,
        feedback_ratio=0.991,
        category_history_json='{"ELECTRONICS": 30}',
    )
    base.update(kwargs)
    return Seller(**base)
|
||||
|
||||
|
||||
def test_established_seller_scores_high():
    """A strong seller with a fair price lands in the 80+ composite band."""
    scores = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=950.0
    )
    assert sum(scores.values()) >= 80
|
||||
|
||||
|
||||
def test_new_account_scores_zero_on_age():
    """Accounts under a week old score 0 on the age signal."""
    scores = MetadataScorer().score(
        _seller(account_age_days=3), market_median=1000.0, listing_price=950.0
    )
    assert scores["account_age"] == 0
|
||||
|
||||
|
||||
def test_low_feedback_count_scores_low():
    """Fewer than 3 feedbacks keeps the count signal in the bottom bands."""
    scores = MetadataScorer().score(
        _seller(feedback_count=2), market_median=1000.0, listing_price=950.0
    )
    assert scores["feedback_count"] < 10
|
||||
|
||||
|
||||
def test_suspicious_price_scores_zero():
    """A price more than 50% below the market median scores 0 (scam signal)."""
    scores = MetadataScorer().score(
        _seller(), market_median=1000.0, listing_price=400.0  # 60% below market
    )
    assert scores["price_vs_market"] == 0
|
||||
|
||||
|
||||
def test_no_market_data_returns_none():
    """Missing market comp → price signal is None, not a numeric score.

    None signals "data unavailable"; the aggregator turns it into
    score_is_partial=True rather than penalizing the listing.
    """
    scores = MetadataScorer().score(
        _seller(), market_median=None, listing_price=950.0
    )
    assert scores["price_vs_market"] is None
|
||||
24
tests/trust/test_photo.py
Normal file
24
tests/trust/test_photo.py
Normal file
|
|
@@ -0,0 +1,24 @@
|
|||
from app.trust.photo import PhotoScorer
|
||||
|
||||
|
||||
def test_no_duplicates_in_single_listing_result():
    """Distinct primary photos across listings produce no duplicate flags."""
    # NOTE(review): if imagehash/PIL are installed this will attempt real
    # fetches of these fake URLs before falling back to URL equality —
    # confirm that is acceptable for this test suite.
    batch = [
        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
        ["https://img.com/c.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batch)
    assert not any(flags)
|
||||
|
||||
|
||||
def test_duplicate_photo_flagged():
    """Identical primary URL in two listings → at least one is flagged.

    (The URL-equality fallback flags only the later occurrence, while the
    perceptual-hash path flags both, so the assertion accepts either.)
    """
    batch = [
        ["https://img.com/same.jpg"],
        ["https://img.com/same.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batch)
    assert flags[0] or flags[1]
|
||||
Loading…
Reference in a new issue