feat: add metadata scorer, photo hash dedup, and trust aggregator
This commit is contained in:
parent
1672e215b2
commit
ee3c85bfb0
8 changed files with 359 additions and 0 deletions
41
app/trust/__init__.py
Normal file
41
app/trust/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
from .metadata import MetadataScorer
|
||||||
|
from .photo import PhotoScorer
|
||||||
|
from .aggregator import Aggregator
|
||||||
|
from app.db.models import Seller, Listing, TrustScore
|
||||||
|
from app.db.store import Store
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
|
class TrustScorer:
    """Orchestrates metadata + photo scoring for a batch of listings."""

    def __init__(self, store: Store):
        # Collaborators: persistence layer plus the three scoring components.
        self._store = store
        self._meta = MetadataScorer()
        self._photo = PhotoScorer()
        self._agg = Aggregator()

    def score_batch(
        self,
        listings: list[Listing],
        query: str,
    ) -> list[TrustScore]:
        """Score each listing in *listings* against market data for *query*.

        Returns one TrustScore per listing, in the same order.
        """
        # Market comps are keyed by an md5 digest of the search query
        # (a cache key, not a security use).
        digest = hashlib.md5(query.encode()).hexdigest()
        comp = self._store.get_market_comp("ebay", digest)
        median = comp.median_price if comp else None

        # Intra-batch photo duplicate detection: one boolean per listing.
        dup_flags = self._photo.check_duplicates(
            [listing.photo_urls for listing in listings]
        )

        results: list[TrustScore] = []
        for listing, is_dup in zip(listings, dup_flags):
            seller = self._store.get_seller("ebay", listing.seller_platform_id)
            if seller:
                signal_scores = self._meta.score(seller, median, listing.price)
            else:
                # No seller record: every metadata signal is "unavailable",
                # which the aggregator reports via score_is_partial.
                signal_scores = {
                    key: None
                    for key in (
                        "account_age",
                        "feedback_count",
                        "feedback_ratio",
                        "price_vs_market",
                        "category_history",
                    )
                }
            results.append(
                self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0)
            )
        return results
|
||||||
56
app/trust/aggregator.py
Normal file
56
app/trust/aggregator.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""Composite score and red flag extraction."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
from app.db.models import Seller, TrustScore
|
||||||
|
|
||||||
|
# Hard-filter thresholds (see Aggregator.aggregate): sellers crossing these
# limits receive hard red flags regardless of their composite score.
HARD_FILTER_AGE_DAYS = 7  # accounts younger than this -> "new_account"
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20  # ratio filter applies only above this feedback count
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80  # feedback ratio below this -> "established_bad_actor"
|
||||||
|
|
||||||
|
|
||||||
|
class Aggregator:
    """Combines the five metadata signals plus photo dedup into a TrustScore."""

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
    ) -> TrustScore:
        """Build a TrustScore from per-signal scores and red-flag checks.

        A ``None`` signal means "data unavailable": it contributes 0 to the
        composite and marks the result as partial (score_is_partial=True).
        ``seller`` may be None when no seller record exists; seller-based
        flags are then skipped.
        """
        is_partial = any(v is None for v in signal_scores.values())
        # Missing signals contribute 0 to the composite, but must not be
        # confused with a genuine zero score (see suspicious_price below).
        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
        composite = sum(clean.values())

        red_flags: list[str] = []

        # Hard filters
        if seller and seller.account_age_days < HARD_FILTER_AGE_DAYS:
            red_flags.append("new_account")
        if seller and (
            seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
            and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
        ):
            red_flags.append("established_bad_actor")

        # Soft flags
        if seller and seller.account_age_days < 30:
            red_flags.append("account_under_30_days")
        if seller and seller.feedback_count < 10:
            red_flags.append("low_feedback_count")
        # BUGFIX: check the raw signal, not the cleaned one. Previously a
        # missing price signal (None, cleaned to 0) was wrongly flagged as
        # "suspicious_price"; only a genuine zero score should flag.
        if signal_scores["price_vs_market"] == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )
|
||||||
67
app/trust/metadata.py
Normal file
67
app/trust/metadata.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
"""Five metadata trust signals, each scored 0–20."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
from app.db.models import Seller
|
||||||
|
|
||||||
|
# Category names counted as electronics experience by _category_history.
ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataScorer:
    """Scores the five metadata trust signals, each 0-20 (None = unavailable)."""

    def score(
        self,
        seller: Seller,
        market_median: Optional[float],
        listing_price: float,
    ) -> dict[str, Optional[int]]:
        """Return all five signal scores for *seller* at *listing_price*.

        ``market_median`` may be None when no comp data exists; the
        price_vs_market signal is then None (aggregator marks partial).
        """
        return {
            "account_age": self._account_age(seller.account_age_days),
            "feedback_count": self._feedback_count(seller.feedback_count),
            "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count),
            "price_vs_market": self._price_vs_market(listing_price, market_median),
            "category_history": self._category_history(seller.category_history_json),
        }

    def _account_age(self, days: int) -> int:
        # Older accounts score higher; bands at 7/30/90/365 days.
        if days < 7:
            return 0
        if days < 30:
            return 5
        if days < 90:
            return 10
        if days < 365:
            return 15
        return 20

    def _feedback_count(self, count: int) -> int:
        # More feedback scores higher; bands at 3/10/50/200.
        if count < 3:
            return 0
        if count < 10:
            return 5
        if count < 50:
            return 10
        if count < 200:
            return 15
        return 20

    def _feedback_ratio(self, ratio: float, count: int) -> int:
        # A very bad ratio only zeroes the signal on established accounts
        # (count > 20); small samples are judged more leniently.
        if ratio < 0.80 and count > 20:
            return 0
        if ratio < 0.90:
            return 5
        if ratio < 0.95:
            return 10
        if ratio < 0.98:
            return 15
        return 20

    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
        if median is None:
            return None  # data unavailable -> aggregator sets score_is_partial
        if median <= 0:
            return None  # degenerate comp data treated as unavailable
        ratio = price / median
        if ratio < 0.50:
            return 0  # >50% below = scam
        if ratio < 0.70:
            return 5  # >30% below = suspicious
        if ratio < 0.85:
            return 10
        if ratio <= 1.20:
            return 20
        return 15  # above market = still ok, just expensive

    def _category_history(self, category_history_json: str) -> int:
        """Score prior electronics sales parsed from a JSON {category: count} map."""
        try:
            history = json.loads(category_history_json)
        except (ValueError, TypeError):
            return 0
        # BUGFIX: valid JSON that is not an object (e.g. a list or string)
        # previously raised AttributeError on .items(); treat as no history.
        if not isinstance(history, dict):
            return 0
        electronics_sales = sum(
            v for k, v in history.items() if k in ELECTRONICS_CATEGORIES
        )
        if electronics_sales == 0:
            return 0
        if electronics_sales < 5:
            return 8
        if electronics_sales < 20:
            return 14
        return 20
|
||||||
74
app/trust/photo.py
Normal file
74
app/trust/photo.py
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional
|
||||||
|
import io
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Pillow + imagehash are optional dependencies; when absent, PhotoScorer
# degrades to URL-equality deduplication (see PhotoScorer.check_duplicates).
try:
    import imagehash
    from PIL import Image
    _IMAGEHASH_AVAILABLE = True
except ImportError:
    _IMAGEHASH_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Returns a list of booleans parallel to photo_urls_per_listing.
        True = this listing's primary photo is a duplicate of another listing in the set.
        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # Fast path: URL equality is a trivial duplicate signal (no fetch needed)
        url_results = self._url_dedup([[u] for u in primary_urls])

        hashes: list[Optional[str]] = [self._fetch_hash(url) for url in primary_urls]

        results = list(url_results)  # start from URL-equality results
        seen: dict[str, int] = {}
        for i, h in enumerate(hashes):
            if h is None:
                continue  # fetch/decode failed -> URL-equality result stands
            if h in seen:
                # Flag both members of the pair: the earlier listing is just
                # as suspect as the later one.
                results[i] = True
                results[seen[h]] = True
            else:
                seen[h] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Download *url* and return its perceptual hash string, or None on any failure."""
        if not url:
            return None
        try:
            resp = requests.get(url, timeout=5, stream=True)
            resp.raise_for_status()
            img = Image.open(io.BytesIO(resp.content))
            return str(imagehash.phash(img))
        except Exception:
            # Best-effort: any network/decode error degrades to URL dedup.
            return None

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """Flag every listing whose primary URL appears more than once in the set.

        BUGFIX: previously only the second and later occurrences of a URL were
        flagged, while the perceptual-hash path flags all members of a
        duplicate group; both paths now agree. Listings with no photos never
        match each other (empty primary is ignored).
        """
        primaries = [urls[0] if urls else "" for urls in photo_urls_per_listing]
        counts: dict[str, int] = {}
        for url in primaries:
            if url:
                counts[url] = counts.get(url, 0) + 1
        return [bool(url and counts[url] > 1) for url in primaries]
|
||||||
0
tests/trust/__init__.py
Normal file
0
tests/trust/__init__.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
from app.db.models import Seller
|
||||||
|
from app.trust.aggregator import Aggregator
|
||||||
|
|
||||||
|
|
||||||
|
def test_composite_sum_of_five_signals():
    # 18 + 16 + 20 + 15 + 14 = 83
    signals = {
        "account_age": 18,
        "feedback_count": 16,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=None)
    assert score.composite_score == 83
|
||||||
|
|
||||||
|
|
||||||
|
def test_hard_filter_new_account():
    """Perfect signal scores cannot mask a brand-new account."""
    perfect = dict.fromkeys(
        ("account_age", "feedback_count",
         "feedback_ratio", "price_vs_market", "category_history"),
        20,
    )
    seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=3, feedback_count=0,
        feedback_ratio=1.0, category_history_json="{}",
    )
    score = Aggregator().aggregate(perfect, photo_hash_duplicate=False, seller=seller)
    assert "new_account" in score.red_flags_json
|
||||||
|
|
||||||
|
|
||||||
|
def test_hard_filter_bad_actor_established_account():
    """Established account (count > 20) with very bad ratio → hard filter."""
    signals = dict.fromkeys(
        ("account_age", "feedback_count",
         "feedback_ratio", "price_vs_market", "category_history"),
        10,
    )
    seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730,
        feedback_count=25,    # count > 20
        feedback_ratio=0.70,  # ratio < 80% → hard filter
        category_history_json="{}",
    )
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=seller)
    assert "established_bad_actor" in score.red_flags_json
|
||||||
|
|
||||||
|
|
||||||
|
def test_partial_score_flagged_when_signals_missing():
    # A None signal means "data unavailable" and must mark the score partial.
    signals = {
        "account_age": 18,
        "feedback_count": None,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=None)
    assert score.score_is_partial is True
|
||||||
45
tests/trust/test_metadata.py
Normal file
45
tests/trust/test_metadata.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
from app.db.models import Seller
|
||||||
|
from app.trust.metadata import MetadataScorer
|
||||||
|
|
||||||
|
|
||||||
|
def _seller(**overrides) -> Seller:
    """Baseline reputable seller; override any field per test."""
    fields = dict(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730, feedback_count=450,
        feedback_ratio=0.991, category_history_json='{"ELECTRONICS": 30}',
    )
    fields.update(overrides)
    return Seller(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def test_established_seller_scores_high():
    # Reputable baseline seller near market price should score near the top.
    scores = MetadataScorer().score(_seller(), market_median=1000.0, listing_price=950.0)
    assert sum(scores.values()) >= 80
|
||||||
|
|
||||||
|
|
||||||
|
def test_new_account_scores_zero_on_age():
    # 3-day-old account falls in the lowest age band.
    scores = MetadataScorer().score(
        _seller(account_age_days=3), market_median=1000.0, listing_price=950.0
    )
    assert scores["account_age"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_low_feedback_count_scores_low():
    # Only two feedback entries — signal must land in a low band.
    scores = MetadataScorer().score(
        _seller(feedback_count=2), market_median=1000.0, listing_price=950.0
    )
    assert scores["feedback_count"] < 10
|
||||||
|
|
||||||
|
|
||||||
|
def test_suspicious_price_scores_zero():
    # 60% below market → zero
    scores = MetadataScorer().score(_seller(), market_median=1000.0, listing_price=400.0)
    assert scores["price_vs_market"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_market_data_returns_none():
    # None signals "data unavailable" — aggregator will set score_is_partial=True
    scores = MetadataScorer().score(_seller(), market_median=None, listing_price=950.0)
    assert scores["price_vs_market"] is None
|
||||||
24
tests/trust/test_photo.py
Normal file
24
tests/trust/test_photo.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
from app.trust.photo import PhotoScorer
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_duplicates_in_single_listing_result():
    # All unique images — no listing should be flagged.
    batches = [
        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
        ["https://img.com/c.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batches)
    assert not any(flags)
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_photo_flagged():
    # Same URL in two listings = trivially duplicate (hash will match)
    batches = [
        ["https://img.com/same.jpg"],
        ["https://img.com/same.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batches)
    # At least one of the pair must be flagged.
    assert flags[0] is True or flags[1] is True
|
||||||
Loading…
Reference in a new issue