From ee3c85bfb04b68a9256c1a52a1e0974f43094f72 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Wed, 25 Mar 2026 12:57:56 -0700
Subject: [PATCH] feat: add metadata scorer, photo hash dedup, and trust
 aggregator

---
 app/trust/__init__.py          | 41 +++++++++++++++++++
 app/trust/aggregator.py        | 56 +++++++++++++++++++++++++
 app/trust/metadata.py          | 67 ++++++++++++++++++++++++++++++
 app/trust/photo.py             | 74 ++++++++++++++++++++++++++++++++++
 tests/trust/__init__.py        |  0
 tests/trust/test_aggregator.py | 52 ++++++++++++++++++++++++
 tests/trust/test_metadata.py   | 45 +++++++++++++++++++++
 tests/trust/test_photo.py      | 24 +++++++++++
 8 files changed, 359 insertions(+)
 create mode 100644 app/trust/__init__.py
 create mode 100644 app/trust/aggregator.py
 create mode 100644 app/trust/metadata.py
 create mode 100644 app/trust/photo.py
 create mode 100644 tests/trust/__init__.py
 create mode 100644 tests/trust/test_aggregator.py
 create mode 100644 tests/trust/test_metadata.py
 create mode 100644 tests/trust/test_photo.py

diff --git a/app/trust/__init__.py b/app/trust/__init__.py
new file mode 100644
index 0000000..726912f
--- /dev/null
+++ b/app/trust/__init__.py
@@ -0,0 +1,41 @@
+from .metadata import MetadataScorer
+from .photo import PhotoScorer
+from .aggregator import Aggregator
+from app.db.models import Seller, Listing, TrustScore
+from app.db.store import Store
+import hashlib
+
+
+class TrustScorer:
+    """Orchestrates metadata + photo scoring for a batch of listings."""
+
+    def __init__(self, store: Store):
+        self._store = store
+        self._meta = MetadataScorer()
+        self._photo = PhotoScorer()
+        self._agg = Aggregator()
+
+    def score_batch(
+        self,
+        listings: list[Listing],
+        query: str,
+    ) -> list[TrustScore]:
+        query_hash = hashlib.md5(query.encode(), usedforsecurity=False).hexdigest()
+        comp = self._store.get_market_comp("ebay", query_hash)
+        market_median = comp.median_price if comp else None
+
+        photo_url_sets = [l.photo_urls for l in listings]
+        duplicates = self._photo.check_duplicates(photo_url_sets)
+
+        scores = []
+        for listing, is_dup in zip(listings, duplicates):
+            seller = self._store.get_seller("ebay", listing.seller_platform_id)
+            if seller:
+                signal_scores = self._meta.score(seller, market_median, listing.price)
+            else:
+                signal_scores = {k: None for k in
+                                 ["account_age", "feedback_count", "feedback_ratio",
+                                  "price_vs_market", "category_history"]}
+            trust = self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0)
+            scores.append(trust)
+        return scores
diff --git a/app/trust/aggregator.py b/app/trust/aggregator.py
new file mode 100644
index 0000000..cffc3cc
--- /dev/null
+++ b/app/trust/aggregator.py
@@ -0,0 +1,56 @@
+"""Composite score and red flag extraction."""
+from __future__ import annotations
+import json
+from typing import Optional
+from app.db.models import Seller, TrustScore
+
+HARD_FILTER_AGE_DAYS = 7
+HARD_FILTER_BAD_RATIO_MIN_COUNT = 20
+HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80
+
+
+class Aggregator:
+    def aggregate(
+        self,
+        signal_scores: dict[str, Optional[int]],
+        photo_hash_duplicate: bool,
+        seller: Optional[Seller],
+        listing_id: int = 0,
+    ) -> TrustScore:
+        is_partial = any(v is None for v in signal_scores.values())
+        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
+        composite = sum(clean.values())
+
+        red_flags: list[str] = []
+
+        # Hard filters
+        if seller and seller.account_age_days < HARD_FILTER_AGE_DAYS:
+            red_flags.append("new_account")
+        if seller and (
+            seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
+            and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
+        ):
+            red_flags.append("established_bad_actor")
+
+        # Soft flags
+        if seller and seller.account_age_days < 30:
+            red_flags.append("account_under_30_days")
+        if seller and seller.feedback_count < 10:
+            red_flags.append("low_feedback_count")
+        if signal_scores["price_vs_market"] == 0:  # real 0, not None coerced to 0
+            red_flags.append("suspicious_price")
+        if photo_hash_duplicate:
+            red_flags.append("duplicate_photo")
+
+        return TrustScore(
+            listing_id=listing_id,
+            composite_score=composite,
+            account_age_score=clean["account_age"],
+            feedback_count_score=clean["feedback_count"],
+            feedback_ratio_score=clean["feedback_ratio"],
+            price_vs_market_score=clean["price_vs_market"],
+            category_history_score=clean["category_history"],
+            photo_hash_duplicate=photo_hash_duplicate,
+            red_flags_json=json.dumps(red_flags),
+            score_is_partial=is_partial,
+        )
diff --git a/app/trust/metadata.py b/app/trust/metadata.py
new file mode 100644
index 0000000..4231719
--- /dev/null
+++ b/app/trust/metadata.py
@@ -0,0 +1,67 @@
+"""Five metadata trust signals, each scored 0–20."""
+from __future__ import annotations
+import json
+from typing import Optional
+from app.db.models import Seller
+
+ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}
+
+
+class MetadataScorer:
+    def score(
+        self,
+        seller: Seller,
+        market_median: Optional[float],
+        listing_price: float,
+    ) -> dict[str, Optional[int]]:
+        return {
+            "account_age": self._account_age(seller.account_age_days),
+            "feedback_count": self._feedback_count(seller.feedback_count),
+            "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count),
+            "price_vs_market": self._price_vs_market(listing_price, market_median),
+            "category_history": self._category_history(seller.category_history_json),
+        }
+
+    def _account_age(self, days: int) -> int:
+        if days < 7: return 0
+        if days < 30: return 5
+        if days < 90: return 10
+        if days < 365: return 15
+        return 20
+
+    def _feedback_count(self, count: int) -> int:
+        if count < 3: return 0
+        if count < 10: return 5
+        if count < 50: return 10
+        if count < 200: return 15
+        return 20
+
+    def _feedback_ratio(self, ratio: float, count: int) -> int:
+        if ratio < 0.80 and count > 20: return 0
+        if ratio < 0.90: return 5
+        if ratio < 0.95: return 10
+        if ratio < 0.98: return 15
+        return 20
+
+    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
+        if median is None: return None  # data unavailable → aggregator sets score_is_partial
+        if median <= 0: return None
+        ratio = price / median
+        if ratio < 0.50: return 0   # >50% below = scam
+        if ratio < 0.70: return 5   # >30% below = suspicious
+        if ratio < 0.85: return 10
+        if ratio <= 1.20: return 20
+        return 15  # above market = still ok, just expensive
+
+    def _category_history(self, category_history_json: str) -> int:
+        try:
+            history = json.loads(category_history_json)
+        except (ValueError, TypeError):
+            return 0
+        electronics_sales = sum(
+            v for k, v in history.items() if k in ELECTRONICS_CATEGORIES
+        )
+        if electronics_sales == 0: return 0
+        if electronics_sales < 5: return 8
+        if electronics_sales < 20: return 14
+        return 20
diff --git a/app/trust/photo.py b/app/trust/photo.py
new file mode 100644
index 0000000..1a7a383
--- /dev/null
+++ b/app/trust/photo.py
@@ -0,0 +1,74 @@
+"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
+from __future__ import annotations
+from typing import Optional
+import io
+import requests
+
+try:
+    import imagehash
+    from PIL import Image
+    _IMAGEHASH_AVAILABLE = True
+except ImportError:
+    _IMAGEHASH_AVAILABLE = False
+
+
+class PhotoScorer:
+    """
+    check_duplicates: compare images within a single result set.
+    Cross-session dedup (PhotoHash table) is v0.2.
+    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
+    """
+
+    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
+        """
+        Returns a list of booleans parallel to photo_urls_per_listing.
+        True = this listing's primary photo is a duplicate of another listing in the set.
+        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
+        """
+        if not _IMAGEHASH_AVAILABLE:
+            return self._url_dedup(photo_urls_per_listing)
+
+        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]
+
+        # Fast path: URL equality is a trivial duplicate signal (no fetch needed)
+        url_results = self._url_dedup([[u] for u in primary_urls])
+
+        hashes: list[Optional[str]] = []
+        for url in primary_urls:
+            hashes.append(self._fetch_hash(url))
+
+        results = list(url_results)  # start from URL-equality results
+        seen: dict[str, int] = {}
+        for i, h in enumerate(hashes):
+            if h is None:
+                continue
+            if h in seen:
+                results[i] = True
+                results[seen[h]] = True
+            else:
+                seen[h] = i
+        return results
+
+    def _fetch_hash(self, url: str) -> Optional[str]:
+        if not url:
+            return None
+        try:
+            resp = requests.get(url, timeout=5)  # full body needed for hashing
+            resp.raise_for_status()
+            img = Image.open(io.BytesIO(resp.content))
+            return str(imagehash.phash(img))
+        except Exception:
+            return None
+
+    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
+        seen: set[str] = set()
+        results = []
+        for urls in photo_urls_per_listing:
+            primary = urls[0] if urls else ""
+            if primary and primary in seen:
+                results.append(True)
+            else:
+                if primary:
+                    seen.add(primary)
+                results.append(False)
+        return results
diff --git a/tests/trust/__init__.py b/tests/trust/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/trust/test_aggregator.py b/tests/trust/test_aggregator.py
new file mode 100644
index 0000000..4b52b28
--- /dev/null
+++ b/tests/trust/test_aggregator.py
@@ -0,0 +1,52 @@
+from app.db.models import Seller
+from app.trust.aggregator import Aggregator
+
+
+def test_composite_sum_of_five_signals():
+    agg = Aggregator()
+    scores = {
+        "account_age": 18, "feedback_count": 16,
+        "feedback_ratio": 20, "price_vs_market": 15,
+        "category_history": 14,
+    }
+    result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
+    assert result.composite_score == 83
+
+
+def test_hard_filter_new_account():
+    agg = Aggregator()
+    scores = {k: 20 for k in ["account_age", "feedback_count",
+                              "feedback_ratio", "price_vs_market", "category_history"]}
+    young_seller = Seller(
+        platform="ebay", platform_seller_id="u", username="u",
+        account_age_days=3, feedback_count=0,
+        feedback_ratio=1.0, category_history_json="{}",
+    )
+    result = agg.aggregate(scores, photo_hash_duplicate=False, seller=young_seller)
+    assert "new_account" in result.red_flags_json
+
+
+def test_hard_filter_bad_actor_established_account():
+    """Established account (count > 20) with very bad ratio → hard filter."""
+    agg = Aggregator()
+    scores = {k: 10 for k in ["account_age", "feedback_count",
+                              "feedback_ratio", "price_vs_market", "category_history"]}
+    bad_seller = Seller(
+        platform="ebay", platform_seller_id="u", username="u",
+        account_age_days=730, feedback_count=25,  # count > 20
+        feedback_ratio=0.70,  # ratio < 80% → hard filter
+        category_history_json="{}",
+    )
+    result = agg.aggregate(scores, photo_hash_duplicate=False, seller=bad_seller)
+    assert "established_bad_actor" in result.red_flags_json
+
+
+def test_partial_score_flagged_when_signals_missing():
+    agg = Aggregator()
+    scores = {
+        "account_age": 18, "feedback_count": None,  # None = unavailable
+        "feedback_ratio": 20, "price_vs_market": 15,
+        "category_history": 14,
+    }
+    result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
+    assert result.score_is_partial is True
diff --git a/tests/trust/test_metadata.py b/tests/trust/test_metadata.py
new file mode 100644
index 0000000..0be40e1
--- /dev/null
+++ b/tests/trust/test_metadata.py
@@ -0,0 +1,45 @@
+from app.db.models import Seller
+from app.trust.metadata import MetadataScorer
+
+
+def _seller(**kwargs) -> Seller:
+    defaults = dict(
+        platform="ebay", platform_seller_id="u", username="u",
+        account_age_days=730, feedback_count=450,
+        feedback_ratio=0.991, category_history_json='{"ELECTRONICS": 30}',
+    )
+    defaults.update(kwargs)
+    return Seller(**defaults)
+
+
+def test_established_seller_scores_high():
+    scorer = MetadataScorer()
+    scores = scorer.score(_seller(), market_median=1000.0, listing_price=950.0)
+    total = sum(scores.values())
+    assert total >= 80
+
+
+def test_new_account_scores_zero_on_age():
+    scorer = MetadataScorer()
+    scores = scorer.score(_seller(account_age_days=3), market_median=1000.0, listing_price=950.0)
+    assert scores["account_age"] == 0
+
+
+def test_low_feedback_count_scores_low():
+    scorer = MetadataScorer()
+    scores = scorer.score(_seller(feedback_count=2), market_median=1000.0, listing_price=950.0)
+    assert scores["feedback_count"] < 10
+
+
+def test_suspicious_price_scores_zero():
+    scorer = MetadataScorer()
+    # 60% below market → zero
+    scores = scorer.score(_seller(), market_median=1000.0, listing_price=400.0)
+    assert scores["price_vs_market"] == 0
+
+
+def test_no_market_data_returns_none():
+    scorer = MetadataScorer()
+    scores = scorer.score(_seller(), market_median=None, listing_price=950.0)
+    # None signals "data unavailable" — aggregator will set score_is_partial=True
+    assert scores["price_vs_market"] is None
diff --git a/tests/trust/test_photo.py b/tests/trust/test_photo.py
new file mode 100644
index 0000000..fd5b2dc
--- /dev/null
+++ b/tests/trust/test_photo.py
@@ -0,0 +1,24 @@
+from app.trust.photo import PhotoScorer
+
+
+def test_no_duplicates_in_single_listing_result():
+    scorer = PhotoScorer()
+    photo_urls_per_listing = [
+        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
+        ["https://img.com/c.jpg"],
+    ]
+    # All unique images — no duplicates
+    results = scorer.check_duplicates(photo_urls_per_listing)
+    assert all(not r for r in results)
+
+
+def test_duplicate_photo_flagged():
+    scorer = PhotoScorer()
+    # Same URL in two listings = trivially duplicate (hash will match)
+    photo_urls_per_listing = [
+        ["https://img.com/same.jpg"],
+        ["https://img.com/same.jpg"],
+    ]
+    results = scorer.check_duplicates(photo_urls_per_listing)
+    # Both listings should be flagged
+    assert results[0] is True or results[1] is True