feat: add metadata scorer, photo hash dedup, and trust aggregator
This commit is contained in:
parent
1672e215b2
commit
ee3c85bfb0
8 changed files with 359 additions and 0 deletions
41
app/trust/__init__.py
Normal file
41
app/trust/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
from .metadata import MetadataScorer
|
||||||
|
from .photo import PhotoScorer
|
||||||
|
from .aggregator import Aggregator
|
||||||
|
from app.db.models import Seller, Listing, TrustScore
|
||||||
|
from app.db.store import Store
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
|
class TrustScorer:
    """Orchestrates metadata + photo scoring for a batch of listings."""

    def __init__(self, store: Store):
        # Collaborators: persistence layer plus the three scoring components.
        self._store = store
        self._meta = MetadataScorer()
        self._photo = PhotoScorer()
        self._agg = Aggregator()

    def score_batch(
        self,
        listings: list[Listing],
        query: str,
    ) -> list[TrustScore]:
        """Score each listing in *listings* against market data for *query*.

        Returns one TrustScore per listing, in the same order.
        """
        # Market comps are keyed by an md5 digest of the search query
        # (a cache key, not a security use).
        digest = hashlib.md5(query.encode()).hexdigest()
        comp = self._store.get_market_comp("ebay", digest)
        median = comp.median_price if comp else None

        # Intra-batch photo duplicate detection: one boolean per listing.
        dup_flags = self._photo.check_duplicates(
            [listing.photo_urls for listing in listings]
        )

        results: list[TrustScore] = []
        for listing, is_dup in zip(listings, dup_flags):
            seller = self._store.get_seller("ebay", listing.seller_platform_id)
            if seller:
                signal_scores = self._meta.score(seller, median, listing.price)
            else:
                # No seller record: every metadata signal is "unavailable",
                # which the aggregator reports via score_is_partial.
                signal_scores = {
                    key: None
                    for key in (
                        "account_age",
                        "feedback_count",
                        "feedback_ratio",
                        "price_vs_market",
                        "category_history",
                    )
                }
            results.append(
                self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0)
            )
        return results
|
||||||
56
app/trust/aggregator.py
Normal file
56
app/trust/aggregator.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""Composite score and red flag extraction."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
from app.db.models import Seller, TrustScore
|
||||||
|
|
||||||
|
# Hard-filter thresholds (see Aggregator.aggregate): sellers crossing these
# limits receive hard red flags regardless of their composite score.
HARD_FILTER_AGE_DAYS = 7  # accounts younger than this -> "new_account"
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20  # ratio filter applies only above this feedback count
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80  # feedback ratio below this -> "established_bad_actor"
|
||||||
|
|
||||||
|
|
||||||
|
class Aggregator:
    """Combines the five metadata signals plus photo dedup into a TrustScore."""

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
    ) -> TrustScore:
        """Build a TrustScore from per-signal scores and red-flag checks.

        A ``None`` signal means "data unavailable": it contributes 0 to the
        composite and marks the result as partial (score_is_partial=True).
        ``seller`` may be None when no seller record exists; seller-based
        flags are then skipped.
        """
        is_partial = any(v is None for v in signal_scores.values())
        # Missing signals contribute 0 to the composite, but must not be
        # confused with a genuine zero score (see suspicious_price below).
        clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()}
        composite = sum(clean.values())

        red_flags: list[str] = []

        # Hard filters
        if seller and seller.account_age_days < HARD_FILTER_AGE_DAYS:
            red_flags.append("new_account")
        if seller and (
            seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
            and seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
        ):
            red_flags.append("established_bad_actor")

        # Soft flags
        if seller and seller.account_age_days < 30:
            red_flags.append("account_under_30_days")
        if seller and seller.feedback_count < 10:
            red_flags.append("low_feedback_count")
        # BUGFIX: check the raw signal, not the cleaned one. Previously a
        # missing price signal (None, cleaned to 0) was wrongly flagged as
        # "suspicious_price"; only a genuine zero score should flag.
        if signal_scores["price_vs_market"] == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )
|
||||||
67
app/trust/metadata.py
Normal file
67
app/trust/metadata.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
"""Five metadata trust signals, each scored 0–20."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
from app.db.models import Seller
|
||||||
|
|
||||||
|
# Category names counted as electronics experience by _category_history.
ELECTRONICS_CATEGORIES = {"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"}
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataScorer:
    """Scores the five metadata trust signals, each 0-20 (None = unavailable)."""

    def score(
        self,
        seller: Seller,
        market_median: Optional[float],
        listing_price: float,
    ) -> dict[str, Optional[int]]:
        """Return all five signal scores for *seller* at *listing_price*.

        ``market_median`` may be None when no comp data exists; the
        price_vs_market signal is then None (aggregator marks partial).
        """
        return {
            "account_age": self._account_age(seller.account_age_days),
            "feedback_count": self._feedback_count(seller.feedback_count),
            "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count),
            "price_vs_market": self._price_vs_market(listing_price, market_median),
            "category_history": self._category_history(seller.category_history_json),
        }

    def _account_age(self, days: int) -> int:
        # Older accounts score higher; bands at 7/30/90/365 days.
        if days < 7:
            return 0
        if days < 30:
            return 5
        if days < 90:
            return 10
        if days < 365:
            return 15
        return 20

    def _feedback_count(self, count: int) -> int:
        # More feedback scores higher; bands at 3/10/50/200.
        if count < 3:
            return 0
        if count < 10:
            return 5
        if count < 50:
            return 10
        if count < 200:
            return 15
        return 20

    def _feedback_ratio(self, ratio: float, count: int) -> int:
        # A very bad ratio only zeroes the signal on established accounts
        # (count > 20); small samples are judged more leniently.
        if ratio < 0.80 and count > 20:
            return 0
        if ratio < 0.90:
            return 5
        if ratio < 0.95:
            return 10
        if ratio < 0.98:
            return 15
        return 20

    def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]:
        if median is None:
            return None  # data unavailable -> aggregator sets score_is_partial
        if median <= 0:
            return None  # degenerate comp data treated as unavailable
        ratio = price / median
        if ratio < 0.50:
            return 0  # >50% below = scam
        if ratio < 0.70:
            return 5  # >30% below = suspicious
        if ratio < 0.85:
            return 10
        if ratio <= 1.20:
            return 20
        return 15  # above market = still ok, just expensive

    def _category_history(self, category_history_json: str) -> int:
        """Score prior electronics sales parsed from a JSON {category: count} map."""
        try:
            history = json.loads(category_history_json)
        except (ValueError, TypeError):
            return 0
        # BUGFIX: valid JSON that is not an object (e.g. a list or string)
        # previously raised AttributeError on .items(); treat as no history.
        if not isinstance(history, dict):
            return 0
        electronics_sales = sum(
            v for k, v in history.items() if k in ELECTRONICS_CATEGORIES
        )
        if electronics_sales == 0:
            return 0
        if electronics_sales < 5:
            return 8
        if electronics_sales < 20:
            return 14
        return 20
|
||||||
74
app/trust/photo.py
Normal file
74
app/trust/photo.py
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional
|
||||||
|
import io
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Pillow + imagehash are optional dependencies; when absent, PhotoScorer
# degrades to URL-equality deduplication (see PhotoScorer.check_duplicates).
try:
    import imagehash
    from PIL import Image
    _IMAGEHASH_AVAILABLE = True
except ImportError:
    _IMAGEHASH_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Returns a list of booleans parallel to photo_urls_per_listing.
        True = this listing's primary photo is a duplicate of another listing in the set.
        Falls back to URL-equality check if imagehash is unavailable or fetch fails.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # Fast path: URL equality is a trivial duplicate signal (no fetch needed)
        url_results = self._url_dedup([[u] for u in primary_urls])

        hashes: list[Optional[str]] = [self._fetch_hash(url) for url in primary_urls]

        results = list(url_results)  # start from URL-equality results
        seen: dict[str, int] = {}
        for i, h in enumerate(hashes):
            if h is None:
                continue  # fetch/decode failed -> URL-equality result stands
            if h in seen:
                # Flag both members of the pair: the earlier listing is just
                # as suspect as the later one.
                results[i] = True
                results[seen[h]] = True
            else:
                seen[h] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Download *url* and return its perceptual hash string, or None on any failure."""
        if not url:
            return None
        try:
            resp = requests.get(url, timeout=5, stream=True)
            resp.raise_for_status()
            img = Image.open(io.BytesIO(resp.content))
            return str(imagehash.phash(img))
        except Exception:
            # Best-effort: any network/decode error degrades to URL dedup.
            return None

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """Flag every listing whose primary URL appears more than once in the set.

        BUGFIX: previously only the second and later occurrences of a URL were
        flagged, while the perceptual-hash path flags all members of a
        duplicate group; both paths now agree. Listings with no photos never
        match each other (empty primary is ignored).
        """
        primaries = [urls[0] if urls else "" for urls in photo_urls_per_listing]
        counts: dict[str, int] = {}
        for url in primaries:
            if url:
                counts[url] = counts.get(url, 0) + 1
        return [bool(url and counts[url] > 1) for url in primaries]
|
||||||
0
tests/trust/__init__.py
Normal file
0
tests/trust/__init__.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
52
tests/trust/test_aggregator.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
from app.db.models import Seller
|
||||||
|
from app.trust.aggregator import Aggregator
|
||||||
|
|
||||||
|
|
||||||
|
def test_composite_sum_of_five_signals():
    # 18 + 16 + 20 + 15 + 14 = 83
    signals = {
        "account_age": 18,
        "feedback_count": 16,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=None)
    assert score.composite_score == 83
|
||||||
|
|
||||||
|
|
||||||
|
def test_hard_filter_new_account():
    """Perfect signal scores cannot mask a brand-new account."""
    perfect = dict.fromkeys(
        ("account_age", "feedback_count",
         "feedback_ratio", "price_vs_market", "category_history"),
        20,
    )
    seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=3, feedback_count=0,
        feedback_ratio=1.0, category_history_json="{}",
    )
    score = Aggregator().aggregate(perfect, photo_hash_duplicate=False, seller=seller)
    assert "new_account" in score.red_flags_json
|
||||||
|
|
||||||
|
|
||||||
|
def test_hard_filter_bad_actor_established_account():
    """Established account (count > 20) with very bad ratio → hard filter."""
    signals = dict.fromkeys(
        ("account_age", "feedback_count",
         "feedback_ratio", "price_vs_market", "category_history"),
        10,
    )
    seller = Seller(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730,
        feedback_count=25,    # count > 20
        feedback_ratio=0.70,  # ratio < 80% → hard filter
        category_history_json="{}",
    )
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=seller)
    assert "established_bad_actor" in score.red_flags_json
|
||||||
|
|
||||||
|
|
||||||
|
def test_partial_score_flagged_when_signals_missing():
    # A None signal means "data unavailable" and must mark the score partial.
    signals = {
        "account_age": 18,
        "feedback_count": None,
        "feedback_ratio": 20,
        "price_vs_market": 15,
        "category_history": 14,
    }
    score = Aggregator().aggregate(signals, photo_hash_duplicate=False, seller=None)
    assert score.score_is_partial is True
|
||||||
45
tests/trust/test_metadata.py
Normal file
45
tests/trust/test_metadata.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
from app.db.models import Seller
|
||||||
|
from app.trust.metadata import MetadataScorer
|
||||||
|
|
||||||
|
|
||||||
|
def _seller(**overrides) -> Seller:
    """Baseline reputable seller; override any field per test."""
    fields = dict(
        platform="ebay", platform_seller_id="u", username="u",
        account_age_days=730, feedback_count=450,
        feedback_ratio=0.991, category_history_json='{"ELECTRONICS": 30}',
    )
    fields.update(overrides)
    return Seller(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def test_established_seller_scores_high():
    # Reputable baseline seller near market price should score near the top.
    scores = MetadataScorer().score(_seller(), market_median=1000.0, listing_price=950.0)
    assert sum(scores.values()) >= 80
|
||||||
|
|
||||||
|
|
||||||
|
def test_new_account_scores_zero_on_age():
    # 3-day-old account falls in the lowest age band.
    scores = MetadataScorer().score(
        _seller(account_age_days=3), market_median=1000.0, listing_price=950.0
    )
    assert scores["account_age"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_low_feedback_count_scores_low():
    # Only two feedback entries — signal must land in a low band.
    scores = MetadataScorer().score(
        _seller(feedback_count=2), market_median=1000.0, listing_price=950.0
    )
    assert scores["feedback_count"] < 10
|
||||||
|
|
||||||
|
|
||||||
|
def test_suspicious_price_scores_zero():
    # 60% below market → zero
    scores = MetadataScorer().score(_seller(), market_median=1000.0, listing_price=400.0)
    assert scores["price_vs_market"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_market_data_returns_none():
    # None signals "data unavailable" — aggregator will set score_is_partial=True
    scores = MetadataScorer().score(_seller(), market_median=None, listing_price=950.0)
    assert scores["price_vs_market"] is None
|
||||||
24
tests/trust/test_photo.py
Normal file
24
tests/trust/test_photo.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
from app.trust.photo import PhotoScorer
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_duplicates_in_single_listing_result():
    # All unique images — no listing should be flagged.
    batches = [
        ["https://img.com/a.jpg", "https://img.com/b.jpg"],
        ["https://img.com/c.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batches)
    assert not any(flags)
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_photo_flagged():
    # Same URL in two listings = trivially duplicate (hash will match)
    batches = [
        ["https://img.com/same.jpg"],
        ["https://img.com/same.jpg"],
    ]
    flags = PhotoScorer().check_duplicates(batches)
    # At least one of the pair must be flagged.
    assert flags[0] is True or flags[1] is True
|
||||||
Loading…
Reference in a new issue