snipe/app/trust/aggregator.py
pyr0ball 98695b00f0 feat(snipe): eBay trust scoring MVP — search, filters, enrichment, comps
Core trust scoring:
- Five metadata signals (account age, feedback count/ratio, price vs market,
  category history), composited 0–100
- CV-based price signal suppression for heterogeneous search results
  (e.g. mixed laptop generations won't false-positive suspicious_price)
- Expanded scratch/dent title detection: evasive redirects, functional problem
  phrases, DIY/repair indicators
- Hard filters: new_account, established_bad_actor
- Soft flags: low_feedback, suspicious_price, duplicate_photo, scratch_dent,
  long_on_market, significant_price_drop

Search & filtering:
- Browse API adapter (up to 200 items/page) + Playwright scraper fallback
- OR-group query expansion for comprehensive variant coverage
- Must-include (AND/ANY/groups), must-exclude, category, price range filters
- Saved searches with full filter round-trip via URL params

Seller enrichment:
- Background BTF /itm/ scraping for account age (Kasada-safe headed Chromium)
- On-demand enrichment: POST /api/enrich + ListingCard ↻ button
- Category history derived from Browse API categories field (free, no extra calls)
- Shopping API GetUserProfile inline enrichment for API adapter

Market comps:
- eBay Marketplace Insights API with Browse API fallback (catches 403 + 404)
- Comps prioritised in ThreadPoolExecutor (submitted first)

Infrastructure:
- Staging DB fields: times_seen, first_seen_at, price_at_first_seen, category_name
- Migrations 004 (staging tracking) + 005 (listing category)
- eBay webhook handler stub
- Cloud compose stack (compose.cloud.yml)
- Vue frontend: search store, saved searches store, ListingCard, filter sidebar

Docs:
- README fully rewritten to reflect MVP status + full feature documentation
- Roadmap table linked to all 13 Forgejo issues
2026-03-26 23:37:09 -07:00

132 lines
5.4 KiB
Python

"""Composite score and red flag extraction."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Optional
from app.db.models import Seller, TrustScore
# Sellers whose account is younger than this many days are hard-filtered
# with the "new_account" flag.
HARD_FILTER_AGE_DAYS = 7
# Minimum feedback count before a bad ratio counts as an established pattern;
# below this there is too little history to judge the seller.
HARD_FILTER_BAD_RATIO_MIN_COUNT = 20
# Feedback ratio below this (with enough history) triggers the
# "established_bad_actor" hard filter.
HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80
# Title keywords that suggest cosmetic damage or wear (free-tier title scan).
# Description-body scan (paid BSL feature) runs via BTF enrichment — not implemented yet.
_SCRATCH_DENT_KEYWORDS = frozenset([
# Explicit cosmetic damage
"scratch", "scratched", "scratches", "scuff", "scuffed",
"dent", "dented", "ding", "dinged",
"crack", "cracked", "chip", "chipped",
"damage", "damaged", "cosmetic damage",
"blemish", "wear", "worn", "worn in",
# Parts / condition catch-alls
"as is", "for parts", "parts only", "spares or repair", "parts or repair",
# Evasive redirects — seller hiding damage detail in listing body
"see description", "read description", "read listing", "see listing",
"see photos for", "see pics for", "see images for",
# Functional problem phrases (phrases > single words to avoid false positives)
"issue with", "issues with", "problem with", "problems with",
"not working", "stopped working", "doesn't work", "does not work",
"no power", "dead on arrival", "powers on but", "turns on but", "boots but",
"faulty", "broken screen", "broken hinge", "broken port",
# DIY / project / repair listings
"needs repair", "needs work", "needs tlc",
"project unit", "project item", "project laptop", "project phone",
"for repair", "sold as is",
])
def _has_damage_keywords(title: str) -> bool:
lower = title.lower()
return any(kw in lower for kw in _SCRATCH_DENT_KEYWORDS)
# "long_on_market" soft flag: listing must have been seen at least this many
# times AND have been in the index at least this many days.
_LONG_ON_MARKET_MIN_SIGHTINGS = 5
_LONG_ON_MARKET_MIN_DAYS = 14
_PRICE_DROP_THRESHOLD = 0.20 # 20% below first-seen price
def _days_since(iso: Optional[str]) -> Optional[int]:
if not iso:
return None
try:
dt = datetime.fromisoformat(iso.replace("Z", "+00:00"))
# Normalize to naive UTC so both paths (timezone-aware ISO and SQLite
# CURRENT_TIMESTAMP naive strings) compare correctly.
if dt.tzinfo is not None:
dt = dt.replace(tzinfo=None)
return (datetime.utcnow() - dt).days
except ValueError:
return None
class Aggregator:
    """Folds per-signal trust scores into a composite TrustScore row.

    Produces a 0-100 composite plus a JSON list of red flags: hard filters
    (new_account, established_bad_actor) and soft flags (low feedback,
    suspicious price, duplicate photo, damage keywords, staging-DB signals).
    """

    def aggregate(
        self,
        signal_scores: dict[str, Optional[int]],
        photo_hash_duplicate: bool,
        seller: Optional[Seller],
        listing_id: int = 0,
        listing_title: str = "",
        times_seen: int = 1,
        first_seen_at: Optional[str] = None,
        price: float = 0.0,
        price_at_first_seen: Optional[float] = None,
    ) -> TrustScore:
        """Build a TrustScore from per-signal scores (each 0-20, or None for no data)."""
        present = [v for v in signal_scores.values() if v is not None]
        is_partial = len(present) < len(signal_scores)
        clean = {name: 0 if score is None else score for name, score in signal_scores.items()}
        # Scale the composite only over signals that actually returned data —
        # treating "no data" as 0 would conflate a missing signal with a bad
        # one and drag scores down unfairly when the API doesn't expose a
        # field (e.g. registrationDate).
        denominator = 20 * len(present)
        composite = round(100 * sum(present) / denominator) if denominator else 0

        red_flags: list[str] = []
        age_days = seller.account_age_days if seller else None

        # --- Hard filters ---
        if age_days is not None and age_days < HARD_FILTER_AGE_DAYS:
            red_flags.append("new_account")
        if seller:
            poor_ratio = seller.feedback_ratio < HARD_FILTER_BAD_RATIO_THRESHOLD
            enough_history = seller.feedback_count > HARD_FILTER_BAD_RATIO_MIN_COUNT
            if poor_ratio and enough_history:
                red_flags.append("established_bad_actor")

        # --- Soft flags ---
        if age_days is not None and age_days < 30:
            red_flags.append("account_under_30_days")
        if seller and seller.feedback_count < 10:
            red_flags.append("low_feedback_count")
        # Exactly 0 means price data existed and the asking price was genuinely
        # <50% of market; None (no data) never compares equal to 0 here.
        if signal_scores.get("price_vs_market") == 0:
            red_flags.append("suspicious_price")
        if photo_hash_duplicate:
            red_flags.append("duplicate_photo")
        if listing_title and _has_damage_keywords(listing_title):
            red_flags.append("scratch_dent_mentioned")

        # --- Staging DB signals ---
        days_listed = _days_since(first_seen_at)
        if (
            days_listed is not None
            and times_seen >= _LONG_ON_MARKET_MIN_SIGHTINGS
            and days_listed >= _LONG_ON_MARKET_MIN_DAYS
        ):
            red_flags.append("long_on_market")
        if price_at_first_seen and price_at_first_seen > 0:
            drop_floor = price_at_first_seen * (1 - _PRICE_DROP_THRESHOLD)
            if price < drop_floor:
                red_flags.append("significant_price_drop")

        return TrustScore(
            listing_id=listing_id,
            composite_score=composite,
            account_age_score=clean["account_age"],
            feedback_count_score=clean["feedback_count"],
            feedback_ratio_score=clean["feedback_ratio"],
            price_vs_market_score=clean["price_vs_market"],
            category_history_score=clean["category_history"],
            photo_hash_duplicate=photo_hash_duplicate,
            red_flags_json=json.dumps(red_flags),
            score_is_partial=is_partial,
        )