diff --git a/.env.example b/.env.example index 6d9c360..96f1b7b 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,33 @@ # Snipe works out of the box with the scraper (no credentials needed). -# Set EBAY_CLIENT_ID + EBAY_CLIENT_SECRET to unlock full trust scores -# (account age and category history signals require the eBay Browse API). -# Without credentials the app logs a warning and uses the scraper automatically. +# Set eBay API credentials to unlock full trust scores — +# account age and category history signals require the eBay Browse API. +# Without credentials the app logs a warning and falls back to the scraper. -# Optional — eBay API credentials (self-hosters / paid CF cloud tier) -# EBAY_CLIENT_ID=your-client-id-here -# EBAY_CLIENT_SECRET=your-client-secret-here -# EBAY_ENV=production # or: sandbox +# ── eBay Developer Keys — Production ────────────────────────────────────────── +# From https://developer.ebay.com/my/keys (Production tab) +EBAY_APP_ID= +EBAY_DEV_ID= +EBAY_CERT_ID= +# ── eBay Developer Keys — Sandbox ───────────────────────────────────────────── +# From https://developer.ebay.com/my/keys (Sandbox tab) +EBAY_SANDBOX_APP_ID= +EBAY_SANDBOX_DEV_ID= +EBAY_SANDBOX_CERT_ID= + +# ── Active environment ───────────────────────────────────────────────────────── +# production | sandbox +EBAY_ENV=production + +# ── eBay Account Deletion Webhook ────────────────────────────────────────────── +# Register endpoint at https://developer.ebay.com/my/notification — required for +# production key activation. Set EBAY_NOTIFICATION_ENDPOINT to the public HTTPS +# URL eBay will POST to (e.g. https://snipe.circuitforge.tech/api/ebay/account-deletion). +EBAY_NOTIFICATION_TOKEN= +EBAY_NOTIFICATION_ENDPOINT= +# Set to false during sandbox/registration (no production token available yet). +# Set to true once production credentials are active — enforces ECDSA verification. 
+EBAY_WEBHOOK_VERIFY_SIGNATURES=true + +# ── Database ─────────────────────────────────────────────────────────────────── SNIPE_DB=data/snipe.db diff --git a/README.md b/README.md index 19deac0..6234658 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,170 @@ -# Snipe — Auction Sniping & Bid Management +# Snipe — Auction Sniping & Listing Intelligence > *Part of the Circuit Forge LLC "AI for the tasks you hate most" suite.* -**Status:** Backlog — not yet started. Peregrine must prove the model first. +**Status:** Active — eBay listing search + seller trust scoring MVP complete. Auction sniping engine and multi-platform support are next. ## What it does -Snipe manages online auction participation: monitoring listings across platforms, scheduling last-second bids, tracking price history to avoid overpaying, and managing the post-win logistics (payment, shipping coordination, provenance documentation for antiques). +Snipe has two layers that work together: + +**Layer 1 — Listing intelligence (MVP, implemented)** +Before you bid, Snipe tells you whether a listing is worth your time. It fetches eBay listings, scores each seller's trustworthiness across five signals, flags suspicious pricing relative to completed sales, and surfaces red flags like new accounts, cosmetic damage buried in titles, and listings that have been sitting unsold for weeks. + +**Layer 2 — Auction sniping (roadmap)** +Snipe manages the bid itself: monitors listings across platforms, schedules last-second bids, handles soft-close extensions, and guides you through the post-win logistics (payment routing, shipping coordination, provenance documentation for antiques). The name is the origin of the word "sniping" — common snipes are notoriously elusive birds, secretive and camouflaged, that flush suddenly from cover. Shooting one required extreme patience, stillness, and a precise last-second shot. That's the auction strategy. 
-## Primary platforms +--- +## Implemented: eBay Listing Intelligence + +### Search & filtering +- Full-text eBay search via Browse API (with Playwright scraper fallback when no API credentials configured) +- Price range, must-include keywords (AND / ANY / OR-groups mode), must-exclude terms, eBay category filter +- OR-group mode expands keyword combinations into multiple targeted queries and deduplicates results — eBay relevance won't silently drop variants +- Pages-to-fetch control: each Browse API page returns up to 200 listings +- Saved searches with one-click re-run that restores all filter settings + +### Seller trust scoring +Five signals, each scored 0–20, composited to 0–100: + +| Signal | What it measures | +|--------|-----------------| +| `account_age` | Days since eBay account registration | +| `feedback_count` | Total feedback received | +| `feedback_ratio` | Positive feedback percentage | +| `price_vs_market` | Listing price vs. median of recent completed sales | +| `category_history` | Whether seller has history selling in this category | + +Scores are marked **partial** when signals are unavailable (e.g. account age not yet enriched). Partial scores are displayed with a visual indicator rather than penalizing the seller for missing data. + +### Red flags +Hard filters that override the composite score: +- `new_account` — account registered within 7 days +- `established_bad_actor` — feedback ratio < 80% with 20+ reviews + +Soft flags surfaced as warnings: +- `account_under_30_days` — account under 30 days old +- `low_feedback_count` — fewer than 10 reviews +- `suspicious_price` — listing price below 50% of market median *(suppressed automatically when the search returns a heterogeneous price distribution — e.g. 
mixed laptop generations — to prevent false positives)* +- `duplicate_photo` — same image found on another listing (perceptual hash) +- `scratch_dent_mentioned` — title keywords indicating cosmetic damage, functional problems, or evasive language (see below) +- `long_on_market` — listing has been seen 5+ times over 14+ days without selling +- `significant_price_drop` — current price more than 20% below first-seen price + +### Scratch & dent title detection +Scans listing titles for signals the item may have undisclosed damage or problems: +- **Explicit damage**: scratch, scuff, dent, crack, chip, blemish, worn +- **Condition catch-alls**: as is, for parts, parts only, spares or repair +- **Evasive redirects**: "see description", "read description", "see photos for" (seller hiding damage detail in listing body) +- **Functional problems**: "not working", "stopped working", "no power", "dead on arrival", "powers on but", "faulty", "broken screen/hinge/port" +- **DIY/repair listings**: "needs repair", "needs tlc", "project laptop", "for repair", "sold as is" + +### Seller enrichment +- **Inline (API adapter)**: account age filled from Browse API `registrationDate` field +- **Background (scraper)**: `/itm/` listing pages scraped for seller "Joined" date via Playwright + Xvfb (Kasada-safe headed Chromium) +- **On-demand**: ↻ button on any listing card triggers `POST /api/enrich` — runs enrichment and re-scores without waiting for a second search +- **Category history**: derived from the seller's accumulated listing data (Browse API `categories` field); improves with every search, no extra API calls + +### Market price comparison +Completed sales fetched via eBay Marketplace Insights API (with Browse API fallback for app tiers that don't have Insights access). Median stored per query hash, used to score `price_vs_market` across all listings in a search. 
+ +### Adapters +| Adapter | When used | Signals available | +|---------|-----------|-------------------| +| Browse API (`api`) | eBay API credentials configured | All signals; account age inline | +| Playwright scraper (`scraper`) | No credentials / forced | All signals except account age (async BTF enrichment) | +| `auto` (default) | — | API if credentials present, scraper otherwise | + +--- + +## Stack + +| Layer | Tech | Port | +|-------|------|------| +| Frontend | Vue 3 + Pinia + UnoCSS + Vite (nginx) | 8509 | +| API | FastAPI (uvicorn) | 8510 | +| Scraper | Playwright + playwright-stealth + Xvfb | — | +| DB | SQLite (`data/snipe.db`) | — | +| Core | circuitforge-core (editable install) | — | + +## Running + +```bash +./manage.sh start # start all services +./manage.sh stop # stop +./manage.sh logs # tail logs +./manage.sh open # open in browser +``` + +Cloud stack (shared DB, multi-user): +```bash +docker compose -f compose.cloud.yml -p snipe-cloud up -d +docker compose -f compose.cloud.yml -p snipe-cloud build api # after Python changes +``` + +--- + +## Roadmap + +### Near-term (eBay) + +| Issue | Feature | +|-------|---------| +| [#1](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/1) | SSE/WebSocket live score push — enriched data appears without re-search | +| [#2](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/2) | eBay OAuth (Connect eBay Account) for full trust score access via Trading API | +| [#4](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/4) | Scammer database: community blocklist + batch eBay Trust & Safety reporting | +| [#5](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/5) | UPC/product lookup → LLM-crafted search terms (paid tier) | +| [#8](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/8) | "Triple Red" easter egg: CSS animation when all hard flags fire simultaneously | +| [#11](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/11) | Vision-based 
photo condition assessment — moondream2 (local) / Claude vision (cloud, paid) | +| [#12](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/12) | Background saved-search monitoring with configurable alerts | + +### Cloud / infrastructure + +| Issue | Feature | +|-------|---------| +| [#6](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/6) | Shared seller/scammer/comps DB across cloud users (public data, no re-scraping) | +| [#7](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/7) | Shared image hash DB — requires explicit opt-in consent (CF privacy-by-architecture) | + +### Auction sniping engine + +| Issue | Feature | +|-------|---------| +| [#9](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/9) | Bid scheduling + snipe execution (NTP-synchronized, soft-close handling, human approval gate) | +| [#13](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/13) | Post-win workflow: payment routing, shipping coordination, provenance documentation | + +### Multi-platform expansion + +| Issue | Feature | +|-------|---------| +| [#10](https://git.opensourcesolarpunk.com/Circuit-Forge/snipe/issues/10) | CT Bids, HiBid, AuctionZip, Invaluable, GovPlanet, Bidsquare, Proxibid | + +--- + +## Primary platforms (full vision) + +- **eBay** — general + collectibles *(search + trust scoring: implemented)* - **CT Bids** — Connecticut state surplus and municipal auctions - **GovPlanet / IronPlanet** — government surplus equipment - **AuctionZip** — antique auction house aggregator (1,000+ houses) - **Invaluable / LiveAuctioneers** — fine art and antiques - **Bidsquare** — antiques and collectibles -- **eBay** — general + collectibles - **HiBid** — estate auctions - **Proxibid** — industrial and collector auctions -## Why it's hard +## Why auctions are hard Online auctions are frustrating because: - Winning requires being present at the exact closing moment — sometimes 2 AM - Platforms vary wildly: some allow proxy 
bids, some don't; closing times extend on activity - Price history is hidden — you don't know if an item is underpriced or a trap -- Shipping logistics for large / fragile antiques require coordination with auction house +- Sellers hide damage in descriptions rather than titles to avoid automated filters +- Shipping logistics for large / fragile antiques require coordination with the auction house - Provenance documentation is inconsistent across auction houses -## Core pipeline - -``` -Configure search (categories, keywords, platforms, max price, location) -→ Monitor listings → Alert on matching items -→ Human review: approve or skip -→ Price research: comparable sales history, condition assessment via photos -→ Schedule snipe bid (configurable: X seconds before close, Y% above current) -→ Execute bid → Monitor for counter-bid (soft-close extension handling) -→ Win notification → Payment + shipping coordination workflow -→ Provenance documentation for antiques -``` - -## Bidding strategy engine +## Bidding strategy engine (planned) - **Hard snipe**: submit bid N seconds before close (default: 8s) - **Soft-close handling**: detect if platform extends on last-minute bids; adjust strategy @@ -51,10 +172,10 @@ Configure search (categories, keywords, platforms, max price, location) - **Reserve detection**: identify likely reserve price from bid history patterns - **Comparable sales**: pull recent auction results for same/similar items across platforms -## Post-win workflow +## Post-win workflow (planned) 1. Payment method routing (platform-specific: CC, wire, check) -2. Shipping quote requests to approved carriers (for freight / large items) +2. Shipping quote requests to approved carriers (freight / large items via uShip; parcel via FedEx/UPS) 3. Condition report request from auction house 4. Provenance packet generation (for antiques / fine art resale or insurance) 5. 
Add to inventory (for dealers / collectors tracking portfolio value) @@ -65,10 +186,10 @@ Configure search (categories, keywords, platforms, max price, location) ## Tech notes -- Shared `circuitforge-core` scaffold -- Platform adapters: AuctionZip, Invaluable, HiBid, eBay, CT Bids (Playwright + API where available) +- Shared `circuitforge-core` scaffold (DB, LLM router, tier system, config) +- Platform adapters: currently eBay only; AuctionZip, Invaluable, HiBid, CT Bids planned (Playwright + API where available) - Bid execution: Playwright automation with precise timing (NTP-synchronized) - Soft-close detection: platform-specific rules engine -- Comparable sales: scrape completed auctions, normalize by condition/provenance -- Vision module: condition assessment from listing photos (moondream2 / Claude vision) -- Shipping quote integration: uShip API for freight, FedEx / UPS for parcel +- Comparable sales: eBay completed listings via Marketplace Insights API + Browse API fallback +- Vision module: condition assessment from listing photos — moondream2 / Claude vision (paid tier stub in `app/trust/photo.py`) +- **Kasada bypass**: headed Chromium via Xvfb; all scraping uses this path — headless and `requests`-based approaches are blocked by eBay diff --git a/api/ebay_webhook.py b/api/ebay_webhook.py new file mode 100644 index 0000000..0719455 --- /dev/null +++ b/api/ebay_webhook.py @@ -0,0 +1,149 @@ +"""eBay Marketplace Account Deletion webhook. + +Required to activate eBay production API credentials. + +Protocol (https://developer.ebay.com/develop/guides-v2/marketplace-user-account-deletion): + + GET /api/ebay/account-deletion?challenge_code=<code> + → {"challengeResponse": SHA256(code + token + endpoint_url)} + + POST /api/ebay/account-deletion + Header: X-EBAY-SIGNATURE: <base64-encoded JSON: {"kid": ..., "signature": ...}> + Body: JSON notification payload + → 200 on valid + deleted, 412 on bad signature + +Public keys are fetched from the eBay Notification API and cached for 1 hour. 
+""" +from __future__ import annotations + +import base64 +import hashlib +import json +import logging +import os +import time +from pathlib import Path +from typing import Optional + +import requests +from fastapi import APIRouter, Header, HTTPException, Request +from cryptography.exceptions import InvalidSignature +from cryptography.hazmat.primitives.asymmetric.ec import ECDSA +from cryptography.hazmat.primitives.hashes import SHA1 +from cryptography.hazmat.primitives.serialization import load_pem_public_key + +from app.db.store import Store + +log = logging.getLogger(__name__) + +router = APIRouter() + +_DB_PATH = Path(os.environ.get("SNIPE_DB", "data/snipe.db")) + +# ── Public-key cache ────────────────────────────────────────────────────────── +# eBay key rotation is rare; 1-hour TTL is appropriate. +_KEY_CACHE_TTL = 3600 +_key_cache: dict[str, tuple[bytes, float]] = {} # kid → (pem_bytes, expiry) + +# The eBay Notification service is a unified production-side system — signing keys +# always live at api.ebay.com regardless of whether the app uses sandbox or production +# Browse API credentials. 
+_EBAY_KEY_URL = "https://api.ebay.com/commerce/notification/v1/public_key/{kid}" + + +def _fetch_public_key(kid: str) -> bytes: + """Return PEM public key bytes for the given kid, using a 1-hour cache.""" + cached = _key_cache.get(kid) + if cached and time.time() < cached[1]: + return cached[0] + + key_url = _EBAY_KEY_URL.format(kid=kid) + resp = requests.get(key_url, timeout=10) + if not resp.ok: + log.error("public key fetch failed: %s %s — body: %s", resp.status_code, key_url, resp.text[:500]) + resp.raise_for_status() + pem_str: str = resp.json()["key"] + pem_bytes = pem_str.encode() + _key_cache[kid] = (pem_bytes, time.time() + _KEY_CACHE_TTL) + return pem_bytes + + +# ── GET — challenge verification ────────────────────────────────────────────── + +@router.get("/api/ebay/account-deletion") +def ebay_challenge(challenge_code: str): + """Respond to eBay's endpoint verification challenge. + + eBay sends this GET once when you register the endpoint URL. + Response must be the SHA-256 hex digest of (code + token + endpoint). + """ + token = os.environ.get("EBAY_NOTIFICATION_TOKEN", "") + endpoint = os.environ.get("EBAY_NOTIFICATION_ENDPOINT", "") + if not token or not endpoint: + log.error("EBAY_NOTIFICATION_TOKEN or EBAY_NOTIFICATION_ENDPOINT not set") + raise HTTPException(status_code=500, detail="Webhook not configured") + + digest = hashlib.sha256( + (challenge_code + token + endpoint).encode() + ).hexdigest() + return {"challengeResponse": digest} + + +# ── POST — deletion notification ────────────────────────────────────────────── + +@router.post("/api/ebay/account-deletion", status_code=200) +async def ebay_account_deletion( + request: Request, + x_ebay_signature: Optional[str] = Header(default=None), +): + """Process an eBay Marketplace Account Deletion notification. + + Verifies the ECDSA/SHA1 signature, then permanently deletes all stored + data (sellers + listings) for the named eBay user. + """ + body_bytes = await request.body() + + # 1. 
Parse and verify signature header + if not x_ebay_signature: + log.warning("ebay_account_deletion: missing X-EBAY-SIGNATURE header") + raise HTTPException(status_code=412, detail="Missing signature") + + try: + sig_json = json.loads(base64.b64decode(x_ebay_signature)) + kid: str = sig_json["kid"] + sig_b64: str = sig_json["signature"] + sig_bytes = base64.b64decode(sig_b64) + except Exception as exc: + log.warning("ebay_account_deletion: malformed signature header — %s", exc) + raise HTTPException(status_code=412, detail="Malformed signature header") + + # 2. Fetch and verify with eBay public key + # EBAY_WEBHOOK_VERIFY_SIGNATURES=false skips ECDSA during sandbox/registration phase. + # Set to true (default) once production credentials are active. + skip_verify = os.environ.get("EBAY_WEBHOOK_VERIFY_SIGNATURES", "true").lower() == "false" + if skip_verify: + log.warning("ebay_account_deletion: signature verification DISABLED — enable before production") + else: + try: + pem_bytes = _fetch_public_key(kid) + pub_key = load_pem_public_key(pem_bytes) + pub_key.verify(sig_bytes, body_bytes, ECDSA(SHA1())) + except InvalidSignature: + log.warning("ebay_account_deletion: ECDSA signature verification failed (kid=%s)", kid) + raise HTTPException(status_code=412, detail="Signature verification failed") + except Exception as exc: + log.error("ebay_account_deletion: unexpected error during verification — %s", exc) + raise HTTPException(status_code=412, detail="Verification error") + + # 3. 
Extract username from notification payload and delete data + try: + payload = json.loads(body_bytes) + username: str = payload["notification"]["data"]["username"] + except (KeyError, json.JSONDecodeError) as exc: + log.error("ebay_account_deletion: could not parse payload — %s", exc) + raise HTTPException(status_code=400, detail="Unrecognisable payload") + + store = Store(_DB_PATH) + store.delete_seller_data("ebay", username) + log.info("ebay_account_deletion: deleted data for eBay user %r", username) + return {} diff --git a/api/main.py b/api/main.py index 75acbb8..feed088 100644 --- a/api/main.py +++ b/api/main.py @@ -9,13 +9,19 @@ from concurrent.futures import ThreadPoolExecutor from pathlib import Path from fastapi import FastAPI, HTTPException +from pydantic import BaseModel from fastapi.middleware.cors import CORSMiddleware from circuitforge_core.config import load_env from app.db.store import Store +from app.db.models import SavedSearch as SavedSearchModel from app.platforms import SearchFilters from app.platforms.ebay.scraper import ScrapedEbayAdapter +from app.platforms.ebay.adapter import EbayAdapter +from app.platforms.ebay.auth import EbayTokenManager +from app.platforms.ebay.query_builder import expand_queries, parse_groups from app.trust import TrustScorer +from api.ebay_webhook import router as ebay_webhook_router load_env(Path(".env")) log = logging.getLogger(__name__) @@ -23,7 +29,24 @@ log = logging.getLogger(__name__) _DB_PATH = Path(os.environ.get("SNIPE_DB", "data/snipe.db")) _DB_PATH.parent.mkdir(exist_ok=True) + +def _ebay_creds() -> tuple[str, str, str]: + """Return (client_id, client_secret, env) from env vars. 
+ + New names: EBAY_APP_ID / EBAY_CERT_ID (sandbox: EBAY_SANDBOX_APP_ID / EBAY_SANDBOX_CERT_ID) + Legacy fallback: EBAY_CLIENT_ID / EBAY_CLIENT_SECRET + """ + env = os.environ.get("EBAY_ENV", "production").strip() + if env == "sandbox": + client_id = os.environ.get("EBAY_SANDBOX_APP_ID", "").strip() + client_secret = os.environ.get("EBAY_SANDBOX_CERT_ID", "").strip() + else: + client_id = (os.environ.get("EBAY_APP_ID") or os.environ.get("EBAY_CLIENT_ID", "")).strip() + client_secret = (os.environ.get("EBAY_CERT_ID") or os.environ.get("EBAY_CLIENT_SECRET", "")).strip() + return client_id, client_secret, env + app = FastAPI(title="Snipe API", version="0.1.0") +app.include_router(ebay_webhook_router) app.add_middleware( CORSMiddleware, @@ -38,59 +61,202 @@ def health(): return {"status": "ok"} +def _trigger_scraper_enrichment(listings: list, store: Store) -> None: + """Fire-and-forget background enrichment for missing seller signals. + + Two enrichment passes run concurrently in the same daemon thread: + 1. BTF (/itm/ pages) — fills account_age_days for sellers where it is None. + 2. _ssn search pages — fills category_history_json for sellers with no history. + + The main response returns immediately; enriched data lands in the DB for + future searches. Uses ScrapedEbayAdapter's Playwright stack regardless of + which adapter was used for the main search (Shopping API handles age for + the API adapter inline; BTF is the fallback for no-creds / scraper mode). + """ + # Caps per search: limits Playwright sessions launched in the background so we + # don't hammer Kasada or spin up dozens of Xvfb instances after a large search. + # Remaining sellers get enriched incrementally on subsequent searches. 
+ _BTF_MAX_PER_SEARCH = 3 + _CAT_MAX_PER_SEARCH = 3 + + needs_btf: dict[str, str] = {} + needs_categories: list[str] = [] + + for listing in listings: + sid = listing.seller_platform_id + if not sid: + continue + seller = store.get_seller("ebay", sid) + if not seller: + continue + if (seller.account_age_days is None + and sid not in needs_btf + and len(needs_btf) < _BTF_MAX_PER_SEARCH): + needs_btf[sid] = listing.platform_listing_id + if (seller.category_history_json in ("{}", "", None) + and sid not in needs_categories + and len(needs_categories) < _CAT_MAX_PER_SEARCH): + needs_categories.append(sid) + + if not needs_btf and not needs_categories: + return + + log.info( + "Scraper enrichment: %d BTF age + %d category pages queued", + len(needs_btf), len(needs_categories), + ) + + def _run(): + try: + enricher = ScrapedEbayAdapter(Store(_DB_PATH)) + if needs_btf: + enricher.enrich_sellers_btf(needs_btf, max_workers=2) + log.info("BTF enrichment complete for %d sellers", len(needs_btf)) + if needs_categories: + enricher.enrich_sellers_categories(needs_categories, max_workers=2) + log.info("Category enrichment complete for %d sellers", len(needs_categories)) + except Exception as e: + log.warning("Scraper enrichment failed: %s", e) + + import threading + t = threading.Thread(target=_run, daemon=True) + t.start() + + def _parse_terms(raw: str) -> list[str]: """Split a comma-separated keyword string into non-empty, stripped terms.""" return [t.strip() for t in raw.split(",") if t.strip()] +def _make_adapter(store: Store, force: str = "auto"): + """Return the appropriate adapter. 
+ + force: "auto" | "api" | "scraper" + auto — API if creds present, else scraper + api — Browse API (raises if no creds) + scraper — Playwright scraper regardless of creds + """ + client_id, client_secret, env = _ebay_creds() + has_creds = bool(client_id and client_secret) + + if force == "scraper": + return ScrapedEbayAdapter(store) + if force == "api": + if not has_creds: + raise ValueError("adapter=api requested but no eBay API credentials configured") + return EbayAdapter(EbayTokenManager(client_id, client_secret, env), store, env=env) + # auto + if has_creds: + return EbayAdapter(EbayTokenManager(client_id, client_secret, env), store, env=env) + log.debug("No eBay API credentials — using scraper adapter (partial trust scores)") + return ScrapedEbayAdapter(store) + + +def _adapter_name(force: str = "auto") -> str: + """Return the name of the adapter that would be used — without creating it.""" + client_id, client_secret, _ = _ebay_creds() + if force == "scraper": + return "scraper" + if force == "api" or (force == "auto" and client_id and client_secret): + return "api" + return "scraper" + + @app.get("/api/search") def search( q: str = "", max_price: float = 0, min_price: float = 0, pages: int = 1, - must_include: str = "", # comma-separated; applied client-side only - must_exclude: str = "", # comma-separated; forwarded to eBay AND applied client-side + must_include: str = "", # raw filter string; client-side always applied + must_include_mode: str = "all", # "all" | "any" | "groups" — drives eBay expansion + must_exclude: str = "", # comma-separated; forwarded to eBay -term + client-side + category_id: str = "", # eBay category ID — forwarded to Browse API / scraper _sacat + adapter: str = "auto", # "auto" | "api" | "scraper" — override adapter selection ): if not q.strip(): - return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None} + return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None, "adapter_used": 
_adapter_name(adapter)} - filters = SearchFilters( + must_exclude_terms = _parse_terms(must_exclude) + + # In Groups mode, expand OR groups into multiple targeted eBay queries to + # guarantee comprehensive result coverage — eBay relevance won't silently drop variants. + if must_include_mode == "groups" and must_include.strip(): + or_groups = parse_groups(must_include) + ebay_queries = expand_queries(q, or_groups) + else: + ebay_queries = [q] + + base_filters = SearchFilters( max_price=max_price if max_price > 0 else None, min_price=min_price if min_price > 0 else None, pages=max(1, pages), - must_include=_parse_terms(must_include), - must_exclude=_parse_terms(must_exclude), + must_exclude=must_exclude_terms, # forwarded to eBay -term by the scraper + category_id=category_id.strip() or None, ) - # Each adapter gets its own Store (SQLite connection) — required for thread safety. - # search() and get_completed_sales() run concurrently; they write to different tables - # so SQLite file-level locking is the only contention point. - search_adapter = ScrapedEbayAdapter(Store(_DB_PATH)) - comps_adapter = ScrapedEbayAdapter(Store(_DB_PATH)) + adapter_used = _adapter_name(adapter) + + # Each thread creates its own Store — sqlite3 check_same_thread=True. + def _run_search(ebay_query: str) -> list: + return _make_adapter(Store(_DB_PATH), adapter).search(ebay_query, base_filters) + + def _run_comps() -> None: + try: + _make_adapter(Store(_DB_PATH), adapter).get_completed_sales(q, pages) + except Exception: + log.warning("comps: unhandled exception for %r", q, exc_info=True) try: - with ThreadPoolExecutor(max_workers=2) as ex: - listings_future = ex.submit(search_adapter.search, q, filters) - comps_future = ex.submit(comps_adapter.get_completed_sales, q, pages) - listings = listings_future.result() - comps_future.result() # wait; side-effect is saving market comp to DB + # Comps submitted first — guarantees an immediate worker slot even at max concurrency. 
+ # Seller enrichment runs after the executor exits (background thread), so comps are + # always prioritised over tracking seller age / category history. + max_workers = min(len(ebay_queries) + 1, 5) + with ThreadPoolExecutor(max_workers=max_workers) as ex: + comps_future = ex.submit(_run_comps) + search_futures = [ex.submit(_run_search, eq) for eq in ebay_queries] + + # Merge and deduplicate across all search queries + seen_ids: set[str] = set() + listings: list = [] + for fut in search_futures: + for listing in fut.result(): + if listing.platform_listing_id not in seen_ids: + seen_ids.add(listing.platform_listing_id) + listings.append(listing) + comps_future.result() # side-effect: market comp written to DB except Exception as e: log.warning("eBay scrape failed: %s", e) raise HTTPException(status_code=502, detail=f"eBay search failed: {e}") - # Use search_adapter's store for post-processing — it has the sellers already written - store = search_adapter._store + log.info("Multi-search: %d queries → %d unique listings", len(ebay_queries), len(listings)) + + # Main-thread store for all post-search reads/writes — fresh connection, same thread. + store = Store(_DB_PATH) store.save_listings(listings) + # Derive category_history from accumulated listing data — free for API adapter + # (category_name comes from Browse API response), no-op for scraper listings (category_name=None). + seller_ids = list({l.seller_platform_id for l in listings if l.seller_platform_id}) + n_cat = store.refresh_seller_categories("ebay", seller_ids) + if n_cat: + log.info("Category history derived for %d sellers from listing data", n_cat) + + # Re-fetch to hydrate staging fields (times_seen, first_seen_at, id, price_at_first_seen) + # that are only available from the DB after the upsert. 
+ staged = store.get_listings_staged("ebay", [l.platform_listing_id for l in listings]) + listings = [staged.get(l.platform_listing_id, l) for l in listings] + + # BTF enrichment: scrape /itm/ pages for sellers missing account_age_days. + # Runs in the background so it doesn't delay the response; next search of + # the same sellers will have full scores. + _trigger_scraper_enrichment(listings, store) + scorer = TrustScorer(store) trust_scores_list = scorer.score_batch(listings, q) - # Market comp written by comps_adapter — read from a fresh connection to avoid - # cross-thread connection reuse - comp_store = Store(_DB_PATH) query_hash = hashlib.md5(q.encode()).hexdigest() - comp = comp_store.get_market_comp("ebay", query_hash) + comp = store.get_market_comp("ebay", query_hash) market_price = comp.median_price if comp else None # Serialize — keyed by platform_listing_id for easy Vue lookup @@ -113,4 +279,117 @@ def search( "trust_scores": trust_map, "sellers": seller_map, "market_price": market_price, + "adapter_used": adapter_used, } + + +# ── On-demand enrichment ────────────────────────────────────────────────────── + +@app.post("/api/enrich") +def enrich_seller(seller: str, listing_id: str, query: str = ""): + """Synchronous on-demand enrichment for a single seller + re-score. + + Runs enrichment paths in parallel: + - Shopping API GetUserProfile (fast, ~500ms) — account_age_days if API creds present + - BTF /itm/ Playwright scrape (~20s) — account_age_days fallback + - _ssn Playwright scrape (~20s) — category_history_json + + BTF and _ssn run concurrently; total wall time ~20s when Playwright needed. + Returns the updated trust_score and seller so the frontend can patch in-place. 
+ """ + import threading + store = Store(_DB_PATH) + + seller_obj = store.get_seller("ebay", seller) + if not seller_obj: + raise HTTPException(status_code=404, detail=f"Seller '{seller}' not found") + + # Fast path: Shopping API for account age (inline, no Playwright) + try: + api_adapter = _make_adapter(store, "api") + if hasattr(api_adapter, "enrich_sellers_shopping_api"): + api_adapter.enrich_sellers_shopping_api([seller]) + except Exception: + pass # no API creds — fall through to BTF + + seller_obj = store.get_seller("ebay", seller) + needs_btf = seller_obj is not None and seller_obj.account_age_days is None + needs_categories = seller_obj is None or seller_obj.category_history_json in ("{}", "", None) + + # Slow path: Playwright for remaining gaps (BTF + _ssn in parallel threads) + if needs_btf or needs_categories: + scraper = ScrapedEbayAdapter(Store(_DB_PATH)) + errors: list[Exception] = [] + + def _btf(): + try: + scraper.enrich_sellers_btf({seller: listing_id}, max_workers=1) + except Exception as e: + errors.append(e) + + def _ssn(): + try: + ScrapedEbayAdapter(Store(_DB_PATH)).enrich_sellers_categories([seller], max_workers=1) + except Exception as e: + errors.append(e) + + threads = [] + if needs_btf: + threads.append(threading.Thread(target=_btf, daemon=True)) + if needs_categories: + threads.append(threading.Thread(target=_ssn, daemon=True)) + for t in threads: + t.start() + for t in threads: + t.join(timeout=60) + + if errors: + log.warning("enrich_seller: %d scrape error(s): %s", len(errors), errors[0]) + + # Re-fetch listing with staging fields, re-score + staged = store.get_listings_staged("ebay", [listing_id]) + listing = staged.get(listing_id) + if not listing: + raise HTTPException(status_code=404, detail=f"Listing '{listing_id}' not found") + + scorer = TrustScorer(store) + trust_list = scorer.score_batch([listing], query or listing.title) + trust = trust_list[0] if trust_list else None + + seller_final = store.get_seller("ebay", seller) + 
return { + "trust_score": dataclasses.asdict(trust) if trust else None, + "seller": dataclasses.asdict(seller_final) if seller_final else None, + } + + +# ── Saved Searches ──────────────────────────────────────────────────────────── + +class SavedSearchCreate(BaseModel): + name: str + query: str + filters_json: str = "{}" + + +@app.get("/api/saved-searches") +def list_saved_searches(): + return {"saved_searches": [dataclasses.asdict(s) for s in Store(_DB_PATH).list_saved_searches()]} + + +@app.post("/api/saved-searches", status_code=201) +def create_saved_search(body: SavedSearchCreate): + created = Store(_DB_PATH).save_saved_search( + SavedSearchModel(name=body.name, query=body.query, platform="ebay", filters_json=body.filters_json) + ) + return dataclasses.asdict(created) + + +@app.delete("/api/saved-searches/{saved_id}", status_code=204) +def delete_saved_search(saved_id: int): + Store(_DB_PATH).delete_saved_search(saved_id) + + +@app.patch("/api/saved-searches/{saved_id}/run") +def mark_saved_search_run(saved_id: int): + Store(_DB_PATH).update_saved_search_last_run(saved_id) + return {"ok": True} diff --git a/app/db/migrations/004_staging_tracking.sql b/app/db/migrations/004_staging_tracking.sql new file mode 100644 index 0000000..7ee4225 --- /dev/null +++ b/app/db/migrations/004_staging_tracking.sql @@ -0,0 +1,24 @@ +-- Staging DB: persistent listing tracking across searches. +-- Adds temporal metadata to listings so we can detect stale/repriced/recurring items. 
+ +ALTER TABLE listings ADD COLUMN first_seen_at TEXT; +ALTER TABLE listings ADD COLUMN last_seen_at TEXT; +ALTER TABLE listings ADD COLUMN times_seen INTEGER NOT NULL DEFAULT 1; +ALTER TABLE listings ADD COLUMN price_at_first_seen REAL; + +-- Backfill existing rows so columns are non-null where we have data +UPDATE listings SET + first_seen_at = fetched_at, + last_seen_at = fetched_at, + price_at_first_seen = price +WHERE first_seen_at IS NULL; + +-- Price history: append-only snapshots; one row per (listing, price) change. +-- Duplicate prices are ignored (INSERT OR IGNORE) so only transitions are recorded. +CREATE TABLE IF NOT EXISTS listing_price_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + listing_id INTEGER NOT NULL REFERENCES listings(id), + price REAL NOT NULL, + captured_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(listing_id, price) +); diff --git a/app/db/migrations/005_listing_category.sql b/app/db/migrations/005_listing_category.sql new file mode 100644 index 0000000..2d0ca4c --- /dev/null +++ b/app/db/migrations/005_listing_category.sql @@ -0,0 +1,3 @@ +-- Add per-listing category name, extracted from eBay API response. +-- Used to derive seller category_history_json without _ssn scraping. +ALTER TABLE listings ADD COLUMN category_name TEXT; diff --git a/app/db/models.py b/app/db/models.py index 37ca916..5d1d2a1 100644 --- a/app/db/models.py +++ b/app/db/models.py @@ -34,6 +34,12 @@ class Listing: id: Optional[int] = None fetched_at: Optional[str] = None trust_score_id: Optional[int] = None + category_name: Optional[str] = None # leaf category from eBay API (e.g. 
"Graphics/Video Cards") + # Staging DB fields — populated from DB after upsert + first_seen_at: Optional[str] = None + last_seen_at: Optional[str] = None + times_seen: int = 1 + price_at_first_seen: Optional[float] = None @dataclass diff --git a/app/db/store.py b/app/db/store.py index 6ece60d..19a2984 100644 --- a/app/db/store.py +++ b/app/db/store.py @@ -7,7 +7,7 @@ from typing import Optional from circuitforge_core.db import get_connection, run_migrations -from .models import Listing, Seller, TrustScore, MarketComp +from .models import Listing, Seller, TrustScore, MarketComp, SavedSearch MIGRATIONS_DIR = Path(__file__).parent / "migrations" @@ -19,6 +19,18 @@ class Store: # --- Seller --- + def delete_seller_data(self, platform: str, platform_seller_id: str) -> None: + """Permanently erase a seller and all their listings — GDPR/eBay deletion compliance.""" + self._conn.execute( + "DELETE FROM sellers WHERE platform=? AND platform_seller_id=?", + (platform, platform_seller_id), + ) + self._conn.execute( + "DELETE FROM listings WHERE platform=? AND seller_platform_id=?", + (platform, platform_seller_id), + ) + self._conn.commit() + def save_seller(self, seller: Seller) -> None: self.save_sellers([seller]) @@ -47,31 +59,141 @@ class Store: return None return Seller(*row[:7], id=row[7], fetched_at=row[8]) + def refresh_seller_categories(self, platform: str, seller_ids: list[str]) -> int: + """Derive category_history_json for sellers that lack it by aggregating + their stored listings' category_name values. + + Returns the count of sellers updated. + """ + from app.platforms.ebay.scraper import _classify_category_label # lazy to avoid circular + + if not seller_ids: + return 0 + updated = 0 + for sid in seller_ids: + seller = self.get_seller(platform, sid) + if not seller or seller.category_history_json not in ("{}", "", None): + continue # already enriched + rows = self._conn.execute( + "SELECT category_name, COUNT(*) FROM listings " + "WHERE platform=? 
AND seller_platform_id=? AND category_name IS NOT NULL " + "GROUP BY category_name", + (platform, sid), + ).fetchall() + if not rows: + continue + counts: dict[str, int] = {} + for cat_name, cnt in rows: + key = _classify_category_label(cat_name) + if key: + counts[key] = counts.get(key, 0) + cnt + if counts: + from dataclasses import replace + updated_seller = replace(seller, category_history_json=json.dumps(counts)) + self.save_seller(updated_seller) + updated += 1 + return updated + # --- Listing --- def save_listing(self, listing: Listing) -> None: self.save_listings([listing]) def save_listings(self, listings: list[Listing]) -> None: + """Upsert listings, preserving first_seen_at and price_at_first_seen on conflict. + + Uses INSERT ... ON CONFLICT DO UPDATE (SQLite 3.24+) so row IDs are stable + across searches — trust_score FK references survive re-indexing. + times_seen and last_seen_at accumulate on every sighting. + """ + now = datetime.now(timezone.utc).isoformat() self._conn.executemany( - "INSERT OR REPLACE INTO listings " - "(platform, platform_listing_id, title, price, currency, condition, " - "seller_platform_id, url, photo_urls, listing_age_days, buying_format, ends_at) " - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", + """ + INSERT INTO listings + (platform, platform_listing_id, title, price, currency, condition, + seller_platform_id, url, photo_urls, listing_age_days, buying_format, + ends_at, first_seen_at, last_seen_at, times_seen, price_at_first_seen, + category_name) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?) 
+ ON CONFLICT(platform, platform_listing_id) DO UPDATE SET + title = excluded.title, + price = excluded.price, + condition = excluded.condition, + seller_platform_id = excluded.seller_platform_id, + url = excluded.url, + photo_urls = excluded.photo_urls, + listing_age_days = excluded.listing_age_days, + buying_format = excluded.buying_format, + ends_at = excluded.ends_at, + last_seen_at = excluded.last_seen_at, + times_seen = times_seen + 1, + category_name = COALESCE(excluded.category_name, category_name) + -- first_seen_at and price_at_first_seen intentionally preserved + """, [ (l.platform, l.platform_listing_id, l.title, l.price, l.currency, l.condition, l.seller_platform_id, l.url, - json.dumps(l.photo_urls), l.listing_age_days, l.buying_format, l.ends_at) + json.dumps(l.photo_urls), l.listing_age_days, l.buying_format, l.ends_at, + now, now, l.price, l.category_name) + for l in listings + ], + ) + # Record price snapshots — INSERT OR IGNORE means only price changes land + self._conn.executemany( + """ + INSERT OR IGNORE INTO listing_price_history (listing_id, price, captured_at) + SELECT id, ?, ? FROM listings + WHERE platform=? AND platform_listing_id=? + """, + [ + (l.price, now, l.platform, l.platform_listing_id) for l in listings ], ) self._conn.commit() + def get_listings_staged(self, platform: str, platform_listing_ids: list[str]) -> dict[str, "Listing"]: + """Bulk fetch listings by platform_listing_id, returning staging fields. + + Returns a dict keyed by platform_listing_id. Used to hydrate freshly-normalised + listing objects after save_listings() so trust scoring sees times_seen, + first_seen_at, price_at_first_seen, and the DB-assigned id. + """ + if not platform_listing_ids: + return {} + placeholders = ",".join("?" 
* len(platform_listing_ids)) + rows = self._conn.execute( + f"SELECT platform, platform_listing_id, title, price, currency, condition, " + f"seller_platform_id, url, photo_urls, listing_age_days, id, fetched_at, " + f"buying_format, ends_at, first_seen_at, last_seen_at, times_seen, price_at_first_seen, " + f"category_name " + f"FROM listings WHERE platform=? AND platform_listing_id IN ({placeholders})", + [platform] + list(platform_listing_ids), + ).fetchall() + result: dict[str, Listing] = {} + for row in rows: + pid = row[1] + result[pid] = Listing( + *row[:8], + photo_urls=json.loads(row[8]), + listing_age_days=row[9], + id=row[10], + fetched_at=row[11], + buying_format=row[12] or "fixed_price", + ends_at=row[13], + first_seen_at=row[14], + last_seen_at=row[15], + times_seen=row[16] or 1, + price_at_first_seen=row[17], + category_name=row[18], + ) + return result + def get_listing(self, platform: str, platform_listing_id: str) -> Optional[Listing]: row = self._conn.execute( "SELECT platform, platform_listing_id, title, price, currency, condition, " "seller_platform_id, url, photo_urls, listing_age_days, id, fetched_at, " - "buying_format, ends_at " + "buying_format, ends_at, first_seen_at, last_seen_at, times_seen, price_at_first_seen " "FROM listings WHERE platform=? 
AND platform_listing_id=?", (platform, platform_listing_id), ).fetchone() @@ -85,6 +207,10 @@ class Store: fetched_at=row[11], buying_format=row[12] or "fixed_price", ends_at=row[13], + first_seen_at=row[14], + last_seen_at=row[15], + times_seen=row[16] or 1, + price_at_first_seen=row[17], ) # --- MarketComp --- @@ -99,6 +225,44 @@ class Store: ) self._conn.commit() + # --- SavedSearch --- + + def save_saved_search(self, s: SavedSearch) -> SavedSearch: + cur = self._conn.execute( + "INSERT INTO saved_searches (name, query, platform, filters_json) VALUES (?,?,?,?)", + (s.name, s.query, s.platform, s.filters_json), + ) + self._conn.commit() + row = self._conn.execute( + "SELECT id, created_at FROM saved_searches WHERE id=?", (cur.lastrowid,) + ).fetchone() + return SavedSearch( + name=s.name, query=s.query, platform=s.platform, + filters_json=s.filters_json, id=row[0], created_at=row[1], + ) + + def list_saved_searches(self) -> list[SavedSearch]: + rows = self._conn.execute( + "SELECT name, query, platform, filters_json, id, created_at, last_run_at " + "FROM saved_searches ORDER BY created_at DESC" + ).fetchall() + return [ + SavedSearch(name=r[0], query=r[1], platform=r[2], filters_json=r[3], + id=r[4], created_at=r[5], last_run_at=r[6]) + for r in rows + ] + + def delete_saved_search(self, saved_id: int) -> None: + self._conn.execute("DELETE FROM saved_searches WHERE id=?", (saved_id,)) + self._conn.commit() + + def update_saved_search_last_run(self, saved_id: int) -> None: + self._conn.execute( + "UPDATE saved_searches SET last_run_at=? 
WHERE id=?", + (datetime.now(timezone.utc).isoformat(), saved_id), + ) + self._conn.commit() + def get_market_comp(self, platform: str, query_hash: str) -> Optional[MarketComp]: row = self._conn.execute( "SELECT platform, query_hash, median_price, sample_count, expires_at, id, fetched_at " diff --git a/app/platforms/__init__.py b/app/platforms/__init__.py index 69b1fae..93bd054 100644 --- a/app/platforms/__init__.py +++ b/app/platforms/__init__.py @@ -15,6 +15,7 @@ class SearchFilters: pages: int = 1 # number of result pages to fetch (48 listings/page) must_include: list[str] = field(default_factory=list) # client-side title filter must_exclude: list[str] = field(default_factory=list) # forwarded to eBay -term AND client-side + category_id: Optional[str] = None # eBay category ID (e.g. "27386" = GPUs) class PlatformAdapter(ABC): diff --git a/app/platforms/ebay/adapter.py b/app/platforms/ebay/adapter.py index 6fadec0..25d4a93 100644 --- a/app/platforms/ebay/adapter.py +++ b/app/platforms/ebay/adapter.py @@ -1,16 +1,58 @@ """eBay Browse API adapter.""" from __future__ import annotations import hashlib +import logging +from dataclasses import replace from datetime import datetime, timedelta, timezone from typing import Optional import requests +log = logging.getLogger(__name__) + +_SHOPPING_BASE = "https://open.api.ebay.com/shopping" + +# Rate limiting for Shopping API GetUserProfile calls. +# Enrichment is incremental — these caps spread API calls across multiple +# searches rather than bursting on first encounter with a new seller batch. 
+_SHOPPING_API_MAX_PER_SEARCH = 5 # sellers enriched per search call +_SHOPPING_API_INTER_REQUEST_DELAY = 0.5 # seconds between successive calls +_SELLER_ENRICH_TTL_HOURS = 24 # skip re-enrichment within this window + from app.db.models import Listing, Seller, MarketComp from app.db.store import Store from app.platforms import PlatformAdapter, SearchFilters from app.platforms.ebay.auth import EbayTokenManager from app.platforms.ebay.normaliser import normalise_listing, normalise_seller +_BROWSE_LIMIT = 200 # max items per Browse API page +_INSIGHTS_BASE = { + "production": "https://api.ebay.com/buy/marketplace_insights/v1_beta", + "sandbox": "https://api.sandbox.ebay.com/buy/marketplace_insights/v1_beta", +} + + +def _build_browse_query(base_query: str, or_groups: list[list[str]], must_exclude: list[str]) -> str: + """Convert OR groups + exclusions into Browse API boolean query syntax. + + Browse API uses SQL-like boolean: AND (implicit), OR (keyword), NOT (keyword). + Parentheses work as grouping operators. + Example: 'GPU (16gb OR 24gb OR 48gb) (nvidia OR rtx OR geforce) NOT "parts only"' + """ + parts = [base_query.strip()] + for group in or_groups: + clean = [t.strip() for t in group if t.strip()] + if len(clean) == 1: + parts.append(clean[0]) + elif len(clean) > 1: + parts.append(f"({' OR '.join(clean)})") + for term in must_exclude: + term = term.strip() + if term: + # Use minus syntax (-term / -"phrase") — Browse API's NOT keyword + # over-filters dramatically in practice; minus works like web search negatives. 
+ parts.append(f'-"{term}"' if " " in term else f"-{term}") + return " ".join(p for p in parts if p) + BROWSE_BASE = { "production": "https://api.ebay.com/buy/browse/v1", "sandbox": "https://api.sandbox.ebay.com/buy/browse/v1", @@ -25,29 +67,146 @@ class EbayAdapter(PlatformAdapter): def __init__(self, token_manager: EbayTokenManager, store: Store, env: str = "production"): self._tokens = token_manager self._store = store + self._env = env self._browse_base = BROWSE_BASE[env] def _headers(self) -> dict: return {"Authorization": f"Bearer {self._tokens.get_token()}"} def search(self, query: str, filters: SearchFilters) -> list[Listing]: - params: dict = {"q": query, "limit": 50} - filter_parts = [] + # Build Browse API boolean query from OR groups + exclusions + browse_q = _build_browse_query(query, getattr(filters, "or_groups", []), filters.must_exclude) + + filter_parts: list[str] = [] if filters.max_price: filter_parts.append(f"price:[..{filters.max_price}],priceCurrency:USD") + if filters.min_price: + filter_parts.append(f"price:[{filters.min_price}..],priceCurrency:USD") if filters.condition: - cond_map = {"new": "NEW", "used": "USED", "open box": "OPEN_BOX", "for parts": "FOR_PARTS_NOT_WORKING"} + cond_map = { + "new": "NEW", "used": "USED", + "open box": "OPEN_BOX", "for parts": "FOR_PARTS_NOT_WORKING", + } ebay_conds = [cond_map[c] for c in filters.condition if c in cond_map] if ebay_conds: filter_parts.append(f"conditions:{{{','.join(ebay_conds)}}}") - if filter_parts: - params["filter"] = ",".join(filter_parts) - resp = requests.get(f"{self._browse_base}/item_summary/search", - headers=self._headers(), params=params) - resp.raise_for_status() - items = resp.json().get("itemSummaries", []) - return [normalise_listing(item) for item in items] + base_params: dict = {"q": browse_q, "limit": _BROWSE_LIMIT} + if filter_parts: + base_params["filter"] = ",".join(filter_parts) + if filters.category_id: + base_params["category_ids"] = filters.category_id + + pages = 
max(1, filters.pages) + seen_ids: set[str] = set() + listings: list[Listing] = [] + sellers_to_save: dict[str, Seller] = {} + + for page in range(pages): + params = {**base_params, "offset": page * _BROWSE_LIMIT} + resp = requests.get( + f"{self._browse_base}/item_summary/search", + headers=self._headers(), + params=params, + ) + resp.raise_for_status() + data = resp.json() + items = data.get("itemSummaries", []) + if not items: + break # no more results + + for item in items: + listing = normalise_listing(item) + if listing.platform_listing_id not in seen_ids: + seen_ids.add(listing.platform_listing_id) + listings.append(listing) + # Extract inline seller data available in item_summary + seller_raw = item.get("seller", {}) + if seller_raw.get("username") and seller_raw["username"] not in sellers_to_save: + sellers_to_save[seller_raw["username"]] = normalise_seller(seller_raw) + + if not data.get("next"): + break # Browse API paginates via "next" href; absence = last page + + if sellers_to_save: + self._store.save_sellers(list(sellers_to_save.values())) + + # Enrich sellers missing account_age_days via Shopping API (fast HTTP, no Playwright). + # Capped at _SHOPPING_API_MAX_PER_SEARCH to avoid bursting the daily quota when + # many new sellers appear in a single search batch. + needs_age = [s.platform_seller_id for s in sellers_to_save.values() + if s.account_age_days is None] + if needs_age: + self.enrich_sellers_shopping_api(needs_age[:_SHOPPING_API_MAX_PER_SEARCH]) + + return listings + + def enrich_sellers_shopping_api(self, usernames: list[str]) -> None: + """Fetch RegistrationDate for sellers via Shopping API GetUserProfile. + + Uses app-level Bearer token — no user OAuth required. Silently skips + on rate limit (error 1.21) or any other failure so the search response + is never blocked. BTF scraping remains the fallback for the scraper adapter. 
+ + Rate limiting: _SHOPPING_API_INTER_REQUEST_DELAY between calls; sellers + enriched within _SELLER_ENRICH_TTL_HOURS are skipped (account age doesn't + change day to day). Callers should already cap the list length. + """ + token = self._tokens.get_token() + headers = { + "X-EBAY-API-IAF-TOKEN": f"Bearer {token}", + "User-Agent": "Mozilla/5.0", + } + cutoff = datetime.now(timezone.utc) - timedelta(hours=_SELLER_ENRICH_TTL_HOURS) + first = True + for username in usernames: + try: + # Skip recently enriched sellers — account age doesn't change daily. + seller = self._store.get_seller("ebay", username) + if seller and seller.fetched_at: + try: + ft = datetime.fromisoformat(seller.fetched_at.replace("Z", "+00:00")) + if ft.tzinfo is None: + ft = ft.replace(tzinfo=timezone.utc) + if ft > cutoff and seller.account_age_days is not None: + continue + except ValueError: + pass + + if not first: + import time as _time + _time.sleep(_SHOPPING_API_INTER_REQUEST_DELAY) + first = False + + resp = requests.get( + _SHOPPING_BASE, + headers=headers, + params={ + "callname": "GetUserProfile", + "appid": self._tokens.client_id, + "siteid": "0", + "version": "967", + "UserID": username, + "responseencoding": "JSON", + }, + timeout=10, + ) + data = resp.json() + if data.get("Ack") != "Success": + errors = data.get("Errors", []) + if any(e.get("ErrorCode") == "1.21" for e in errors): + log.debug("Shopping API rate-limited for %s — BTF fallback", username) + continue + reg_date = data.get("User", {}).get("RegistrationDate") + if reg_date: + dt = datetime.fromisoformat(reg_date.replace("Z", "+00:00")) + age_days = (datetime.now(timezone.utc) - dt).days + seller = self._store.get_seller("ebay", username) + if seller: + self._store.save_seller(replace(seller, account_age_days=age_days)) + log.debug("Shopping API: %s registered %d days ago", username, age_days) + except Exception as e: + log.debug("Shopping API enrich failed for %s: %s", username, e) def get_seller(self, 
seller_platform_id: str) -> Optional[Seller]: cached = self._store.get_seller("ebay", seller_platform_id) @@ -69,30 +228,62 @@ class EbayAdapter(PlatformAdapter): except Exception: return None # Caller handles None gracefully (partial score) - def get_completed_sales(self, query: str) -> list[Listing]: + def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]: query_hash = hashlib.md5(query.encode()).hexdigest() - cached = self._store.get_market_comp("ebay", query_hash) - if cached: - return [] # Comp data is used directly; return empty to signal cache hit + if self._store.get_market_comp("ebay", query_hash): + return [] # cache hit - params = {"q": query, "limit": 20, "filter": "buyingOptions:{FIXED_PRICE}"} + prices: list[float] = [] try: - resp = requests.get(f"{self._browse_base}/item_summary/search", - headers=self._headers(), params=params) + # Marketplace Insights API returns sold/completed items — best source for comps. + # Falls back gracefully to Browse API active listings if the endpoint is + # unavailable (requires buy.marketplace.insights scope). + insights_base = _INSIGHTS_BASE.get(self._env, _INSIGHTS_BASE["production"]) + resp = requests.get( + f"{insights_base}/item_summary/search", + headers=self._headers(), + params={"q": query, "limit": 50, "filter": "buyingOptions:{FIXED_PRICE}"}, + ) + if resp.status_code in (403, 404): + # 403 = scope not granted; 404 = endpoint not available for this app tier. + # Both mean: fall back to active listing prices via Browse API. 
+ log.info("comps api: Marketplace Insights unavailable (%d), falling back to Browse API", resp.status_code) + raise PermissionError("Marketplace Insights not available") resp.raise_for_status() items = resp.json().get("itemSummaries", []) - listings = [normalise_listing(item) for item in items] - if listings: - prices = sorted(l.price for l in listings) - median = prices[len(prices) // 2] - comp = MarketComp( - platform="ebay", - query_hash=query_hash, - median_price=median, - sample_count=len(prices), - expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(), + prices = [float(i["lastSoldPrice"]["value"]) for i in items if "lastSoldPrice" in i] + log.info("comps api: Marketplace Insights returned %d items, %d with lastSoldPrice", len(items), len(prices)) + except PermissionError: + # Fallback: use active listing prices (less accurate but always available) + try: + resp = requests.get( + f"{self._browse_base}/item_summary/search", + headers=self._headers(), + params={"q": query, "limit": 50, "filter": "buyingOptions:{FIXED_PRICE}"}, ) - self._store.save_market_comp(comp) - return listings + resp.raise_for_status() + items = resp.json().get("itemSummaries", []) + prices = [float(i["price"]["value"]) for i in items if "price" in i] + log.info("comps api: Browse API fallback returned %d items, %d with price", len(items), len(prices)) + except Exception: + log.warning("comps api: Browse API fallback failed for %r", query, exc_info=True) + return [] except Exception: + log.warning("comps api: unexpected error for %r", query, exc_info=True) return [] + + if not prices: + log.warning("comps api: 0 valid prices extracted — no comp saved for %r", query) + return [] + + prices.sort() + n = len(prices) + median = (prices[n // 2 - 1] + prices[n // 2]) / 2 if n % 2 == 0 else prices[n // 2] + self._store.save_market_comp(MarketComp( + platform="ebay", + query_hash=query_hash, + median_price=median, + sample_count=n, + expires_at=(datetime.now(timezone.utc) + 
timedelta(hours=6)).isoformat(), + )) + return [] diff --git a/app/platforms/ebay/auth.py b/app/platforms/ebay/auth.py index 1a4df4b..f04c4cd 100644 --- a/app/platforms/ebay/auth.py +++ b/app/platforms/ebay/auth.py @@ -21,6 +21,10 @@ class EbayTokenManager: self._token: Optional[str] = None self._expires_at: float = 0.0 + @property + def client_id(self) -> str: + return self._client_id + def get_token(self) -> str: """Return a valid access token, fetching or refreshing as needed.""" if self._token and time.time() < self._expires_at - 60: diff --git a/app/platforms/ebay/normaliser.py b/app/platforms/ebay/normaliser.py index 5c2c637..99f4921 100644 --- a/app/platforms/ebay/normaliser.py +++ b/app/platforms/ebay/normaliser.py @@ -2,6 +2,7 @@ from __future__ import annotations import json from datetime import datetime, timezone +from typing import Optional from app.db.models import Listing, Seller @@ -41,6 +42,10 @@ def normalise_listing(raw: dict) -> Listing: except ValueError: pass + # Leaf category is categories[0] (most specific); parent path follows. 
+ categories = raw.get("categories", []) + category_name: Optional[str] = categories[0]["categoryName"] if categories else None + seller = raw.get("seller", {}) return Listing( platform="ebay", @@ -55,13 +60,14 @@ def normalise_listing(raw: dict) -> Listing: listing_age_days=listing_age_days, buying_format=buying_format, ends_at=ends_at, + category_name=category_name, ) def normalise_seller(raw: dict) -> Seller: feedback_pct = float(raw.get("feedbackPercentage", "0").strip("%")) / 100.0 - account_age_days = 0 + account_age_days: Optional[int] = None # None = registrationDate not in API response reg_date_raw = raw.get("registrationDate", "") if reg_date_raw: try: diff --git a/app/platforms/ebay/query_builder.py b/app/platforms/ebay/query_builder.py new file mode 100644 index 0000000..325dd36 --- /dev/null +++ b/app/platforms/ebay/query_builder.py @@ -0,0 +1,85 @@ +""" +Build eBay-compatible boolean search queries from OR groups. + +eBay honors parenthetical OR groups in the _nkw search parameter: + (term1,term2,term3) → must contain at least one of these terms + -term / -"phrase" → must NOT contain this term / phrase + space between groups → implicit AND + +expand_queries() generates one eBay query per term in the smallest OR group, +using eBay's OR syntax for all remaining groups. This guarantees coverage even +if eBay's relevance ranking would suppress some matches in a single combined query. 
+ +Example: + base = "GPU" + or_groups = [["16gb","24gb","40gb","48gb"], ["nvidia","quadro","rtx","geforce","titan"]] + → 4 queries (one per memory size, brand group as eBay OR): + "GPU 16gb (nvidia,quadro,rtx,geforce,titan)" + "GPU 24gb (nvidia,quadro,rtx,geforce,titan)" + "GPU 40gb (nvidia,quadro,rtx,geforce,titan)" + "GPU 48gb (nvidia,quadro,rtx,geforce,titan)" +""" +from __future__ import annotations + + +def _group_to_ebay(group: list[str]) -> str: + """Convert a list of alternatives to an eBay OR clause.""" + clean = [t.strip() for t in group if t.strip()] + if not clean: + return "" + if len(clean) == 1: + return clean[0] + return f"({','.join(clean)})" + + +def build_ebay_query(base_query: str, or_groups: list[list[str]]) -> str: + """ + Build a single eBay _nkw query string using eBay's parenthetical OR syntax. + Exclusions are handled separately via SearchFilters.must_exclude. + """ + parts = [base_query.strip()] + for group in or_groups: + clause = _group_to_ebay(group) + if clause: + parts.append(clause) + return " ".join(p for p in parts if p) + + +def expand_queries(base_query: str, or_groups: list[list[str]]) -> list[str]: + """ + Expand OR groups into one eBay query per term in the smallest group, + using eBay's OR syntax for all remaining groups. + + This guarantees every term in the pivot group is explicitly searched, + which prevents eBay's relevance engine from silently skipping rare variants. + Falls back to a single query when there are no OR groups. 
+ """ + if not or_groups: + return [base_query.strip()] + + # Pivot on the smallest group to minimise the number of Playwright calls + smallest_idx = min(range(len(or_groups)), key=lambda i: len(or_groups[i])) + pivot = or_groups[smallest_idx] + rest = [g for i, g in enumerate(or_groups) if i != smallest_idx] + + queries = [] + for term in pivot: + q = build_ebay_query(base_query, [[term]] + rest) + queries.append(q) + return queries + + +def parse_groups(raw: str) -> list[list[str]]: + """ + Parse a Groups-mode must_include string into nested OR groups. + + Format: comma separates groups (AND), pipe separates alternatives within a group (OR). + "16gb|24gb|48gb, nvidia|rtx|geforce" + → [["16gb","24gb","48gb"], ["nvidia","rtx","geforce"]] + """ + groups = [] + for chunk in raw.split(","): + alts = [t.strip().lower() for t in chunk.split("|") if t.strip()] + if alts: + groups.append(alts) + return groups diff --git a/app/platforms/ebay/scraper.py b/app/platforms/ebay/scraper.py index d493805..3d97024 100644 --- a/app/platforms/ebay/scraper.py +++ b/app/platforms/ebay/scraper.py @@ -3,8 +3,8 @@ Data available from search results HTML (single page load): ✅ title, price, condition, photos, URL ✅ seller username, feedback count, feedback ratio - ❌ account registration date → account_age_score = None (score_is_partial) - ❌ category history → category_history_score = None (score_is_partial) + ❌ account registration date → enriched async via BTF /itm/ scrape + ❌ category history → enriched async via _ssn seller search page This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores. 
""" @@ -12,12 +12,16 @@ from __future__ import annotations import hashlib import itertools +import json +import logging import re import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timedelta, timezone from typing import Optional +log = logging.getLogger(__name__) + from bs4 import BeautifulSoup from app.db.models import Listing, MarketComp, Seller @@ -25,7 +29,12 @@ from app.db.store import Store from app.platforms import PlatformAdapter, SearchFilters EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html" +EBAY_ITEM_URL = "https://www.ebay.com/itm/" _HTML_CACHE_TTL = 300 # seconds — 5 minutes +_JOINED_RE = re.compile(r"Joined\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+(\d{4})", re.I) +_MONTH_MAP = {m: i+1 for i, m in enumerate( + ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"] +)} # Module-level cache persists across per-request adapter instantiations. # Keyed by URL; value is (html, expiry_timestamp). @@ -53,6 +62,25 @@ _FEEDBACK_RE = re.compile(r"([\d.]+)%\s+positive\s+\(([0-9,]+)\)", re.I) _PRICE_RE = re.compile(r"[\d,]+\.?\d*") _ITEM_ID_RE = re.compile(r"/itm/(\d+)") _TIME_LEFT_RE = re.compile(r"(?:(\d+)d\s*)?(?:(\d+)h\s*)?(?:(\d+)m\s*)?(?:(\d+)s\s*)?left", re.I) +_PARENS_COUNT_RE = re.compile(r"\((\d{1,6})\)") + +# Maps title-keyword fragments → internal MetadataScorer category keys. +# Checked in order — first match wins. Broader terms intentionally listed last. 
+_CATEGORY_KEYWORDS: list[tuple[frozenset[str], str]] = [ + (frozenset(["cell phone", "smartphone", "mobile phone"]), "CELL_PHONES"), + (frozenset(["video game", "gaming", "console", "playstation", "xbox", "nintendo"]), "VIDEO_GAMES"), + (frozenset(["computer", "tablet", "laptop", "notebook", "chromebook"]), "COMPUTERS_TABLETS"), + (frozenset(["electronic"]), "ELECTRONICS"), +] + + +def _classify_category_label(text: str) -> Optional[str]: + """Map an eBay category label to an internal MetadataScorer key, or None.""" + lower = text.lower() + for keywords, key in _CATEGORY_KEYWORDS: + if any(kw in lower for kw in keywords): + return key + return None # --------------------------------------------------------------------------- @@ -215,6 +243,33 @@ def scrape_sellers(html: str) -> dict[str, Seller]: return sellers +def scrape_seller_categories(html: str) -> dict[str, int]: + """Parse category distribution from a seller's _ssn search page. + + eBay renders category refinements in the left sidebar. We scan all + anchor-text blocks for recognisable category labels and accumulate + listing counts from the adjacent parenthetical "(N)" strings. + + Returns a dict like {"ELECTRONICS": 45, "CELL_PHONES": 23}. + Empty dict = no recognisable categories found (score stays None). + """ + soup = BeautifulSoup(html, "lxml") + counts: dict[str, int] = {} + + # eBay sidebar refinement links contain the category label and a count. + # Multiple layout variants exist — scan broadly and classify by keyword. 
+ for el in soup.select("a[href*='_sacat='], li.x-refine__main__list--value a"): + text = el.get_text(separator=" ", strip=True) + key = _classify_category_label(text) + if not key: + continue + m = _PARENS_COUNT_RE.search(text) + count = int(m.group(1)) if m else 1 + counts[key] = counts.get(key, 0) + count + + return counts + + # --------------------------------------------------------------------------- # Adapter # --------------------------------------------------------------------------- @@ -232,17 +287,12 @@ class ScrapedEbayAdapter(PlatformAdapter): self._store = store self._delay = delay - def _get(self, params: dict) -> str: - """Fetch eBay search HTML via a stealthed Playwright Chromium instance. + def _fetch_url(self, url: str) -> str: + """Core Playwright fetch — stealthed headed Chromium via Xvfb. - Uses Xvfb virtual display (headless=False) to avoid Kasada's headless - detection — same pattern as other CF scrapers that face JS challenges. - - Results are cached for _HTML_CACHE_TTL seconds so repeated searches - for the same query return immediately without re-scraping. + Shared by both search (_get) and BTF item-page enrichment (_fetch_item_html). + Results cached for _HTML_CACHE_TTL seconds. """ - url = EBAY_SEARCH_URL + "?" + "&".join(f"{k}={v}" for k, v in params.items()) - cached = _html_cache.get(url) if cached and time.time() < cached[1]: return cached[0] @@ -286,8 +336,100 @@ class ScrapedEbayAdapter(PlatformAdapter): _html_cache[url] = (html, time.time() + _HTML_CACHE_TTL) return html + def _get(self, params: dict) -> str: + """Fetch eBay search results HTML. params → query string appended to EBAY_SEARCH_URL.""" + url = EBAY_SEARCH_URL + "?" + "&".join(f"{k}={v}" for k, v in params.items()) + return self._fetch_url(url) + + def _fetch_item_html(self, item_id: str) -> str: + """Fetch a single eBay listing page. 
/itm/ pages pass Kasada; /usr/ pages do not.""" + return self._fetch_url(f"{EBAY_ITEM_URL}{item_id}") + + @staticmethod + def _parse_joined_date(html: str) -> Optional[int]: + """Parse 'Joined {Mon} {Year}' from a listing page BTF seller card. + + Returns account_age_days (int) or None if the date is not found. + eBay renders this as a span.ux-textspans inside the seller section. + """ + m = _JOINED_RE.search(html) + if not m: + return None + month_str, year_str = m.group(1)[:3].capitalize(), m.group(2) + month = _MONTH_MAP.get(month_str) + if not month: + return None + try: + reg_date = datetime(int(year_str), month, 1, tzinfo=timezone.utc) + return (datetime.now(timezone.utc) - reg_date).days + except ValueError: + return None + + def enrich_sellers_btf( + self, + seller_to_listing: dict[str, str], + max_workers: int = 2, + ) -> None: + """Background BTF enrichment — scrape /itm/ pages to fill in account_age_days. + + seller_to_listing: {seller_platform_id -> platform_listing_id} + Only pass sellers whose account_age_days is None (unknown from API batch). + Caller limits the dict to new/stale sellers to avoid redundant scrapes. + + Runs Playwright fetches in a thread pool (max_workers=2 by default to + avoid hammering Kasada). Updates seller records in the DB in-place. + Does not raise — failures per-seller are silently skipped so the main + search response is never blocked. 
+ """ + def _enrich_one(item: tuple[str, str]) -> None: + seller_id, listing_id = item + try: + html = self._fetch_item_html(listing_id) + age_days = self._parse_joined_date(html) + if age_days is not None: + seller = self._store.get_seller("ebay", seller_id) + if seller: + from dataclasses import replace + updated = replace(seller, account_age_days=age_days) + self._store.save_seller(updated) + except Exception: + pass # non-fatal: partial score is better than a crashed enrichment + + with ThreadPoolExecutor(max_workers=max_workers) as ex: + list(ex.map(_enrich_one, seller_to_listing.items())) + + def enrich_sellers_categories( + self, + seller_platform_ids: list[str], + max_workers: int = 2, + ) -> None: + """Scrape _ssn seller pages to populate category_history_json. + + Uses the same headed Playwright stack as search() — the _ssn=USERNAME + filter is just a query param on the standard search template, so it + passes Kasada identically. Silently skips on failure so the main + search response is never affected. 
+ """ + def _enrich_one(seller_id: str) -> None: + try: + html = self._get({"_ssn": seller_id, "_sop": "12", "_ipg": "48"}) + categories = scrape_seller_categories(html) + if categories: + seller = self._store.get_seller("ebay", seller_id) + if seller: + from dataclasses import replace + updated = replace(seller, category_history_json=json.dumps(categories)) + self._store.save_seller(updated) + except Exception: + pass + + with ThreadPoolExecutor(max_workers=max_workers) as ex: + list(ex.map(_enrich_one, seller_platform_ids)) + def search(self, query: str, filters: SearchFilters) -> list[Listing]: base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"} + if filters.category_id: + base_params["_sacat"] = filters.category_id if filters.max_price: base_params["_udhi"] = str(filters.max_price) @@ -303,10 +445,15 @@ class ScrapedEbayAdapter(PlatformAdapter): base_params["LH_ItemCondition"] = "|".join(codes) # Append negative keywords to the eBay query — eBay supports "-term" in _nkw natively. - # This reduces junk results at the source and improves market comp quality. + # Multi-word phrases must be quoted: -"parts only" not -parts only (which splits the words). 
if filters.must_exclude: - excludes = " ".join(f"-{t.strip()}" for t in filters.must_exclude if t.strip()) - base_params["_nkw"] = f"{base_params['_nkw']} {excludes}" + parts = [] + for t in filters.must_exclude: + t = t.strip() + if not t: + continue + parts.append(f'-"{t}"' if " " in t else f"-{t}") + base_params["_nkw"] = f"{base_params['_nkw']} {' '.join(parts)}" pages = max(1, filters.pages) page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)] @@ -346,6 +493,7 @@ class ScrapedEbayAdapter(PlatformAdapter): pages = max(1, pages) page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)] + log.info("comps scrape: fetching %d page(s) of sold listings for %r", pages, query) try: with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex: htmls = list(ex.map(self._get, page_params)) @@ -369,6 +517,10 @@ class ScrapedEbayAdapter(PlatformAdapter): sample_count=len(prices), expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(), )) + log.info("comps scrape: saved market comp median=$%.2f from %d prices", median, len(prices)) + else: + log.warning("comps scrape: %d listings parsed but 0 valid prices — no comp saved", len(all_listings)) return all_listings except Exception: + log.warning("comps scrape: failed for %r", query, exc_info=True) return [] diff --git a/app/trust/__init__.py b/app/trust/__init__.py index 726912f..6a03785 100644 --- a/app/trust/__init__.py +++ b/app/trust/__init__.py @@ -4,6 +4,7 @@ from .aggregator import Aggregator from app.db.models import Seller, Listing, TrustScore from app.db.store import Store import hashlib +import math class TrustScorer: @@ -24,6 +25,16 @@ class TrustScorer: comp = self._store.get_market_comp("ebay", query_hash) market_median = comp.median_price if comp else None + # Coefficient of variation: stddev/mean across batch prices. + # None when fewer than 2 priced listings (can't compute variance). 
+ _prices = [l.price for l in listings if l.price > 0] + if len(_prices) >= 2: + _mean = sum(_prices) / len(_prices) + _stddev = math.sqrt(sum((p - _mean) ** 2 for p in _prices) / len(_prices)) + price_cv: float | None = _stddev / _mean if _mean > 0 else None + else: + price_cv = None + photo_url_sets = [l.photo_urls for l in listings] duplicates = self._photo.check_duplicates(photo_url_sets) @@ -31,11 +42,19 @@ class TrustScorer: for listing, is_dup in zip(listings, duplicates): seller = self._store.get_seller("ebay", listing.seller_platform_id) if seller: - signal_scores = self._meta.score(seller, market_median, listing.price) + signal_scores = self._meta.score(seller, market_median, listing.price, price_cv) else: signal_scores = {k: None for k in ["account_age", "feedback_count", "feedback_ratio", "price_vs_market", "category_history"]} - trust = self._agg.aggregate(signal_scores, is_dup, seller, listing.id or 0) + trust = self._agg.aggregate( + signal_scores, is_dup, seller, + listing_id=listing.id or 0, + listing_title=listing.title, + times_seen=listing.times_seen, + first_seen_at=listing.first_seen_at, + price=listing.price, + price_at_first_seen=listing.price_at_first_seen, + ) scores.append(trust) return scores diff --git a/app/trust/aggregator.py b/app/trust/aggregator.py index 27f51bc..934f0db 100644 --- a/app/trust/aggregator.py +++ b/app/trust/aggregator.py @@ -1,6 +1,7 @@ """Composite score and red flag extraction.""" from __future__ import annotations import json +from datetime import datetime, timezone from typing import Optional from app.db.models import Seller, TrustScore @@ -8,6 +9,55 @@ HARD_FILTER_AGE_DAYS = 7 HARD_FILTER_BAD_RATIO_MIN_COUNT = 20 HARD_FILTER_BAD_RATIO_THRESHOLD = 0.80 +# Title keywords that suggest cosmetic damage or wear (free-tier title scan). +# Description-body scan (paid BSL feature) runs via BTF enrichment — not implemented yet. 
+_SCRATCH_DENT_KEYWORDS = frozenset([
+    # Explicit cosmetic damage
+    "scratch", "scratched", "scratches", "scuff", "scuffed",
+    "dent", "dented", "ding", "dinged",
+    "crack", "cracked", "chip", "chipped",
+    "damage", "damaged", "cosmetic damage",
+    "blemish", "wear", "worn", "worn in",
+    # Parts / condition catch-alls
+    "as is", "for parts", "parts only", "spares or repair", "parts or repair",
+    # Evasive redirects — seller hiding damage detail in listing body
+    "see description", "read description", "read listing", "see listing",
+    "see photos for", "see pics for", "see images for",
+    # Functional problem phrases (phrases > single words to avoid false positives)
+    "issue with", "issues with", "problem with", "problems with",
+    "not working", "stopped working", "doesn't work", "does not work",
+    "no power", "dead on arrival", "powers on but", "turns on but", "boots but",
+    "faulty", "broken screen", "broken hinge", "broken port",
+    # DIY / project / repair listings
+    "needs repair", "needs work", "needs tlc",
+    "project unit", "project item", "project laptop", "project phone",
+    "for repair", "sold as is",
+])
+
+
+def _has_damage_keywords(title: str) -> bool:
+    lower = title.lower()
+    return any(kw in lower for kw in _SCRATCH_DENT_KEYWORDS)
+
+
+_LONG_ON_MARKET_MIN_SIGHTINGS = 5
+_LONG_ON_MARKET_MIN_DAYS = 14
+_PRICE_DROP_THRESHOLD = 0.20  # 20% below first-seen price
+
+
+def _days_since(iso: Optional[str]) -> Optional[int]:
+    if not iso:
+        return None
+    try:
+        dt = datetime.fromisoformat(iso.replace("Z", "+00:00"))
+        # Strip tzinfo so aware ISO strings and naive SQLite CURRENT_TIMESTAMP
+        # strings compare alike (replace() assumes the offset is already UTC).
+ if dt.tzinfo is not None: + dt = dt.replace(tzinfo=None) + return (datetime.utcnow() - dt).days + except ValueError: + return None + class Aggregator: def aggregate( @@ -16,10 +66,24 @@ class Aggregator: photo_hash_duplicate: bool, seller: Optional[Seller], listing_id: int = 0, + listing_title: str = "", + times_seen: int = 1, + first_seen_at: Optional[str] = None, + price: float = 0.0, + price_at_first_seen: Optional[float] = None, ) -> TrustScore: is_partial = any(v is None for v in signal_scores.values()) clean = {k: (v if v is not None else 0) for k, v in signal_scores.items()} - composite = sum(clean.values()) + + # Score only against signals that returned real data — treating "no data" + # as 0 conflates "bad signal" with "missing signal" and drags scores down + # unfairly when the API doesn't expose a field (e.g. registrationDate). + available = [v for v in signal_scores.values() if v is not None] + available_max = len(available) * 20 + if available_max > 0: + composite = round((sum(available) / available_max) * 100) + else: + composite = 0 red_flags: list[str] = [] @@ -41,6 +105,18 @@ class Aggregator: red_flags.append("suspicious_price") if photo_hash_duplicate: red_flags.append("duplicate_photo") + if listing_title and _has_damage_keywords(listing_title): + red_flags.append("scratch_dent_mentioned") + + # Staging DB signals + days_in_index = _days_since(first_seen_at) + if (times_seen >= _LONG_ON_MARKET_MIN_SIGHTINGS + and days_in_index is not None + and days_in_index >= _LONG_ON_MARKET_MIN_DAYS): + red_flags.append("long_on_market") + if (price_at_first_seen and price_at_first_seen > 0 + and price < price_at_first_seen * (1 - _PRICE_DROP_THRESHOLD)): + red_flags.append("significant_price_drop") return TrustScore( listing_id=listing_id, diff --git a/app/trust/metadata.py b/app/trust/metadata.py index 689b4ce..9ce88a6 100644 --- a/app/trust/metadata.py +++ b/app/trust/metadata.py @@ -6,6 +6,11 @@ from app.db.models import Seller ELECTRONICS_CATEGORIES = 
{"ELECTRONICS", "COMPUTERS_TABLETS", "VIDEO_GAMES", "CELL_PHONES"} +# Coefficient of variation (stddev/mean) above which the price distribution is +# considered too heterogeneous to trust the market median for scam detection. +# e.g. "Lenovo RTX intel" mixes $200 old ThinkPads with $2000 Legions → CV ~1.0+ +_HETEROGENEOUS_CV_THRESHOLD = 0.6 + class MetadataScorer: def score( @@ -13,12 +18,13 @@ class MetadataScorer: seller: Seller, market_median: Optional[float], listing_price: float, + price_cv: Optional[float] = None, ) -> dict[str, Optional[int]]: return { "account_age": self._account_age(seller.account_age_days) if seller.account_age_days is not None else None, "feedback_count": self._feedback_count(seller.feedback_count), "feedback_ratio": self._feedback_ratio(seller.feedback_ratio, seller.feedback_count), - "price_vs_market": self._price_vs_market(listing_price, market_median), + "price_vs_market": self._price_vs_market(listing_price, market_median, price_cv), "category_history": self._category_history(seller.category_history_json), } @@ -43,9 +49,11 @@ class MetadataScorer: if ratio < 0.98: return 15 return 20 - def _price_vs_market(self, price: float, median: Optional[float]) -> Optional[int]: + def _price_vs_market(self, price: float, median: Optional[float], price_cv: Optional[float] = None) -> Optional[int]: if median is None: return None # data unavailable → aggregator sets score_is_partial if median <= 0: return None + if price_cv is not None and price_cv > _HETEROGENEOUS_CV_THRESHOLD: + return None # mixed model/generation search — median is unreliable ratio = price / median if ratio < 0.50: return 0 # >50% below = scam if ratio < 0.70: return 5 # >30% below = suspicious @@ -53,11 +61,13 @@ class MetadataScorer: if ratio <= 1.20: return 20 return 15 # above market = still ok, just expensive - def _category_history(self, category_history_json: str) -> int: + def _category_history(self, category_history_json: str) -> Optional[int]: try: history = 
json.loads(category_history_json) except (ValueError, TypeError): - return 0 + return None # unparseable → data unavailable + if not history: + return None # empty dict → no category data from this source electronics_sales = sum( v for k, v in history.items() if k in ELECTRONICS_CATEGORIES ) diff --git a/app/ui/Search.py b/app/ui/Search.py index 2abf005..0b65650 100644 --- a/app/ui/Search.py +++ b/app/ui/Search.py @@ -48,8 +48,41 @@ def _get_adapter(store: Store) -> PlatformAdapter: return ScrapedEbayAdapter(store) +def _keyword_passes(title_lower: str, state: FilterState) -> bool: + """Apply must_include / must_exclude keyword filtering against a lowercased title.""" + include_raw = state.must_include.strip() + if include_raw: + mode = state.must_include_mode + if mode == "groups": + groups = [ + [alt.strip().lower() for alt in g.split("|") if alt.strip()] + for g in include_raw.split(",") + if any(alt.strip() for alt in g.split("|")) + ] + if not all(any(alt in title_lower for alt in group) for group in groups): + return False + elif mode == "any": + terms = [t.strip().lower() for t in include_raw.split(",") if t.strip()] + if not any(t in title_lower for t in terms): + return False + else: # "all" + terms = [t.strip().lower() for t in include_raw.split(",") if t.strip()] + if not all(t in title_lower for t in terms): + return False + + exclude_raw = state.must_exclude.strip() + if exclude_raw: + terms = [t.strip().lower() for t in exclude_raw.split(",") if t.strip()] + if any(t in title_lower for t in terms): + return False + + return True + + def _passes_filter(listing, trust, seller, state: FilterState) -> bool: import json + if not _keyword_passes(listing.title.lower(), state): + return False if trust and trust.composite_score < state.min_trust_score: return False if state.min_price and listing.price < state.min_price: diff --git a/app/ui/components/filters.py b/app/ui/components/filters.py index 94e54f5..6756939 100644 --- a/app/ui/components/filters.py +++ 
b/app/ui/components/filters.py @@ -33,6 +33,9 @@ class FilterState: hide_marketing_photos: bool = False hide_suspicious_price: bool = False hide_duplicate_photos: bool = False + must_include: str = "" + must_include_mode: str = "all" # "all" | "any" | "groups" + must_exclude: str = "" def build_filter_options( @@ -78,6 +81,29 @@ def render_filter_sidebar( st.sidebar.markdown("### Filters") st.sidebar.caption(f"{len(pairs)} results") + st.sidebar.markdown("**Keywords**") + state.must_include_mode = st.sidebar.radio( + "Must include mode", + options=["all", "any", "groups"], + format_func=lambda m: {"all": "All (AND)", "any": "Any (OR)", "groups": "Groups (CNF)"}[m], + horizontal=True, + key="include_mode", + label_visibility="collapsed", + ) + hint = { + "all": "Every term must appear", + "any": "At least one term must appear", + "groups": "Comma = AND · pipe | = OR within group", + }[state.must_include_mode] + state.must_include = st.sidebar.text_input( + "Must include", value="", placeholder="16gb, founders…" if state.must_include_mode != "groups" else "founders|fe, 16gb…", + key="must_include", + ) + st.sidebar.caption(hint) + state.must_exclude = st.sidebar.text_input( + "Must exclude", value="", placeholder="broken, parts…", key="must_exclude", + ) + state.min_trust_score = st.sidebar.slider("Min trust score", 0, 100, 0, key="min_trust") st.sidebar.caption( f"🟢 Safe (80+): {opts.score_bands['safe']} " diff --git a/compose.cloud.yml b/compose.cloud.yml index c7aae9a..07ec02a 100644 --- a/compose.cloud.yml +++ b/compose.cloud.yml @@ -11,6 +11,7 @@ services: context: .. 
dockerfile: snipe/Dockerfile restart: unless-stopped + env_file: .env # No network_mode: host — isolated on snipe-cloud-net; nginx reaches it via 'api:8510' volumes: - /devl/snipe-cloud-data:/app/snipe/data diff --git a/pyproject.toml b/pyproject.toml index 8c6436b..da08de8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "uvicorn[standard]>=0.29", "playwright>=1.44", "playwright-stealth>=1.0", + "cryptography>=42.0", ] [tool.setuptools.packages.find] diff --git a/web/src/components/ListingCard.vue b/web/src/components/ListingCard.vue index 91f3369..b96b8cd 100644 --- a/web/src/components/ListingCard.vue +++ b/web/src/components/ListingCard.vue @@ -61,8 +61,8 @@ {{ flagLabel(flag) }} -

- ⚠ Partial score — some data unavailable +

+ ↻ Updating: {{ pendingSignalNames.join(', ') }}

⚠ Could not score this listing @@ -72,9 +72,32 @@

-
+
{{ trust?.composite_score ?? '?' }} Trust + +
@@ -99,6 +122,7 @@ diff --git a/web/src/views/SearchView.vue b/web/src/views/SearchView.vue index 9ca2b34..eea5660 100644 --- a/web/src/views/SearchView.vue +++ b/web/src/views/SearchView.vue @@ -3,6 +3,22 @@
+
+ + + + Saved! + {{ saveError }}
@@ -163,12 +271,14 @@ · {{ hiddenCount }} hidden by filters

- - +
+ + +
@@ -189,15 +299,56 @@