**Scammer blocklist** - migration 006: scammer_blocklist table (platform + seller_id unique key, source: manual|csv_import|community) - ScammerEntry dataclass + Store.add/remove/list_blocklist methods - blocklist.ts Pinia store — CRUD, export CSV, import CSV with validation - BlocklistView.vue — list with search, export/import, bulk-remove; sellers show on ListingCard with force-score-0 badge - API: GET/POST/DELETE /api/blocklist + CSV export/import endpoints - Router: /blocklist route added; AppNav link **Migration renumber** - 002_background_tasks.sql → 007_background_tasks.sql (correct sequence after blocklist; idempotent CREATE IF NOT EXISTS safe for existing DBs) **Search + listing UI overhaul** - SearchView.vue: keyword expansion preview, filter chips for condition/ format/price, saved-search quick-run button, paginated results - ListingCard.vue: trust tier badge, scammer flag overlay, photo count chip, quick-block button, save-to-search action - savedSearches store: optimistic update on run, last-run timestamp **Tier refactor** - tiers.py: full rewrite with docstring ladder, BYOK LOCAL_VISION_UNLOCKABLE flag, intentionally-free list with rationale (scammer_db, saved_searches, market_comps free to maximise adoption) **Trust aggregator + scraper** - aggregator.py: blocklist check short-circuits scoring to 0/BAD_ACTOR - scraper.py: listing format detection, photo count, improved title parsing **Theme** - theme.css: trust tier color tokens, badge variants, blocklist badge
571 lines
23 KiB
Python
571 lines
23 KiB
Python
"""Scraper-based eBay adapter — free tier, no API key required.
|
||
|
||
Data available from search results HTML (single page load):
|
||
✅ title, price, condition, photos, URL
|
||
✅ seller username, feedback count, feedback ratio
|
||
❌ account registration date → enriched async via BTF /itm/ scrape
|
||
❌ category history → enriched async via _ssn seller search page
|
||
|
||
This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import itertools
|
||
import json
|
||
import logging
|
||
import re
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Optional
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
from bs4 import BeautifulSoup
|
||
|
||
from app.db.models import Listing, MarketComp, Seller
|
||
from app.db.store import Store
|
||
from app.platforms import PlatformAdapter, SearchFilters
|
||
|
||
# eBay endpoints this adapter scrapes.
EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html"
EBAY_ITEM_URL = "https://www.ebay.com/itm/"
_HTML_CACHE_TTL = 300  # seconds — 5 minutes

# Matches "Joined {Month} {Year}" on /itm/ BTF seller cards.
# Groups: 1=month name (abbrev prefix matched, \w* absorbs the rest), 2=4-digit year.
_JOINED_RE = re.compile(r"Joined\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+(\d{4})", re.I)
# Matches "username (1,234) 99.1% positive feedback" on /itm/ listing pages.
# Capture groups: 1=raw_count ("1,234"), 2=ratio_pct ("99.1").
_ITEM_FEEDBACK_RE = re.compile(r'\((\d[\d,]*)\)\s*([\d.]+)%\s*positive', re.I)
# "Jan" -> 1 … "Dec" -> 12; used by _parse_joined_date.
_MONTH_MAP = {m: i+1 for i, m in enumerate(
    ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
)}

# Module-level cache persists across per-request adapter instantiations.
# Keyed by URL; value is (html, expiry_timestamp).
_html_cache: dict[str, tuple[str, float]] = {}

# Cycle through display numbers :200–:299 so concurrent/sequential Playwright
# calls don't collide on the Xvfb lock file from the previous run.
_display_counter = itertools.cycle(range(200, 300))

# Browser-like request headers; the User-Agent is reused for the Playwright context.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Search-card seller text: "username (1,234) 99.1%..." — groups: name, count, pct.
_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%")
# Search-card feedback span: "99.1% positive (1,234)" — groups: pct, count.
_FEEDBACK_RE = re.compile(r"([\d.]+)%\s+positive\s+\(([0-9,]+)\)", re.I)
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
_ITEM_ID_RE = re.compile(r"/itm/(\d+)")
_TIME_LEFT_RE = re.compile(r"(?:(\d+)d\s*)?(?:(\d+)h\s*)?(?:(\d+)m\s*)?(?:(\d+)s\s*)?left", re.I)
# Sidebar refinement counts like "(45)"; capped at 6 digits to avoid matching years etc.
_PARENS_COUNT_RE = re.compile(r"\((\d{1,6})\)")

# Maps title-keyword fragments → internal MetadataScorer category keys.
# Checked in order — first match wins. Broader terms intentionally listed last.
_CATEGORY_KEYWORDS: list[tuple[frozenset[str], str]] = [
    (frozenset(["cell phone", "smartphone", "mobile phone"]), "CELL_PHONES"),
    (frozenset(["video game", "gaming", "console", "playstation", "xbox", "nintendo"]), "VIDEO_GAMES"),
    (frozenset(["computer", "tablet", "laptop", "notebook", "chromebook"]), "COMPUTERS_TABLETS"),
    (frozenset(["electronic"]), "ELECTRONICS"),
]
|
||
|
||
|
||
def _classify_category_label(text: str) -> Optional[str]:
    """Return the internal MetadataScorer key for an eBay category label, or None.

    Scans _CATEGORY_KEYWORDS in declaration order; the first entry whose
    keyword set has a fragment contained in the lower-cased label wins.
    """
    haystack = text.lower()
    return next(
        (
            category
            for keyword_set, category in _CATEGORY_KEYWORDS
            if any(fragment in haystack for fragment in keyword_set)
        ),
        None,
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Pure HTML parsing functions (unit-testable, no HTTP)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _parse_price(text: str) -> float:
|
||
"""Extract first numeric value from price text.
|
||
|
||
Handles '$950.00', '$900.00 to $1,050.00', '$1,234.56/ea'.
|
||
Takes the lower bound for price ranges (conservative for trust scoring).
|
||
"""
|
||
m = _PRICE_RE.search(text.replace(",", ""))
|
||
return float(m.group()) if m else 0.0
|
||
|
||
|
||
def _parse_seller(text: str) -> tuple[str, int, float]:
|
||
"""Parse eBay seller-info text into (username, feedback_count, feedback_ratio).
|
||
|
||
Input format: 'tech_seller (1,234) 99.1% positive feedback'
|
||
Returns ('tech_seller', 1234, 0.991).
|
||
Falls back gracefully if the format doesn't match.
|
||
"""
|
||
text = text.strip()
|
||
m = _SELLER_RE.match(text)
|
||
if not m:
|
||
return (text.split()[0] if text else ""), 0, 0.0
|
||
return m.group(1).strip(), int(m.group(2).replace(",", "")), float(m.group(3)) / 100.0
|
||
|
||
|
||
def _parse_time_left(text: str) -> Optional[timedelta]:
|
||
"""Parse eBay time-left text into a timedelta.
|
||
|
||
Handles '3d 14h left', '14h 23m left', '23m 45s left'.
|
||
Returns None if text doesn't match (i.e. fixed-price listing).
|
||
"""
|
||
if not text:
|
||
return None
|
||
m = _TIME_LEFT_RE.search(text)
|
||
if not m or not any(m.groups()):
|
||
return None
|
||
days = int(m.group(1) or 0)
|
||
hours = int(m.group(2) or 0)
|
||
minutes = int(m.group(3) or 0)
|
||
seconds = int(m.group(4) or 0)
|
||
if days == hours == minutes == seconds == 0:
|
||
return None
|
||
return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
|
||
|
||
|
||
def _extract_seller_from_card(card) -> tuple[str, int, float]:
    """Extract (username, feedback_count, feedback_ratio) from an s-card element.

    The new eBay layout renders the seller username and the feedback summary as
    separate su-styled-text spans. We locate the feedback span via _FEEDBACK_RE,
    then take the text of the span immediately before it as the username.
    Returns ("", 0, 0.0) when no feedback span is found.
    """
    span_texts = []
    for span in card.select("span.su-styled-text"):
        content = span.get_text(strip=True)
        if content:
            span_texts.append(content)

    for idx, content in enumerate(span_texts):
        feedback = _FEEDBACK_RE.search(content)
        if feedback is None:
            continue
        ratio = float(feedback.group(1)) / 100.0
        count = int(feedback.group(2).replace(",", ""))
        # Username is the span just before the feedback span (absent if first).
        username = span_texts[idx - 1].strip() if idx > 0 else ""
        return username, count, ratio

    return "", 0, 0.0
|
||
|
||
|
||
def scrape_listings(html: str) -> list[Listing]:
    """Parse eBay search results HTML into Listing objects.

    One Listing per li.s-card element that carries a data-listingid and a real
    title. Promo/placeholder cards are skipped. Missing fields degrade to
    neutral defaults (price 0.0, empty condition/url/photos) rather than
    raising, so one malformed card never kills the whole page parse.
    """
    soup = BeautifulSoup(html, "lxml")
    results: list[Listing] = []

    for item in soup.select("li.s-card"):
        # Skip promos: no data-listingid or title is "Shop on eBay"
        platform_listing_id = item.get("data-listingid", "")
        if not platform_listing_id:
            continue

        title_el = item.select_one("div.s-card__title")
        if not title_el or "Shop on eBay" in title_el.get_text():
            continue

        # Drop the query string — tracking params only; keeps URLs dedupe-friendly.
        link_el = item.select_one('a.s-card__link[href*="/itm/"]')
        url = link_el["href"].split("?")[0] if link_el else ""

        price_el = item.select_one("span.s-card__price")
        price = _parse_price(price_el.get_text()) if price_el else 0.0

        # Subtitle looks like "Pre-Owned · ..." — keep only the condition segment.
        condition_el = item.select_one("div.s-card__subtitle")
        condition = condition_el.get_text(strip=True).split("·")[0].strip().lower() if condition_el else ""

        seller_username, _, _ = _extract_seller_from_card(item)

        # Lazy-loaded images keep the real URL in data-src instead of src.
        img_el = item.select_one("img.s-card__image")
        photo_url = img_el.get("src") or img_el.get("data-src") or "" if img_el else ""

        # Auction detection via time-left text patterns in card spans
        time_remaining = None
        for span in item.select("span.su-styled-text"):
            t = span.get_text(strip=True)
            td = _parse_time_left(t)
            if td:
                time_remaining = td
                break
        buying_format = "auction" if time_remaining is not None else "fixed_price"
        ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None

        # Strip eBay's screen-reader accessibility text injected into title links.
        # get_text() is CSS-blind and picks up visually-hidden spans.
        raw_title = title_el.get_text(separator=" ", strip=True)
        title = re.sub(r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE).strip()

        results.append(Listing(
            platform="ebay",
            platform_listing_id=platform_listing_id,
            title=title,
            price=price,
            currency="USD",
            condition=condition,
            seller_platform_id=seller_username,
            url=url,
            photo_urls=[photo_url] if photo_url else [],
            listing_age_days=0,  # not derivable from search HTML at this tier
            buying_format=buying_format,
            ends_at=ends_at,
        ))

    return results
|
||
|
||
|
||
def scrape_sellers(html: str) -> dict[str, Seller]:
    """Extract Seller objects from search results HTML.

    Returns a dict keyed by username; the first card seen for a username wins.
    account_age_days and category_history_json are left empty — they require a
    separate seller profile page fetch, which would mean one extra HTTP request
    per seller. That data gap is what separates free (scraper) from paid (API)
    tier.
    """
    sellers: dict[str, Seller] = {}

    for card in BeautifulSoup(html, "lxml").select("li.s-card"):
        # Cards without a listing id are promos/placeholders — no seller data.
        if not card.get("data-listingid"):
            continue
        username, fb_count, fb_ratio = _extract_seller_from_card(card)
        if not username or username in sellers:
            continue
        sellers[username] = Seller(
            platform="ebay",
            platform_seller_id=username,
            username=username,
            account_age_days=None,  # not fetched at scraper tier
            feedback_count=fb_count,
            feedback_ratio=fb_ratio,
            category_history_json="{}",  # not available from search HTML
        )

    return sellers
|
||
|
||
|
||
def scrape_seller_categories(html: str) -> dict[str, int]:
    """Parse category distribution from a seller's _ssn search page.

    eBay renders category refinements in the left sidebar. We scan all
    anchor-text blocks for recognisable category labels and accumulate
    listing counts from the adjacent parenthetical "(N)" strings; a label
    without a count contributes 1.

    Returns a dict like {"ELECTRONICS": 45, "CELL_PHONES": 23}.
    Empty dict = no recognisable categories found (score stays None).
    """
    soup = BeautifulSoup(html, "lxml")
    totals: dict[str, int] = {}

    # eBay sidebar refinement links contain the category label and a count.
    # Multiple layout variants exist — scan broadly and classify by keyword.
    for anchor in soup.select("a[href*='_sacat='], li.x-refine__main__list--value a"):
        label = anchor.get_text(separator=" ", strip=True)
        category = _classify_category_label(label)
        if category is None:
            continue
        parens = _PARENS_COUNT_RE.search(label)
        increment = int(parens.group(1)) if parens else 1
        totals[category] = totals.get(category, 0) + increment

    return totals
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Adapter
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class ScrapedEbayAdapter(PlatformAdapter):
    """
    Scraper-based eBay adapter implementing PlatformAdapter with no API key.

    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True.

    Fetching goes through a single Playwright-based path (_fetch_url) with a
    module-level HTML cache; per-seller enrichment runs in small thread pools
    and never propagates failures into the main search response.
    """

    def __init__(self, shared_store: Store, delay: float = 1.0):
        # delay: politeness sleep (seconds) applied before every uncached fetch.
        self._store = shared_store
        self._delay = delay

    def _fetch_url(self, url: str) -> str:
        """Core Playwright fetch — stealthed headed Chromium via Xvfb.

        Shared by both search (_get) and BTF item-page enrichment (_fetch_item_html).
        Results cached for _HTML_CACHE_TTL seconds.

        Each call spawns its own Xvfb on the next display number from
        _display_counter and always terminates it in the finally block, even
        when Playwright raises (in which case the exception propagates and
        nothing is cached).
        """
        # Serve from the module-level cache while the entry is still fresh.
        cached = _html_cache.get(url)
        if cached and time.time() < cached[1]:
            return cached[0]

        # Politeness delay before hitting eBay (only on cache misses).
        time.sleep(self._delay)

        import subprocess, os
        display_num = next(_display_counter)
        display = f":{display_num}"
        xvfb = subprocess.Popen(
            ["Xvfb", display, "-screen", "0", "1280x800x24"],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        env = os.environ.copy()
        env["DISPLAY"] = display

        try:
            from playwright.sync_api import sync_playwright  # noqa: PLC0415 — lazy: only needed in Docker
            from playwright_stealth import Stealth  # noqa: PLC0415

            with sync_playwright() as pw:
                # Headed (headless=False) on the Xvfb display — headless mode
                # is what the bot checks key on.
                browser = pw.chromium.launch(
                    headless=False,
                    env=env,
                    args=["--no-sandbox", "--disable-dev-shm-usage"],
                )
                ctx = browser.new_context(
                    user_agent=_HEADERS["User-Agent"],
                    viewport={"width": 1280, "height": 800},
                )
                page = ctx.new_page()
                Stealth().apply_stealth_sync(page)
                page.goto(url, wait_until="domcontentloaded", timeout=30_000)
                page.wait_for_timeout(2000)  # let any JS challenges resolve
                html = page.content()
                browser.close()
        finally:
            # Always reap the Xvfb process, success or failure.
            xvfb.terminate()
            xvfb.wait()

        _html_cache[url] = (html, time.time() + _HTML_CACHE_TTL)
        return html

    def _get(self, params: dict) -> str:
        """Fetch eBay search results HTML. params → query string appended to EBAY_SEARCH_URL.

        NOTE(review): values are joined verbatim without URL-encoding; this
        appears to rely on the browser normalizing spaces/quotes in _nkw —
        confirm, or route through urllib.parse.urlencode.
        """
        url = EBAY_SEARCH_URL + "?" + "&".join(f"{k}={v}" for k, v in params.items())
        return self._fetch_url(url)

    def _fetch_item_html(self, item_id: str) -> str:
        """Fetch a single eBay listing page. /itm/ pages pass Kasada; /usr/ pages do not.

        Browse API returns itemId as "v1|123456789012|0"; extract the numeric
        segment so the URL resolves correctly (scraper IDs are already numeric).
        """
        if "|" in item_id:
            item_id = item_id.split("|")[1]
        return self._fetch_url(f"{EBAY_ITEM_URL}{item_id}")

    @staticmethod
    def _parse_joined_date(html: str) -> Optional[int]:
        """Parse 'Joined {Mon} {Year}' from a listing page BTF seller card.

        Returns account_age_days (int) or None if the date is not found.
        eBay renders this as a span.ux-textspans inside the seller section.
        Day-of-month is not shown, so the 1st is assumed (≤30 days error).
        """
        m = _JOINED_RE.search(html)
        if not m:
            return None
        # Normalize to the 3-letter key used by _MONTH_MAP (regex may match "March" etc.).
        month_str, year_str = m.group(1)[:3].capitalize(), m.group(2)
        month = _MONTH_MAP.get(month_str)
        if not month:
            return None
        try:
            reg_date = datetime(int(year_str), month, 1, tzinfo=timezone.utc)
            return (datetime.now(timezone.utc) - reg_date).days
        except ValueError:
            return None

    @staticmethod
    def _parse_feedback_from_item(html: str) -> tuple[Optional[int], Optional[float]]:
        """Parse feedback count and ratio from a listing page seller card.

        Matches 'username (1,234) 99.1% positive feedback'.
        Returns (count, ratio) or (None, None) if not found; ratio is a
        0.0–1.0 fraction.
        """
        m = _ITEM_FEEDBACK_RE.search(html)
        if not m:
            return None, None
        try:
            count = int(m.group(1).replace(",", ""))
            ratio = float(m.group(2)) / 100.0
            return count, ratio
        except ValueError:
            return None, None

    def enrich_sellers_btf(
        self,
        seller_to_listing: dict[str, str],
        max_workers: int = 2,
    ) -> None:
        """Background BTF enrichment — scrape /itm/ pages to fill in account_age_days.

        seller_to_listing: {seller_platform_id -> platform_listing_id}
        Only pass sellers whose account_age_days is None (unknown from API batch).
        Caller limits the dict to new/stale sellers to avoid redundant scrapes.

        Runs Playwright fetches in a thread pool (max_workers=2 by default to
        avoid hammering Kasada). Updates seller records in the DB in-place.
        Does not raise — failures per-seller are silently skipped so the main
        search response is never blocked.
        """
        # SQLite connections aren't shared across threads — each worker opens
        # its own Store against the captured path.
        db_path = self._store._db_path  # capture for thread-local Store creation

        def _enrich_one(item: tuple[str, str]) -> None:
            seller_id, listing_id = item
            try:
                html = self._fetch_item_html(listing_id)
                age_days = self._parse_joined_date(html)
                fb_count, fb_ratio = self._parse_feedback_from_item(html)
                log.debug(
                    "BTF enrich: seller=%s age_days=%s feedback=%s ratio=%s",
                    seller_id, age_days, fb_count, fb_ratio,
                )
                if age_days is None and fb_count is None:
                    return  # nothing new to write
                thread_store = Store(db_path)
                seller = thread_store.get_seller("ebay", seller_id)
                if not seller:
                    log.warning("BTF enrich: seller %s not found in DB", seller_id)
                    return
                from dataclasses import replace
                # Build a partial-update dict so a missing signal never
                # overwrites a value already in the DB.
                updates: dict = {}
                if age_days is not None:
                    updates["account_age_days"] = age_days
                # Only overwrite feedback if the listing page found a real value —
                # prefer a fresh count over a 0 that came from a failed search parse.
                if fb_count is not None:
                    updates["feedback_count"] = fb_count
                if fb_ratio is not None:
                    updates["feedback_ratio"] = fb_ratio
                thread_store.save_seller(replace(seller, **updates))
            except Exception as exc:
                # Best-effort by design: log and move on to the next seller.
                log.warning("BTF enrich failed for %s/%s: %s", seller_id, listing_id, exc)

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            # list() drains the iterator so all workers finish before return.
            list(ex.map(_enrich_one, seller_to_listing.items()))

    def enrich_sellers_categories(
        self,
        seller_platform_ids: list[str],
        max_workers: int = 2,
    ) -> None:
        """Scrape _ssn seller pages to populate category_history_json.

        Uses the same headed Playwright stack as search() — the _ssn=USERNAME
        filter is just a query param on the standard search template, so it
        passes Kasada identically. Silently skips on failure so the main
        search response is never affected.
        """
        def _enrich_one(seller_id: str) -> None:
            try:
                # _sop=12 / _ipg=48: sort + page size for the seller's listings page.
                html = self._get({"_ssn": seller_id, "_sop": "12", "_ipg": "48"})
                categories = scrape_seller_categories(html)
                if categories:
                    seller = self._store.get_seller("ebay", seller_id)
                    if seller:
                        from dataclasses import replace
                        updated = replace(seller, category_history_json=json.dumps(categories))
                        self._store.save_seller(updated)
            except Exception:
                # Deliberate best-effort: enrichment must never surface errors.
                pass

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            list(ex.map(_enrich_one, seller_platform_ids))

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Run an eBay search and return de-duplicated Listings.

        Builds eBay query params from *filters* (category, price bounds,
        condition codes, negative keywords), fetches up to filters.pages
        result pages concurrently, and persists every seller seen so
        get_seller() can answer without extra fetches.
        """
        # _sop=15: sort order; _ipg=48: results per page.
        base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
        if filters.category_id:
            base_params["_sacat"] = filters.category_id

        if filters.max_price:
            base_params["_udhi"] = str(filters.max_price)
        if filters.min_price:
            base_params["_udlo"] = str(filters.min_price)
        if filters.condition:
            # Map friendly condition names to eBay's numeric condition codes;
            # unknown names are silently dropped.
            cond_map = {
                "new": "1000", "used": "3000",
                "open box": "2500", "for parts": "7000",
            }
            codes = [cond_map[c] for c in filters.condition if c in cond_map]
            if codes:
                base_params["LH_ItemCondition"] = "|".join(codes)

        # Append negative keywords to the eBay query — eBay supports "-term" in _nkw natively.
        # Multi-word phrases must be quoted: -"parts only" not -parts only (which splits the words).
        if filters.must_exclude:
            parts = []
            for t in filters.must_exclude:
                t = t.strip()
                if not t:
                    continue
                parts.append(f'-"{t}"' if " " in t else f"-{t}")
            base_params["_nkw"] = f"{base_params['_nkw']} {' '.join(parts)}"

        pages = max(1, filters.pages)
        page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]

        # Fetch result pages concurrently, capped at 3 to stay polite.
        with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
            htmls = list(ex.map(self._get, page_params))

        # De-duplicate listings across pages by platform_listing_id.
        seen_ids: set[str] = set()
        listings: list[Listing] = []
        sellers: dict[str, "Seller"] = {}
        for html in htmls:
            for listing in scrape_listings(html):
                if listing.platform_listing_id not in seen_ids:
                    seen_ids.add(listing.platform_listing_id)
                    listings.append(listing)
            sellers.update(scrape_sellers(html))

        self._store.save_sellers(list(sellers.values()))
        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        # Sellers are pre-populated during search(); no extra fetch needed
        return self._store.get_seller("ebay", seller_platform_id)

    def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
        """Scrape sold/completed listings for *query* and store a MarketComp.

        Skips the scrape entirely when a comp for the query's hash is already
        cached in the store. Saves the median of all positive prices with a
        6-hour expiry. Returns the parsed sold listings, or [] on cache hit
        or any failure (comps are best-effort).
        """
        # md5 here is a cache key, not a security boundary.
        query_hash = hashlib.md5(query.encode()).hexdigest()
        if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored

        base_params = {
            "_nkw": query,
            "LH_Sold": "1",
            "LH_Complete": "1",
            "_sop": "13",  # sort by price+shipping, lowest first
            "_ipg": "48",
        }
        pages = max(1, pages)
        page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]

        log.info("comps scrape: fetching %d page(s) of sold listings for %r", pages, query)
        try:
            with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
                htmls = list(ex.map(self._get, page_params))

            # De-duplicate across pages before computing the median.
            seen_ids: set[str] = set()
            all_listings: list[Listing] = []
            for html in htmls:
                for listing in scrape_listings(html):
                    if listing.platform_listing_id not in seen_ids:
                        seen_ids.add(listing.platform_listing_id)
                        all_listings.append(listing)

            # Zero prices are parse failures, not free items — exclude them.
            prices = sorted(l.price for l in all_listings if l.price > 0)
            if prices:
                mid = len(prices) // 2
                median = (prices[mid - 1] + prices[mid]) / 2 if len(prices) % 2 == 0 else prices[mid]
                self._store.save_market_comp(MarketComp(
                    platform="ebay",
                    query_hash=query_hash,
                    median_price=median,
                    sample_count=len(prices),
                    expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
                ))
                log.info("comps scrape: saved market comp median=$%.2f from %d prices", median, len(prices))
            else:
                log.warning("comps scrape: %d listings parsed but 0 valid prices — no comp saved", len(all_listings))
            return all_listings
        except Exception:
            log.warning("comps scrape: failed for %r", query, exc_info=True)
            return []
|