diff --git a/api/main.py b/api/main.py
index c7f738b..83781f6 100644
--- a/api/main.py
+++ b/api/main.py
@@ -5,6 +5,7 @@ import dataclasses
 import hashlib
 import logging
 import os
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
 from fastapi import FastAPI, HTTPException
@@ -38,33 +39,44 @@ def health():
 
 
 @app.get("/api/search")
-def search(q: str = "", max_price: float = 0, min_price: float = 0):
+def search(q: str = "", max_price: float = 0, min_price: float = 0, pages: int = 1):
     if not q.strip():
         return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None}
 
-    store = Store(_DB_PATH)
-    adapter = ScrapedEbayAdapter(store)
-
     filters = SearchFilters(
         max_price=max_price if max_price > 0 else None,
         min_price=min_price if min_price > 0 else None,
+        pages=max(1, pages),
     )
 
+    # Each adapter gets its own Store (SQLite connection) — required for thread safety.
+    # search() and get_completed_sales() run concurrently; they write to different tables
+    # so SQLite file-level locking is the only contention point.
+    search_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
+    comps_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
+
     try:
-        listings = adapter.search(q, filters)
-        adapter.get_completed_sales(q)  # warm market comp cache
+        with ThreadPoolExecutor(max_workers=2) as ex:
+            listings_future = ex.submit(search_adapter.search, q, filters)
+            comps_future = ex.submit(comps_adapter.get_completed_sales, q, pages)
+            listings = listings_future.result()
+            comps_future.result()  # wait; side-effect is saving market comp to DB
     except Exception as e:
         log.warning("eBay scrape failed: %s", e)
         raise HTTPException(status_code=502, detail=f"eBay search failed: {e}")
 
+    # Use search_adapter's store for post-processing — it has the sellers already written
+    store = search_adapter._store
     store.save_listings(listings)
 
     scorer = TrustScorer(store)
     trust_scores_list = scorer.score_batch(listings, q)
 
-    # Market comp
+    # Market comp written by comps_adapter — read from a fresh connection to avoid
+    # cross-thread connection reuse
+    comp_store = Store(_DB_PATH)
     query_hash = hashlib.md5(q.encode()).hexdigest()
-    comp = store.get_market_comp("ebay", query_hash)
+    comp = comp_store.get_market_comp("ebay", query_hash)
     market_price = comp.median_price if comp else None
 
     # Serialize — keyed by platform_listing_id for easy Vue lookup
diff --git a/app/platforms/__init__.py b/app/platforms/__init__.py
index da6c94c..f9162ee 100644
--- a/app/platforms/__init__.py
+++ b/app/platforms/__init__.py
@@ -12,6 +12,7 @@ class SearchFilters:
     min_price: Optional[float] = None
     condition: Optional[list[str]] = field(default_factory=list)
     location_radius_km: Optional[int] = None
+    pages: int = 1  # number of result pages to fetch (48 listings/page)
 
 
 class PlatformAdapter(ABC):
diff --git a/app/platforms/ebay/scraper.py b/app/platforms/ebay/scraper.py
index 64ecd5d..50e33ed 100644
--- a/app/platforms/ebay/scraper.py
+++ b/app/platforms/ebay/scraper.py
@@ -14,12 +14,11 @@ import hashlib
 import itertools
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
 from typing import Optional
 
 from bs4 import BeautifulSoup
-from playwright.sync_api import sync_playwright
-from playwright_stealth import Stealth
 
 from app.db.models import Listing, MarketComp, Seller
 from app.db.store import Store
@@ -164,10 +163,15 @@ def scrape_listings(html: str) -> list[Listing]:
 
         buying_format = "auction" if time_remaining is not None else "fixed_price"
         ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None
+        # Strip eBay's screen-reader accessibility text injected into title links.
+        # get_text() is CSS-blind and picks up visually-hidden spans.
+        raw_title = title_el.get_text(separator=" ", strip=True)
+        title = re.sub(r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE).strip()
+
         results.append(Listing(
             platform="ebay",
             platform_listing_id=platform_listing_id,
-            title=title_el.get_text(strip=True),
+            title=title,
             price=price,
             currency="USD",
             condition=condition,
@@ -256,6 +260,9 @@
             env["DISPLAY"] = display
 
         try:
+            from playwright.sync_api import sync_playwright  # noqa: PLC0415 — lazy: only needed in Docker
+            from playwright_stealth import Stealth  # noqa: PLC0415
+
             with sync_playwright() as pw:
                 browser = pw.chromium.launch(
                     headless=False,
@@ -280,12 +287,12 @@
         return html
 
     def search(self, query: str, filters: SearchFilters) -> list[Listing]:
-        params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
+        base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
         if filters.max_price:
-            params["_udhi"] = str(filters.max_price)
+            base_params["_udhi"] = str(filters.max_price)
         if filters.min_price:
-            params["_udlo"] = str(filters.min_price)
+            base_params["_udlo"] = str(filters.min_price)
         if filters.condition:
             cond_map = {
                 "new": "1000",
                 "used": "3000",
@@ -293,38 +300,62 @@ class ScrapedEbayAdapter(PlatformAdapter):
             }
             codes = [cond_map[c] for c in filters.condition if c in cond_map]
             if codes:
-                params["LH_ItemCondition"] = "|".join(codes)
+                base_params["LH_ItemCondition"] = "|".join(codes)
 
-        html = self._get(params)
-        listings = scrape_listings(html)
+        pages = max(1, filters.pages)
+        page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
 
-        # Cache seller objects extracted from the same page
-        self._store.save_sellers(list(scrape_sellers(html).values()))
+        with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
+            htmls = list(ex.map(self._get, page_params))
+        seen_ids: set[str] = set()
+        listings: list[Listing] = []
+        sellers: dict[str, "Seller"] = {}
+        for html in htmls:
+            for listing in scrape_listings(html):
+                if listing.platform_listing_id not in seen_ids:
+                    seen_ids.add(listing.platform_listing_id)
+                    listings.append(listing)
+            sellers.update(scrape_sellers(html))
+
+        self._store.save_sellers(list(sellers.values()))
 
         return listings
 
     def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
         # Sellers are pre-populated during search(); no extra fetch needed
         return self._store.get_seller("ebay", seller_platform_id)
 
-    def get_completed_sales(self, query: str) -> list[Listing]:
+    def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
         query_hash = hashlib.md5(query.encode()).hexdigest()
         if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored
 
-        params = {
+        base_params = {
             "_nkw": query,
             "LH_Sold": "1",
             "LH_Complete": "1",
-            "_sop": "13",  # price + shipping: lowest first
+            "_sop": "13",  # sort by price+shipping, lowest first
             "_ipg": "48",
         }
+        pages = max(1, pages)
+        page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
+
         try:
-            html = self._get(params)
-            listings = scrape_listings(html)
-            prices = sorted(l.price for l in listings if l.price > 0)
+            with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
+                htmls = list(ex.map(self._get, page_params))
+
+            seen_ids: set[str] = set()
+            all_listings: list[Listing] = []
+            for html in htmls:
+                for listing in scrape_listings(html):
+                    if listing.platform_listing_id not in seen_ids:
+                        seen_ids.add(listing.platform_listing_id)
+                        all_listings.append(listing)
+
+            prices = sorted(l.price for l in all_listings if l.price > 0)
             if prices:
-                median = prices[len(prices) // 2]
+                mid = len(prices) // 2
+                median = (prices[mid - 1] + prices[mid]) / 2 if len(prices) % 2 == 0 else prices[mid]
                 self._store.save_market_comp(MarketComp(
                     platform="ebay",
                     query_hash=query_hash,
@@ -332,6 +363,6 @@ class ScrapedEbayAdapter(PlatformAdapter):
                     sample_count=len(prices),
                     expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
                 ))
-            return listings
+            return all_listings
         except Exception:
             return []
diff --git a/app/trust/aggregator.py b/app/trust/aggregator.py
index 83d7d5d..27f51bc 100644
--- a/app/trust/aggregator.py
+++ b/app/trust/aggregator.py
@@ -37,7 +37,7 @@ class Aggregator:
             red_flags.append("account_under_30_days")
         if seller and seller.feedback_count < 10:
             red_flags.append("low_feedback_count")
-        if clean["price_vs_market"] == 0:
+        if signal_scores.get("price_vs_market") == 0:  # only flag when data exists and price is genuinely <50% of market
             red_flags.append("suspicious_price")
         if photo_hash_duplicate:
             red_flags.append("duplicate_photo")
diff --git a/tests/db/test_store.py b/tests/db/test_store.py
index d6ca099..26e60ac 100644
--- a/tests/db/test_store.py
+++ b/tests/db/test_store.py
@@ -1,4 +1,5 @@
 import pytest
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from app.db.store import Store
 from app.db.models import Listing, Seller, TrustScore, MarketComp
@@ -57,7 +58,7 @@ def test_save_and_get_market_comp(store):
         query_hash="abc123",
         median_price=1050.0,
         sample_count=12,
-        expires_at="2026-03-26T00:00:00",
+        expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
     )
     store.save_market_comp(comp)
     result = store.get_market_comp("ebay", "abc123")
diff --git a/tests/platforms/test_ebay_scraper.py b/tests/platforms/test_ebay_scraper.py
index 8e9a6e5..a4c8519 100644
--- a/tests/platforms/test_ebay_scraper.py
+++ b/tests/platforms/test_ebay_scraper.py
@@ -27,8 +27,9 @@ _EBAY_HTML = """
+