feat(snipe): parallel search+comps, pagination, title fix, price flag fix

- Parallel execution: search() and get_completed_sales() now run
  concurrently via ThreadPoolExecutor — each gets its own Store/SQLite
  connection for thread safety. First cold search time ~halved.

- Pagination: SearchFilters.pages (default 1) controls how many eBay
  result pages are fetched. Both search and sold-comps support up to 3
  parallel Playwright sessions per call (capped to avoid Xvfb overload).
  UI: segmented 1/2/3/5 pages selector in filter sidebar with cost hint.

- True median: get_completed_sales() now averages the two middle values
  for even-length price lists instead of always taking the upper of the
  two middle values (the old `prices[len(prices) // 2]` indexing).

- Fix suspicious_price false positive: aggregator now checks
  signal_scores.get("price_vs_market") == 0 (pre-None-substitution)
  so listings without market data are never flagged as suspicious.

- Fix title pollution: scraper strips eBay's hidden screen-reader span
  ("Opens in a new window or tab") from listing titles via regex.
  Lazy-imports playwright/playwright_stealth inside _get() so pure
  parsing functions are importable without the full browser stack.

- Tests: 48 pass on host (scraper tests now runnable without Docker),
  new regression guards for all three bug fixes.
This commit is contained in:
pyr0ball 2026-03-25 22:16:08 -07:00
parent 2ab41219f8
commit ea78b9c2cd
9 changed files with 184 additions and 31 deletions

View file

@ -5,6 +5,7 @@ import dataclasses
import hashlib import hashlib
import logging import logging
import os import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
@ -38,33 +39,44 @@ def health():
@app.get("/api/search") @app.get("/api/search")
def search(q: str = "", max_price: float = 0, min_price: float = 0): def search(q: str = "", max_price: float = 0, min_price: float = 0, pages: int = 1):
if not q.strip(): if not q.strip():
return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None} return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None}
store = Store(_DB_PATH)
adapter = ScrapedEbayAdapter(store)
filters = SearchFilters( filters = SearchFilters(
max_price=max_price if max_price > 0 else None, max_price=max_price if max_price > 0 else None,
min_price=min_price if min_price > 0 else None, min_price=min_price if min_price > 0 else None,
pages=max(1, pages),
) )
# Each adapter gets its own Store (SQLite connection) — required for thread safety.
# search() and get_completed_sales() run concurrently; they write to different tables
# so SQLite file-level locking is the only contention point.
search_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
comps_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
try: try:
listings = adapter.search(q, filters) with ThreadPoolExecutor(max_workers=2) as ex:
adapter.get_completed_sales(q) # warm market comp cache listings_future = ex.submit(search_adapter.search, q, filters)
comps_future = ex.submit(comps_adapter.get_completed_sales, q, pages)
listings = listings_future.result()
comps_future.result() # wait; side-effect is saving market comp to DB
except Exception as e: except Exception as e:
log.warning("eBay scrape failed: %s", e) log.warning("eBay scrape failed: %s", e)
raise HTTPException(status_code=502, detail=f"eBay search failed: {e}") raise HTTPException(status_code=502, detail=f"eBay search failed: {e}")
# Use search_adapter's store for post-processing — it has the sellers already written
store = search_adapter._store
store.save_listings(listings) store.save_listings(listings)
scorer = TrustScorer(store) scorer = TrustScorer(store)
trust_scores_list = scorer.score_batch(listings, q) trust_scores_list = scorer.score_batch(listings, q)
# Market comp # Market comp written by comps_adapter — read from a fresh connection to avoid
# cross-thread connection reuse
comp_store = Store(_DB_PATH)
query_hash = hashlib.md5(q.encode()).hexdigest() query_hash = hashlib.md5(q.encode()).hexdigest()
comp = store.get_market_comp("ebay", query_hash) comp = comp_store.get_market_comp("ebay", query_hash)
market_price = comp.median_price if comp else None market_price = comp.median_price if comp else None
# Serialize — keyed by platform_listing_id for easy Vue lookup # Serialize — keyed by platform_listing_id for easy Vue lookup

View file

@ -12,6 +12,7 @@ class SearchFilters:
min_price: Optional[float] = None min_price: Optional[float] = None
condition: Optional[list[str]] = field(default_factory=list) condition: Optional[list[str]] = field(default_factory=list)
location_radius_km: Optional[int] = None location_radius_km: Optional[int] = None
pages: int = 1 # number of result pages to fetch (48 listings/page)
class PlatformAdapter(ABC): class PlatformAdapter(ABC):

View file

@ -14,12 +14,11 @@ import hashlib
import itertools import itertools
import re import re
import time import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import Optional from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from playwright_stealth import Stealth
from app.db.models import Listing, MarketComp, Seller from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store from app.db.store import Store
@ -164,10 +163,15 @@ def scrape_listings(html: str) -> list[Listing]:
buying_format = "auction" if time_remaining is not None else "fixed_price" buying_format = "auction" if time_remaining is not None else "fixed_price"
ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None
# Strip eBay's screen-reader accessibility text injected into title links.
# get_text() is CSS-blind and picks up visually-hidden spans.
raw_title = title_el.get_text(separator=" ", strip=True)
title = re.sub(r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE).strip()
results.append(Listing( results.append(Listing(
platform="ebay", platform="ebay",
platform_listing_id=platform_listing_id, platform_listing_id=platform_listing_id,
title=title_el.get_text(strip=True), title=title,
price=price, price=price,
currency="USD", currency="USD",
condition=condition, condition=condition,
@ -256,6 +260,9 @@ class ScrapedEbayAdapter(PlatformAdapter):
env["DISPLAY"] = display env["DISPLAY"] = display
try: try:
from playwright.sync_api import sync_playwright # noqa: PLC0415 — lazy: only needed in Docker
from playwright_stealth import Stealth # noqa: PLC0415
with sync_playwright() as pw: with sync_playwright() as pw:
browser = pw.chromium.launch( browser = pw.chromium.launch(
headless=False, headless=False,
@ -280,12 +287,12 @@ class ScrapedEbayAdapter(PlatformAdapter):
return html return html
def search(self, query: str, filters: SearchFilters) -> list[Listing]: def search(self, query: str, filters: SearchFilters) -> list[Listing]:
params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"} base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
if filters.max_price: if filters.max_price:
params["_udhi"] = str(filters.max_price) base_params["_udhi"] = str(filters.max_price)
if filters.min_price: if filters.min_price:
params["_udlo"] = str(filters.min_price) base_params["_udlo"] = str(filters.min_price)
if filters.condition: if filters.condition:
cond_map = { cond_map = {
"new": "1000", "used": "3000", "new": "1000", "used": "3000",
@ -293,38 +300,62 @@ class ScrapedEbayAdapter(PlatformAdapter):
} }
codes = [cond_map[c] for c in filters.condition if c in cond_map] codes = [cond_map[c] for c in filters.condition if c in cond_map]
if codes: if codes:
params["LH_ItemCondition"] = "|".join(codes) base_params["LH_ItemCondition"] = "|".join(codes)
html = self._get(params) pages = max(1, filters.pages)
listings = scrape_listings(html) page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
# Cache seller objects extracted from the same page with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
self._store.save_sellers(list(scrape_sellers(html).values())) htmls = list(ex.map(self._get, page_params))
seen_ids: set[str] = set()
listings: list[Listing] = []
sellers: dict[str, "Seller"] = {}
for html in htmls:
for listing in scrape_listings(html):
if listing.platform_listing_id not in seen_ids:
seen_ids.add(listing.platform_listing_id)
listings.append(listing)
sellers.update(scrape_sellers(html))
self._store.save_sellers(list(sellers.values()))
return listings return listings
def get_seller(self, seller_platform_id: str) -> Optional[Seller]: def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
# Sellers are pre-populated during search(); no extra fetch needed # Sellers are pre-populated during search(); no extra fetch needed
return self._store.get_seller("ebay", seller_platform_id) return self._store.get_seller("ebay", seller_platform_id)
def get_completed_sales(self, query: str) -> list[Listing]: def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
query_hash = hashlib.md5(query.encode()).hexdigest() query_hash = hashlib.md5(query.encode()).hexdigest()
if self._store.get_market_comp("ebay", query_hash): if self._store.get_market_comp("ebay", query_hash):
return [] # cache hit — comp already stored return [] # cache hit — comp already stored
params = { base_params = {
"_nkw": query, "_nkw": query,
"LH_Sold": "1", "LH_Sold": "1",
"LH_Complete": "1", "LH_Complete": "1",
"_sop": "13", # price + shipping: lowest first "_sop": "13", # sort by price+shipping, lowest first
"_ipg": "48", "_ipg": "48",
} }
pages = max(1, pages)
page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
try: try:
html = self._get(params) with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
listings = scrape_listings(html) htmls = list(ex.map(self._get, page_params))
prices = sorted(l.price for l in listings if l.price > 0)
seen_ids: set[str] = set()
all_listings: list[Listing] = []
for html in htmls:
for listing in scrape_listings(html):
if listing.platform_listing_id not in seen_ids:
seen_ids.add(listing.platform_listing_id)
all_listings.append(listing)
prices = sorted(l.price for l in all_listings if l.price > 0)
if prices: if prices:
median = prices[len(prices) // 2] mid = len(prices) // 2
median = (prices[mid - 1] + prices[mid]) / 2 if len(prices) % 2 == 0 else prices[mid]
self._store.save_market_comp(MarketComp( self._store.save_market_comp(MarketComp(
platform="ebay", platform="ebay",
query_hash=query_hash, query_hash=query_hash,
@ -332,6 +363,6 @@ class ScrapedEbayAdapter(PlatformAdapter):
sample_count=len(prices), sample_count=len(prices),
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(), expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
)) ))
return listings return all_listings
except Exception: except Exception:
return [] return []

View file

@ -37,7 +37,7 @@ class Aggregator:
red_flags.append("account_under_30_days") red_flags.append("account_under_30_days")
if seller and seller.feedback_count < 10: if seller and seller.feedback_count < 10:
red_flags.append("low_feedback_count") red_flags.append("low_feedback_count")
if clean["price_vs_market"] == 0: if signal_scores.get("price_vs_market") == 0: # only flag when data exists and price is genuinely <50% of market
red_flags.append("suspicious_price") red_flags.append("suspicious_price")
if photo_hash_duplicate: if photo_hash_duplicate:
red_flags.append("duplicate_photo") red_flags.append("duplicate_photo")

View file

@ -1,4 +1,5 @@
import pytest import pytest
from datetime import datetime, timedelta, timezone
from pathlib import Path from pathlib import Path
from app.db.store import Store from app.db.store import Store
from app.db.models import Listing, Seller, TrustScore, MarketComp from app.db.models import Listing, Seller, TrustScore, MarketComp
@ -57,7 +58,7 @@ def test_save_and_get_market_comp(store):
query_hash="abc123", query_hash="abc123",
median_price=1050.0, median_price=1050.0,
sample_count=12, sample_count=12,
expires_at="2026-03-26T00:00:00", expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
) )
store.save_market_comp(comp) store.save_market_comp(comp)
result = store.get_market_comp("ebay", "abc123") result = store.get_market_comp("ebay", "abc123")

View file

@ -27,8 +27,9 @@ _EBAY_HTML = """
</li> </li>
<!-- Real listing 1: established seller, used, fixed price --> <!-- Real listing 1: established seller, used, fixed price -->
<!-- Includes eBay's hidden accessibility span to test title stripping -->
<li class="s-card" data-listingid="123456789"> <li class="s-card" data-listingid="123456789">
<div class="s-card__title">RTX 4090 Founders Edition GPU</div> <div class="s-card__title">RTX 4090 Founders Edition GPU<span class="clipped">Opens in a new window or tab</span></div>
<a class="s-card__link" href="https://www.ebay.com/itm/123456789?somequery=1"></a> <a class="s-card__link" href="https://www.ebay.com/itm/123456789?somequery=1"></a>
<span class="s-card__price">$950.00</span> <span class="s-card__price">$950.00</span>
<div class="s-card__subtitle">Used · Free shipping</div> <div class="s-card__subtitle">Used · Free shipping</div>
@ -179,6 +180,15 @@ class TestScrapeListings:
titles = [l.title for l in listings] titles = [l.title for l in listings]
assert "Shop on eBay" not in titles assert "Shop on eBay" not in titles
def test_strips_ebay_accessibility_text_from_title(self):
"""eBay injects a hidden 'Opens in a new window or tab' span into title links
for screen readers. get_text() is CSS-blind so we must strip it explicitly."""
listings = scrape_listings(_EBAY_HTML)
for listing in listings:
assert "Opens in a new window or tab" not in listing.title
# Verify the actual title content is preserved
assert listings[0].title == "RTX 4090 Founders Edition GPU"
def test_parses_three_real_listings(self): def test_parses_three_real_listings(self):
assert len(scrape_listings(_EBAY_HTML)) == 3 assert len(scrape_listings(_EBAY_HTML)) == 3

View file

@ -50,3 +50,46 @@ def test_partial_score_flagged_when_signals_missing():
} }
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None) result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert result.score_is_partial is True assert result.score_is_partial is True
def test_suspicious_price_not_flagged_when_market_data_absent():
"""None price_vs_market (no market comp) must NOT trigger suspicious_price.
Regression guard: clean[] replaces None with 0, so naive `clean[...] == 0`
would fire even when the signal is simply unavailable.
"""
agg = Aggregator()
scores = {
"account_age": 15, "feedback_count": 15,
"feedback_ratio": 20, "price_vs_market": None, # no market data
"category_history": 0,
}
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert "suspicious_price" not in result.red_flags_json
def test_suspicious_price_flagged_when_price_genuinely_low():
"""price_vs_market == 0 (explicitly, meaning >50% below median) → flag fires."""
agg = Aggregator()
scores = {
"account_age": 15, "feedback_count": 15,
"feedback_ratio": 20, "price_vs_market": 0, # price is scam-level low
"category_history": 0,
}
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert "suspicious_price" in result.red_flags_json
def test_new_account_not_flagged_when_age_absent():
"""account_age_days=None (scraper tier) must NOT trigger new_account or account_under_30_days."""
agg = Aggregator()
scores = {k: 10 for k in ["account_age", "feedback_count",
"feedback_ratio", "price_vs_market", "category_history"]}
scraper_seller = Seller(
platform="ebay", platform_seller_id="u", username="u",
account_age_days=None, # not fetched at scraper tier
feedback_count=50, feedback_ratio=0.99, category_history_json="{}",
)
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=scraper_seller)
assert "new_account" not in result.red_flags_json
assert "account_under_30_days" not in result.red_flags_json

View file

@ -60,6 +60,7 @@ export interface SearchFilters {
hideNewAccounts?: boolean hideNewAccounts?: boolean
hideSuspiciousPrice?: boolean hideSuspiciousPrice?: boolean
hideDuplicatePhotos?: boolean hideDuplicatePhotos?: boolean
pages?: number // number of eBay result pages to fetch (48 listings/page, default 1)
} }
// ── Store ──────────────────────────────────────────────────────────────────── // ── Store ────────────────────────────────────────────────────────────────────
@ -83,7 +84,8 @@ export const useSearchStore = defineStore('search', () => {
// API does not exist yet — stub returns empty results // API does not exist yet — stub returns empty results
const params = new URLSearchParams({ q }) const params = new URLSearchParams({ q })
if (filters.maxPrice != null) params.set('max_price', String(filters.maxPrice)) if (filters.maxPrice != null) params.set('max_price', String(filters.maxPrice))
if (filters.minTrustScore != null) params.set('min_trust', String(filters.minTrustScore)) if (filters.minPrice != null) params.set('min_price', String(filters.minPrice))
if (filters.pages != null && filters.pages > 1) params.set('pages', String(filters.pages))
const res = await fetch(`/api/search?${params}`) const res = await fetch(`/api/search?${params}`)
if (!res.ok) throw new Error(`Search failed: ${res.status} ${res.statusText}`) if (!res.ok) throw new Error(`Search failed: ${res.status} ${res.statusText}`)

View file

@ -41,6 +41,21 @@
<span class="filter-range-val">{{ filters.minTrustScore ?? 0 }}</span> <span class="filter-range-val">{{ filters.minTrustScore ?? 0 }}</span>
</fieldset> </fieldset>
<fieldset class="filter-group">
<legend class="filter-label">Pages to fetch</legend>
<div class="filter-pages" role="group" aria-label="Number of result pages">
<button
v-for="p in [1, 2, 3, 5]"
:key="p"
type="button"
class="filter-pages-btn"
:class="{ 'filter-pages-btn--active': filters.pages === p }"
@click="filters.pages = p"
>{{ p }}</button>
</div>
<p class="filter-pages-hint">{{ (filters.pages ?? 1) * 48 }} listings · {{ (filters.pages ?? 1) * 2 }} Playwright calls</p>
</fieldset>
<fieldset class="filter-group"> <fieldset class="filter-group">
<legend class="filter-label">Price</legend> <legend class="filter-label">Price</legend>
<div class="filter-row"> <div class="filter-row">
@ -166,6 +181,7 @@ const filters = reactive<SearchFilters>({
hideNewAccounts: false, hideNewAccounts: false,
hideSuspiciousPrice: false, hideSuspiciousPrice: false,
hideDuplicatePhotos: false, hideDuplicatePhotos: false,
pages: 1,
}) })
const CONDITIONS = [ const CONDITIONS = [
@ -407,6 +423,43 @@ async function onSearch() {
height: 14px; height: 14px;
} }
.filter-pages {
display: flex;
gap: var(--space-1);
}
.filter-pages-btn {
flex: 1;
padding: var(--space-1) 0;
background: var(--color-surface-raised);
border: 1px solid var(--color-border);
border-radius: var(--radius-sm);
color: var(--color-text-muted);
font-family: var(--font-body);
font-size: 0.8125rem;
font-weight: 600;
cursor: pointer;
transition: background 120ms ease, color 120ms ease, border-color 120ms ease;
}
.filter-pages-btn:hover:not(.filter-pages-btn--active) {
border-color: var(--app-primary);
color: var(--app-primary);
}
.filter-pages-btn--active {
background: var(--app-primary);
border-color: var(--app-primary);
color: var(--color-text-inverse);
}
.filter-pages-hint {
font-size: 0.6875rem;
color: var(--color-text-muted);
margin: 0;
opacity: 0.75;
}
/* Results area */ /* Results area */
.results-area { .results-area {
flex: 1; flex: 1;