feat(snipe): parallel search+comps, pagination, title fix, price flag fix

- Parallel execution: search() and get_completed_sales() now run
  concurrently via ThreadPoolExecutor — each gets its own Store/SQLite
  connection for thread safety. First cold search time ~halved.

- Pagination: SearchFilters.pages (default 1) controls how many eBay
  result pages are fetched. Both search and sold-comps support up to 3
  parallel Playwright sessions per call (capped to avoid Xvfb overload).
  UI: segmented 1/2/3/5 pages selector in filter sidebar with cost hint.

- True median: get_completed_sales() now averages the two middle values
  for even-length price lists instead of always taking the lower bound.

- Fix suspicious_price false positive: aggregator now checks
  signal_scores.get("price_vs_market") == 0 (pre-None-substitution)
  so listings without market data are never flagged as suspicious.

- Fix title pollution: scraper strips eBay's hidden screen-reader span
  ("Opens in a new window or tab") from listing titles via regex.
  Lazy-imports playwright/playwright_stealth inside _get() so pure
  parsing functions are importable without the full browser stack.

- Tests: 48 pass on host (scraper tests now runnable without Docker),
  new regression guards for all three bug fixes.
This commit is contained in:
pyr0ball 2026-03-25 22:16:08 -07:00
parent 2ab41219f8
commit ea78b9c2cd
9 changed files with 184 additions and 31 deletions

View file

@ -5,6 +5,7 @@ import dataclasses
import hashlib
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from fastapi import FastAPI, HTTPException
@ -38,33 +39,44 @@ def health():
@app.get("/api/search")
def search(q: str = "", max_price: float = 0, min_price: float = 0):
def search(q: str = "", max_price: float = 0, min_price: float = 0, pages: int = 1):
if not q.strip():
return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None}
store = Store(_DB_PATH)
adapter = ScrapedEbayAdapter(store)
filters = SearchFilters(
max_price=max_price if max_price > 0 else None,
min_price=min_price if min_price > 0 else None,
pages=max(1, pages),
)
# Each adapter gets its own Store (SQLite connection) — required for thread safety.
# search() and get_completed_sales() run concurrently; they write to different tables
# so SQLite file-level locking is the only contention point.
search_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
comps_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
try:
listings = adapter.search(q, filters)
adapter.get_completed_sales(q) # warm market comp cache
with ThreadPoolExecutor(max_workers=2) as ex:
listings_future = ex.submit(search_adapter.search, q, filters)
comps_future = ex.submit(comps_adapter.get_completed_sales, q, pages)
listings = listings_future.result()
comps_future.result() # wait; side-effect is saving market comp to DB
except Exception as e:
log.warning("eBay scrape failed: %s", e)
raise HTTPException(status_code=502, detail=f"eBay search failed: {e}")
# Use search_adapter's store for post-processing — it has the sellers already written
store = search_adapter._store
store.save_listings(listings)
scorer = TrustScorer(store)
trust_scores_list = scorer.score_batch(listings, q)
# Market comp
# Market comp written by comps_adapter — read from a fresh connection to avoid
# cross-thread connection reuse
comp_store = Store(_DB_PATH)
query_hash = hashlib.md5(q.encode()).hexdigest()
comp = store.get_market_comp("ebay", query_hash)
comp = comp_store.get_market_comp("ebay", query_hash)
market_price = comp.median_price if comp else None
# Serialize — keyed by platform_listing_id for easy Vue lookup

View file

@ -12,6 +12,7 @@ class SearchFilters:
min_price: Optional[float] = None
condition: Optional[list[str]] = field(default_factory=list)
location_radius_km: Optional[int] = None
pages: int = 1 # number of result pages to fetch (48 listings/page)
class PlatformAdapter(ABC):

View file

@ -14,12 +14,11 @@ import hashlib
import itertools
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from typing import Optional
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from playwright_stealth import Stealth
from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
@ -164,10 +163,15 @@ def scrape_listings(html: str) -> list[Listing]:
buying_format = "auction" if time_remaining is not None else "fixed_price"
ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None
# Strip eBay's screen-reader accessibility text injected into title links.
# get_text() is CSS-blind and picks up visually-hidden spans.
raw_title = title_el.get_text(separator=" ", strip=True)
title = re.sub(r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE).strip()
results.append(Listing(
platform="ebay",
platform_listing_id=platform_listing_id,
title=title_el.get_text(strip=True),
title=title,
price=price,
currency="USD",
condition=condition,
@ -256,6 +260,9 @@ class ScrapedEbayAdapter(PlatformAdapter):
env["DISPLAY"] = display
try:
from playwright.sync_api import sync_playwright # noqa: PLC0415 — lazy: only needed in Docker
from playwright_stealth import Stealth # noqa: PLC0415
with sync_playwright() as pw:
browser = pw.chromium.launch(
headless=False,
@ -280,12 +287,12 @@ class ScrapedEbayAdapter(PlatformAdapter):
return html
def search(self, query: str, filters: SearchFilters) -> list[Listing]:
params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
if filters.max_price:
params["_udhi"] = str(filters.max_price)
base_params["_udhi"] = str(filters.max_price)
if filters.min_price:
params["_udlo"] = str(filters.min_price)
base_params["_udlo"] = str(filters.min_price)
if filters.condition:
cond_map = {
"new": "1000", "used": "3000",
@ -293,38 +300,62 @@ class ScrapedEbayAdapter(PlatformAdapter):
}
codes = [cond_map[c] for c in filters.condition if c in cond_map]
if codes:
params["LH_ItemCondition"] = "|".join(codes)
base_params["LH_ItemCondition"] = "|".join(codes)
html = self._get(params)
listings = scrape_listings(html)
pages = max(1, filters.pages)
page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
# Cache seller objects extracted from the same page
self._store.save_sellers(list(scrape_sellers(html).values()))
with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
htmls = list(ex.map(self._get, page_params))
seen_ids: set[str] = set()
listings: list[Listing] = []
sellers: dict[str, "Seller"] = {}
for html in htmls:
for listing in scrape_listings(html):
if listing.platform_listing_id not in seen_ids:
seen_ids.add(listing.platform_listing_id)
listings.append(listing)
sellers.update(scrape_sellers(html))
self._store.save_sellers(list(sellers.values()))
return listings
def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
# Sellers are pre-populated during search(); no extra fetch needed
return self._store.get_seller("ebay", seller_platform_id)
def get_completed_sales(self, query: str) -> list[Listing]:
def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
query_hash = hashlib.md5(query.encode()).hexdigest()
if self._store.get_market_comp("ebay", query_hash):
return [] # cache hit — comp already stored
params = {
base_params = {
"_nkw": query,
"LH_Sold": "1",
"LH_Complete": "1",
"_sop": "13", # price + shipping: lowest first
"_sop": "13", # sort by price+shipping, lowest first
"_ipg": "48",
}
pages = max(1, pages)
page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
try:
html = self._get(params)
listings = scrape_listings(html)
prices = sorted(l.price for l in listings if l.price > 0)
with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
htmls = list(ex.map(self._get, page_params))
seen_ids: set[str] = set()
all_listings: list[Listing] = []
for html in htmls:
for listing in scrape_listings(html):
if listing.platform_listing_id not in seen_ids:
seen_ids.add(listing.platform_listing_id)
all_listings.append(listing)
prices = sorted(l.price for l in all_listings if l.price > 0)
if prices:
median = prices[len(prices) // 2]
mid = len(prices) // 2
median = (prices[mid - 1] + prices[mid]) / 2 if len(prices) % 2 == 0 else prices[mid]
self._store.save_market_comp(MarketComp(
platform="ebay",
query_hash=query_hash,
@ -332,6 +363,6 @@ class ScrapedEbayAdapter(PlatformAdapter):
sample_count=len(prices),
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
))
return listings
return all_listings
except Exception:
return []

View file

@ -37,7 +37,7 @@ class Aggregator:
red_flags.append("account_under_30_days")
if seller and seller.feedback_count < 10:
red_flags.append("low_feedback_count")
if clean["price_vs_market"] == 0:
if signal_scores.get("price_vs_market") == 0: # only flag when data exists and price is genuinely <50% of market
red_flags.append("suspicious_price")
if photo_hash_duplicate:
red_flags.append("duplicate_photo")

View file

@ -1,4 +1,5 @@
import pytest
from datetime import datetime, timedelta, timezone
from pathlib import Path
from app.db.store import Store
from app.db.models import Listing, Seller, TrustScore, MarketComp
@ -57,7 +58,7 @@ def test_save_and_get_market_comp(store):
query_hash="abc123",
median_price=1050.0,
sample_count=12,
expires_at="2026-03-26T00:00:00",
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
)
store.save_market_comp(comp)
result = store.get_market_comp("ebay", "abc123")

View file

@ -27,8 +27,9 @@ _EBAY_HTML = """
</li>
<!-- Real listing 1: established seller, used, fixed price -->
<!-- Includes eBay's hidden accessibility span to test title stripping -->
<li class="s-card" data-listingid="123456789">
<div class="s-card__title">RTX 4090 Founders Edition GPU</div>
<div class="s-card__title">RTX 4090 Founders Edition GPU<span class="clipped">Opens in a new window or tab</span></div>
<a class="s-card__link" href="https://www.ebay.com/itm/123456789?somequery=1"></a>
<span class="s-card__price">$950.00</span>
<div class="s-card__subtitle">Used · Free shipping</div>
@ -179,6 +180,15 @@ class TestScrapeListings:
titles = [l.title for l in listings]
assert "Shop on eBay" not in titles
def test_strips_ebay_accessibility_text_from_title(self):
"""eBay injects a hidden 'Opens in a new window or tab' span into title links
for screen readers. get_text() is CSS-blind so we must strip it explicitly."""
listings = scrape_listings(_EBAY_HTML)
for listing in listings:
assert "Opens in a new window or tab" not in listing.title
# Verify the actual title content is preserved
assert listings[0].title == "RTX 4090 Founders Edition GPU"
def test_parses_three_real_listings(self):
assert len(scrape_listings(_EBAY_HTML)) == 3

View file

@ -50,3 +50,46 @@ def test_partial_score_flagged_when_signals_missing():
}
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert result.score_is_partial is True
def test_suspicious_price_not_flagged_when_market_data_absent():
"""None price_vs_market (no market comp) must NOT trigger suspicious_price.
Regression guard: clean[] replaces None with 0, so naive `clean[...] == 0`
would fire even when the signal is simply unavailable.
"""
agg = Aggregator()
scores = {
"account_age": 15, "feedback_count": 15,
"feedback_ratio": 20, "price_vs_market": None, # no market data
"category_history": 0,
}
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert "suspicious_price" not in result.red_flags_json
def test_suspicious_price_flagged_when_price_genuinely_low():
"""price_vs_market == 0 (explicitly, meaning >50% below median) → flag fires."""
agg = Aggregator()
scores = {
"account_age": 15, "feedback_count": 15,
"feedback_ratio": 20, "price_vs_market": 0, # price is scam-level low
"category_history": 0,
}
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
assert "suspicious_price" in result.red_flags_json
def test_new_account_not_flagged_when_age_absent():
"""account_age_days=None (scraper tier) must NOT trigger new_account or account_under_30_days."""
agg = Aggregator()
scores = {k: 10 for k in ["account_age", "feedback_count",
"feedback_ratio", "price_vs_market", "category_history"]}
scraper_seller = Seller(
platform="ebay", platform_seller_id="u", username="u",
account_age_days=None, # not fetched at scraper tier
feedback_count=50, feedback_ratio=0.99, category_history_json="{}",
)
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=scraper_seller)
assert "new_account" not in result.red_flags_json
assert "account_under_30_days" not in result.red_flags_json

View file

@ -60,6 +60,7 @@ export interface SearchFilters {
hideNewAccounts?: boolean
hideSuspiciousPrice?: boolean
hideDuplicatePhotos?: boolean
pages?: number // number of eBay result pages to fetch (48 listings/page, default 1)
}
// ── Store ────────────────────────────────────────────────────────────────────
@ -83,7 +84,8 @@ export const useSearchStore = defineStore('search', () => {
// API does not exist yet — stub returns empty results
const params = new URLSearchParams({ q })
if (filters.maxPrice != null) params.set('max_price', String(filters.maxPrice))
if (filters.minTrustScore != null) params.set('min_trust', String(filters.minTrustScore))
if (filters.minPrice != null) params.set('min_price', String(filters.minPrice))
if (filters.pages != null && filters.pages > 1) params.set('pages', String(filters.pages))
const res = await fetch(`/api/search?${params}`)
if (!res.ok) throw new Error(`Search failed: ${res.status} ${res.statusText}`)

View file

@ -41,6 +41,21 @@
<span class="filter-range-val">{{ filters.minTrustScore ?? 0 }}</span>
</fieldset>
<fieldset class="filter-group">
<legend class="filter-label">Pages to fetch</legend>
<div class="filter-pages" role="group" aria-label="Number of result pages">
<button
v-for="p in [1, 2, 3, 5]"
:key="p"
type="button"
class="filter-pages-btn"
:class="{ 'filter-pages-btn--active': filters.pages === p }"
@click="filters.pages = p"
>{{ p }}</button>
</div>
<p class="filter-pages-hint">{{ (filters.pages ?? 1) * 48 }} listings · {{ (filters.pages ?? 1) * 2 }} Playwright calls</p>
</fieldset>
<fieldset class="filter-group">
<legend class="filter-label">Price</legend>
<div class="filter-row">
@ -166,6 +181,7 @@ const filters = reactive<SearchFilters>({
hideNewAccounts: false,
hideSuspiciousPrice: false,
hideDuplicatePhotos: false,
pages: 1,
})
const CONDITIONS = [
@ -407,6 +423,43 @@ async function onSearch() {
height: 14px;
}
.filter-pages {
display: flex;
gap: var(--space-1);
}
.filter-pages-btn {
flex: 1;
padding: var(--space-1) 0;
background: var(--color-surface-raised);
border: 1px solid var(--color-border);
border-radius: var(--radius-sm);
color: var(--color-text-muted);
font-family: var(--font-body);
font-size: 0.8125rem;
font-weight: 600;
cursor: pointer;
transition: background 120ms ease, color 120ms ease, border-color 120ms ease;
}
.filter-pages-btn:hover:not(.filter-pages-btn--active) {
border-color: var(--app-primary);
color: var(--app-primary);
}
.filter-pages-btn--active {
background: var(--app-primary);
border-color: var(--app-primary);
color: var(--color-text-inverse);
}
.filter-pages-hint {
font-size: 0.6875rem;
color: var(--color-text-muted);
margin: 0;
opacity: 0.75;
}
/* Results area */
.results-area {
flex: 1;