feat(snipe): parallel search+comps, pagination, title fix, price flag fix
- Parallel execution: search() and get_completed_sales() now run
concurrently via ThreadPoolExecutor — each gets its own Store/SQLite
connection for thread safety. First cold search time ~halved.
- Pagination: SearchFilters.pages (default 1) controls how many eBay
result pages are fetched. Both search and sold-comps support up to 3
parallel Playwright sessions per call (capped to avoid Xvfb overload).
UI: segmented 1/2/3/5 pages selector in filter sidebar with cost hint.
- True median: get_completed_sales() now averages the two middle values
for even-length price lists instead of always taking the upper-middle
element (`prices[len(prices) // 2]`).
- Fix suspicious_price false positive: aggregator now checks
signal_scores.get("price_vs_market") == 0 (pre-None-substitution)
so listings without market data are never flagged as suspicious.
- Fix title pollution: scraper strips eBay's hidden screen-reader span
("Opens in a new window or tab") from listing titles via regex.
- Lazy-imports playwright/playwright_stealth inside _get() so pure
parsing functions are importable without the full browser stack.
- Tests: 48 pass on host (scraper tests now runnable without Docker),
new regression guards for all three bug fixes.
This commit is contained in:
parent
2ab41219f8
commit
ea78b9c2cd
9 changed files with 184 additions and 31 deletions
28
api/main.py
28
api/main.py
|
|
@ -5,6 +5,7 @@ import dataclasses
|
|||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
|
|
@ -38,33 +39,44 @@ def health():
|
|||
|
||||
|
||||
@app.get("/api/search")
|
||||
def search(q: str = "", max_price: float = 0, min_price: float = 0):
|
||||
def search(q: str = "", max_price: float = 0, min_price: float = 0, pages: int = 1):
|
||||
if not q.strip():
|
||||
return {"listings": [], "trust_scores": {}, "sellers": {}, "market_price": None}
|
||||
|
||||
store = Store(_DB_PATH)
|
||||
adapter = ScrapedEbayAdapter(store)
|
||||
|
||||
filters = SearchFilters(
|
||||
max_price=max_price if max_price > 0 else None,
|
||||
min_price=min_price if min_price > 0 else None,
|
||||
pages=max(1, pages),
|
||||
)
|
||||
|
||||
# Each adapter gets its own Store (SQLite connection) — required for thread safety.
|
||||
# search() and get_completed_sales() run concurrently; they write to different tables
|
||||
# so SQLite file-level locking is the only contention point.
|
||||
search_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
|
||||
comps_adapter = ScrapedEbayAdapter(Store(_DB_PATH))
|
||||
|
||||
try:
|
||||
listings = adapter.search(q, filters)
|
||||
adapter.get_completed_sales(q) # warm market comp cache
|
||||
with ThreadPoolExecutor(max_workers=2) as ex:
|
||||
listings_future = ex.submit(search_adapter.search, q, filters)
|
||||
comps_future = ex.submit(comps_adapter.get_completed_sales, q, pages)
|
||||
listings = listings_future.result()
|
||||
comps_future.result() # wait; side-effect is saving market comp to DB
|
||||
except Exception as e:
|
||||
log.warning("eBay scrape failed: %s", e)
|
||||
raise HTTPException(status_code=502, detail=f"eBay search failed: {e}")
|
||||
|
||||
# Use search_adapter's store for post-processing — it has the sellers already written
|
||||
store = search_adapter._store
|
||||
store.save_listings(listings)
|
||||
|
||||
scorer = TrustScorer(store)
|
||||
trust_scores_list = scorer.score_batch(listings, q)
|
||||
|
||||
# Market comp
|
||||
# Market comp written by comps_adapter — read from a fresh connection to avoid
|
||||
# cross-thread connection reuse
|
||||
comp_store = Store(_DB_PATH)
|
||||
query_hash = hashlib.md5(q.encode()).hexdigest()
|
||||
comp = store.get_market_comp("ebay", query_hash)
|
||||
comp = comp_store.get_market_comp("ebay", query_hash)
|
||||
market_price = comp.median_price if comp else None
|
||||
|
||||
# Serialize — keyed by platform_listing_id for easy Vue lookup
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ class SearchFilters:
|
|||
min_price: Optional[float] = None
|
||||
condition: Optional[list[str]] = field(default_factory=list)
|
||||
location_radius_km: Optional[int] = None
|
||||
pages: int = 1 # number of result pages to fetch (48 listings/page)
|
||||
|
||||
|
||||
class PlatformAdapter(ABC):
|
||||
|
|
|
|||
|
|
@ -14,12 +14,11 @@ import hashlib
|
|||
import itertools
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.sync_api import sync_playwright
|
||||
from playwright_stealth import Stealth
|
||||
|
||||
from app.db.models import Listing, MarketComp, Seller
|
||||
from app.db.store import Store
|
||||
|
|
@ -164,10 +163,15 @@ def scrape_listings(html: str) -> list[Listing]:
|
|||
buying_format = "auction" if time_remaining is not None else "fixed_price"
|
||||
ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat() if time_remaining else None
|
||||
|
||||
# Strip eBay's screen-reader accessibility text injected into title links.
|
||||
# get_text() is CSS-blind and picks up visually-hidden spans.
|
||||
raw_title = title_el.get_text(separator=" ", strip=True)
|
||||
title = re.sub(r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE).strip()
|
||||
|
||||
results.append(Listing(
|
||||
platform="ebay",
|
||||
platform_listing_id=platform_listing_id,
|
||||
title=title_el.get_text(strip=True),
|
||||
title=title,
|
||||
price=price,
|
||||
currency="USD",
|
||||
condition=condition,
|
||||
|
|
@ -256,6 +260,9 @@ class ScrapedEbayAdapter(PlatformAdapter):
|
|||
env["DISPLAY"] = display
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright # noqa: PLC0415 — lazy: only needed in Docker
|
||||
from playwright_stealth import Stealth # noqa: PLC0415
|
||||
|
||||
with sync_playwright() as pw:
|
||||
browser = pw.chromium.launch(
|
||||
headless=False,
|
||||
|
|
@ -280,12 +287,12 @@ class ScrapedEbayAdapter(PlatformAdapter):
|
|||
return html
|
||||
|
||||
def search(self, query: str, filters: SearchFilters) -> list[Listing]:
|
||||
params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
|
||||
base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
|
||||
|
||||
if filters.max_price:
|
||||
params["_udhi"] = str(filters.max_price)
|
||||
base_params["_udhi"] = str(filters.max_price)
|
||||
if filters.min_price:
|
||||
params["_udlo"] = str(filters.min_price)
|
||||
base_params["_udlo"] = str(filters.min_price)
|
||||
if filters.condition:
|
||||
cond_map = {
|
||||
"new": "1000", "used": "3000",
|
||||
|
|
@ -293,38 +300,62 @@ class ScrapedEbayAdapter(PlatformAdapter):
|
|||
}
|
||||
codes = [cond_map[c] for c in filters.condition if c in cond_map]
|
||||
if codes:
|
||||
params["LH_ItemCondition"] = "|".join(codes)
|
||||
base_params["LH_ItemCondition"] = "|".join(codes)
|
||||
|
||||
html = self._get(params)
|
||||
listings = scrape_listings(html)
|
||||
pages = max(1, filters.pages)
|
||||
page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
|
||||
|
||||
# Cache seller objects extracted from the same page
|
||||
self._store.save_sellers(list(scrape_sellers(html).values()))
|
||||
with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
|
||||
htmls = list(ex.map(self._get, page_params))
|
||||
|
||||
seen_ids: set[str] = set()
|
||||
listings: list[Listing] = []
|
||||
sellers: dict[str, "Seller"] = {}
|
||||
for html in htmls:
|
||||
for listing in scrape_listings(html):
|
||||
if listing.platform_listing_id not in seen_ids:
|
||||
seen_ids.add(listing.platform_listing_id)
|
||||
listings.append(listing)
|
||||
sellers.update(scrape_sellers(html))
|
||||
|
||||
self._store.save_sellers(list(sellers.values()))
|
||||
return listings
|
||||
|
||||
def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
|
||||
# Sellers are pre-populated during search(); no extra fetch needed
|
||||
return self._store.get_seller("ebay", seller_platform_id)
|
||||
|
||||
def get_completed_sales(self, query: str) -> list[Listing]:
|
||||
def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
|
||||
query_hash = hashlib.md5(query.encode()).hexdigest()
|
||||
if self._store.get_market_comp("ebay", query_hash):
|
||||
return [] # cache hit — comp already stored
|
||||
|
||||
params = {
|
||||
base_params = {
|
||||
"_nkw": query,
|
||||
"LH_Sold": "1",
|
||||
"LH_Complete": "1",
|
||||
"_sop": "13", # price + shipping: lowest first
|
||||
"_sop": "13", # sort by price+shipping, lowest first
|
||||
"_ipg": "48",
|
||||
}
|
||||
pages = max(1, pages)
|
||||
page_params = [{**base_params, "_pgn": str(p)} for p in range(1, pages + 1)]
|
||||
|
||||
try:
|
||||
html = self._get(params)
|
||||
listings = scrape_listings(html)
|
||||
prices = sorted(l.price for l in listings if l.price > 0)
|
||||
with ThreadPoolExecutor(max_workers=min(pages, 3)) as ex:
|
||||
htmls = list(ex.map(self._get, page_params))
|
||||
|
||||
seen_ids: set[str] = set()
|
||||
all_listings: list[Listing] = []
|
||||
for html in htmls:
|
||||
for listing in scrape_listings(html):
|
||||
if listing.platform_listing_id not in seen_ids:
|
||||
seen_ids.add(listing.platform_listing_id)
|
||||
all_listings.append(listing)
|
||||
|
||||
prices = sorted(l.price for l in all_listings if l.price > 0)
|
||||
if prices:
|
||||
median = prices[len(prices) // 2]
|
||||
mid = len(prices) // 2
|
||||
median = (prices[mid - 1] + prices[mid]) / 2 if len(prices) % 2 == 0 else prices[mid]
|
||||
self._store.save_market_comp(MarketComp(
|
||||
platform="ebay",
|
||||
query_hash=query_hash,
|
||||
|
|
@ -332,6 +363,6 @@ class ScrapedEbayAdapter(PlatformAdapter):
|
|||
sample_count=len(prices),
|
||||
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
|
||||
))
|
||||
return listings
|
||||
return all_listings
|
||||
except Exception:
|
||||
return []
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ class Aggregator:
|
|||
red_flags.append("account_under_30_days")
|
||||
if seller and seller.feedback_count < 10:
|
||||
red_flags.append("low_feedback_count")
|
||||
if clean["price_vs_market"] == 0:
|
||||
if signal_scores.get("price_vs_market") == 0: # only flag when data exists and price is genuinely <50% of market
|
||||
red_flags.append("suspicious_price")
|
||||
if photo_hash_duplicate:
|
||||
red_flags.append("duplicate_photo")
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import pytest
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from app.db.store import Store
|
||||
from app.db.models import Listing, Seller, TrustScore, MarketComp
|
||||
|
|
@ -57,7 +58,7 @@ def test_save_and_get_market_comp(store):
|
|||
query_hash="abc123",
|
||||
median_price=1050.0,
|
||||
sample_count=12,
|
||||
expires_at="2026-03-26T00:00:00",
|
||||
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
|
||||
)
|
||||
store.save_market_comp(comp)
|
||||
result = store.get_market_comp("ebay", "abc123")
|
||||
|
|
|
|||
|
|
@ -27,8 +27,9 @@ _EBAY_HTML = """
|
|||
</li>
|
||||
|
||||
<!-- Real listing 1: established seller, used, fixed price -->
|
||||
<!-- Includes eBay's hidden accessibility span to test title stripping -->
|
||||
<li class="s-card" data-listingid="123456789">
|
||||
<div class="s-card__title">RTX 4090 Founders Edition GPU</div>
|
||||
<div class="s-card__title">RTX 4090 Founders Edition GPU<span class="clipped">Opens in a new window or tab</span></div>
|
||||
<a class="s-card__link" href="https://www.ebay.com/itm/123456789?somequery=1"></a>
|
||||
<span class="s-card__price">$950.00</span>
|
||||
<div class="s-card__subtitle">Used · Free shipping</div>
|
||||
|
|
@ -179,6 +180,15 @@ class TestScrapeListings:
|
|||
titles = [l.title for l in listings]
|
||||
assert "Shop on eBay" not in titles
|
||||
|
||||
def test_strips_ebay_accessibility_text_from_title(self):
|
||||
"""eBay injects a hidden 'Opens in a new window or tab' span into title links
|
||||
for screen readers. get_text() is CSS-blind so we must strip it explicitly."""
|
||||
listings = scrape_listings(_EBAY_HTML)
|
||||
for listing in listings:
|
||||
assert "Opens in a new window or tab" not in listing.title
|
||||
# Verify the actual title content is preserved
|
||||
assert listings[0].title == "RTX 4090 Founders Edition GPU"
|
||||
|
||||
def test_parses_three_real_listings(self):
|
||||
assert len(scrape_listings(_EBAY_HTML)) == 3
|
||||
|
||||
|
|
|
|||
|
|
@ -50,3 +50,46 @@ def test_partial_score_flagged_when_signals_missing():
|
|||
}
|
||||
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
|
||||
assert result.score_is_partial is True
|
||||
|
||||
|
||||
def test_suspicious_price_not_flagged_when_market_data_absent():
|
||||
"""None price_vs_market (no market comp) must NOT trigger suspicious_price.
|
||||
|
||||
Regression guard: clean[] replaces None with 0, so naive `clean[...] == 0`
|
||||
would fire even when the signal is simply unavailable.
|
||||
"""
|
||||
agg = Aggregator()
|
||||
scores = {
|
||||
"account_age": 15, "feedback_count": 15,
|
||||
"feedback_ratio": 20, "price_vs_market": None, # no market data
|
||||
"category_history": 0,
|
||||
}
|
||||
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
|
||||
assert "suspicious_price" not in result.red_flags_json
|
||||
|
||||
|
||||
def test_suspicious_price_flagged_when_price_genuinely_low():
|
||||
"""price_vs_market == 0 (explicitly, meaning >50% below median) → flag fires."""
|
||||
agg = Aggregator()
|
||||
scores = {
|
||||
"account_age": 15, "feedback_count": 15,
|
||||
"feedback_ratio": 20, "price_vs_market": 0, # price is scam-level low
|
||||
"category_history": 0,
|
||||
}
|
||||
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=None)
|
||||
assert "suspicious_price" in result.red_flags_json
|
||||
|
||||
|
||||
def test_new_account_not_flagged_when_age_absent():
|
||||
"""account_age_days=None (scraper tier) must NOT trigger new_account or account_under_30_days."""
|
||||
agg = Aggregator()
|
||||
scores = {k: 10 for k in ["account_age", "feedback_count",
|
||||
"feedback_ratio", "price_vs_market", "category_history"]}
|
||||
scraper_seller = Seller(
|
||||
platform="ebay", platform_seller_id="u", username="u",
|
||||
account_age_days=None, # not fetched at scraper tier
|
||||
feedback_count=50, feedback_ratio=0.99, category_history_json="{}",
|
||||
)
|
||||
result = agg.aggregate(scores, photo_hash_duplicate=False, seller=scraper_seller)
|
||||
assert "new_account" not in result.red_flags_json
|
||||
assert "account_under_30_days" not in result.red_flags_json
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ export interface SearchFilters {
|
|||
hideNewAccounts?: boolean
|
||||
hideSuspiciousPrice?: boolean
|
||||
hideDuplicatePhotos?: boolean
|
||||
pages?: number // number of eBay result pages to fetch (48 listings/page, default 1)
|
||||
}
|
||||
|
||||
// ── Store ────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -83,7 +84,8 @@ export const useSearchStore = defineStore('search', () => {
|
|||
// API does not exist yet — stub returns empty results
|
||||
const params = new URLSearchParams({ q })
|
||||
if (filters.maxPrice != null) params.set('max_price', String(filters.maxPrice))
|
||||
if (filters.minTrustScore != null) params.set('min_trust', String(filters.minTrustScore))
|
||||
if (filters.minPrice != null) params.set('min_price', String(filters.minPrice))
|
||||
if (filters.pages != null && filters.pages > 1) params.set('pages', String(filters.pages))
|
||||
const res = await fetch(`/api/search?${params}`)
|
||||
if (!res.ok) throw new Error(`Search failed: ${res.status} ${res.statusText}`)
|
||||
|
||||
|
|
|
|||
|
|
@ -41,6 +41,21 @@
|
|||
<span class="filter-range-val">{{ filters.minTrustScore ?? 0 }}</span>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="filter-group">
|
||||
<legend class="filter-label">Pages to fetch</legend>
|
||||
<div class="filter-pages" role="group" aria-label="Number of result pages">
|
||||
<button
|
||||
v-for="p in [1, 2, 3, 5]"
|
||||
:key="p"
|
||||
type="button"
|
||||
class="filter-pages-btn"
|
||||
:class="{ 'filter-pages-btn--active': filters.pages === p }"
|
||||
@click="filters.pages = p"
|
||||
>{{ p }}</button>
|
||||
</div>
|
||||
<p class="filter-pages-hint">{{ (filters.pages ?? 1) * 48 }} listings · {{ (filters.pages ?? 1) * 2 }} Playwright calls</p>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="filter-group">
|
||||
<legend class="filter-label">Price</legend>
|
||||
<div class="filter-row">
|
||||
|
|
@ -166,6 +181,7 @@ const filters = reactive<SearchFilters>({
|
|||
hideNewAccounts: false,
|
||||
hideSuspiciousPrice: false,
|
||||
hideDuplicatePhotos: false,
|
||||
pages: 1,
|
||||
})
|
||||
|
||||
const CONDITIONS = [
|
||||
|
|
@ -407,6 +423,43 @@ async function onSearch() {
|
|||
height: 14px;
|
||||
}
|
||||
|
||||
.filter-pages {
|
||||
display: flex;
|
||||
gap: var(--space-1);
|
||||
}
|
||||
|
||||
.filter-pages-btn {
|
||||
flex: 1;
|
||||
padding: var(--space-1) 0;
|
||||
background: var(--color-surface-raised);
|
||||
border: 1px solid var(--color-border);
|
||||
border-radius: var(--radius-sm);
|
||||
color: var(--color-text-muted);
|
||||
font-family: var(--font-body);
|
||||
font-size: 0.8125rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 120ms ease, color 120ms ease, border-color 120ms ease;
|
||||
}
|
||||
|
||||
.filter-pages-btn:hover:not(.filter-pages-btn--active) {
|
||||
border-color: var(--app-primary);
|
||||
color: var(--app-primary);
|
||||
}
|
||||
|
||||
.filter-pages-btn--active {
|
||||
background: var(--app-primary);
|
||||
border-color: var(--app-primary);
|
||||
color: var(--color-text-inverse);
|
||||
}
|
||||
|
||||
.filter-pages-hint {
|
||||
font-size: 0.6875rem;
|
||||
color: var(--color-text-muted);
|
||||
margin: 0;
|
||||
opacity: 0.75;
|
||||
}
|
||||
|
||||
/* Results area */
|
||||
.results-area {
|
||||
flex: 1;
|
||||
|
|
|
|||
Loading…
Reference in a new issue