snipe/app/platforms/ebay/scraper.py
pyr0ball ea78b9c2cd feat(snipe): parallel search+comps, pagination, title fix, price flag fix
- Parallel execution: search() and get_completed_sales() now run
  concurrently via ThreadPoolExecutor — each gets its own Store/SQLite
  connection for thread safety. First cold search time ~halved.

- Pagination: SearchFilters.pages (default 1) controls how many eBay
  result pages are fetched. Both search and sold-comps support up to 3
  parallel Playwright sessions per call (capped to avoid Xvfb overload).
  UI: segmented 1/2/3/5 pages selector in filter sidebar with cost hint.

- True median: get_completed_sales() now averages the two middle values
  for even-length price lists instead of always taking the lower bound.

- Fix suspicious_price false positive: aggregator now checks
  signal_scores.get("price_vs_market") == 0 (pre-None-substitution)
  so listings without market data are never flagged as suspicious.

- Fix title pollution: scraper strips eBay's hidden screen-reader span
  ("Opens in a new window or tab") from listing titles via regex.
  Lazy-imports playwright/playwright_stealth inside _get() so pure
  parsing functions are importable without the full browser stack.

- Tests: 48 pass on host (scraper tests now runnable without Docker),
  new regression guards for all three bug fixes.
2026-03-25 22:16:08 -07:00

368 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Scraper-based eBay adapter — free tier, no API key required.
Data available from search results HTML (single page load):
✅ title, price, condition, photos, URL
✅ seller username, feedback count, feedback ratio
❌ account registration date → account_age_score = None (score_is_partial)
❌ category history → category_history_score = None (score_is_partial)
This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores.
"""
from __future__ import annotations

import hashlib
import itertools
import logging
import os
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from typing import Optional
from urllib.parse import urlencode

from bs4 import BeautifulSoup

from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
from app.platforms import PlatformAdapter, SearchFilters
# Base URL of the eBay search-results page; the adapter appends query parameters to it.
EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html"
_HTML_CACHE_TTL = 300 # seconds — 5 minutes
# Module-level cache persists across per-request adapter instantiations.
# Keyed by URL; value is (html, expiry_timestamp).
_html_cache: dict[str, tuple[str, float]] = {}
# Cycle through display numbers :200 to :299 (range(200, 300)) so
# concurrent/sequential Playwright calls don't collide on the Xvfb lock file
# from the previous run. The cycle wraps back to :200 after :299.
_display_counter = itertools.cycle(range(200, 300))
# Browser-like request headers. In the code visible in this file only the
# "User-Agent" entry is read (it is passed to the Playwright browser context).
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
# Combined seller text: "<username> (<count>) <percent>%", e.g. "tech_seller (1,234) 99.1%".
_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%")
# Feedback span text: "<percent>% positive (<count>)", case-insensitive.
_FEEDBACK_RE = re.compile(r"([\d.]+)%\s+positive\s+\(([0-9,]+)\)", re.I)
# First run of digits/commas with an optional decimal part.
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
# Numeric item id following "/itm/" in a URL path.
_ITEM_ID_RE = re.compile(r"/itm/(\d+)")
# Optional day/hour/minute/second components (each "<n>d", "<n>h", "<n>m", "<n>s")
# followed by the word "left", case-insensitive. Every component is optional, so
# the bare word "left" also matches with all four groups empty.
_TIME_LEFT_RE = re.compile(r"(?:(\d+)d\s*)?(?:(\d+)h\s*)?(?:(\d+)m\s*)?(?:(\d+)s\s*)?left", re.I)
# ---------------------------------------------------------------------------
# Pure HTML parsing functions (unit-testable, no HTTP)
# ---------------------------------------------------------------------------
def _parse_price(text: str) -> float:
    """Return the first number found in a price string, or 0.0 when there is none.

    Commas are removed before matching, so '$1,234.56/ea' yields 1234.56.
    For a range such as '$900.00 to $1,050.00' the first (lower) figure is
    returned, which is the conservative choice for trust scoring.
    """
    without_commas = text.replace(",", "")
    match = _PRICE_RE.search(without_commas)
    if match is None:
        return 0.0
    return float(match.group())
def _parse_seller(text: str) -> tuple[str, int, float]:
    """Split combined seller text into (username, feedback_count, feedback_ratio).

    Expected input looks like 'tech_seller (1,234) 99.1% positive feedback',
    which produces ('tech_seller', 1234, 0.991). The percentage is converted
    to a 0-1 ratio.

    When the text does not follow that layout, the first whitespace-separated
    word (or '' for blank input) is returned as the username with a count of 0
    and a ratio of 0.0.
    """
    cleaned = text.strip()
    match = _SELLER_RE.match(cleaned)
    if match is not None:
        name, raw_count, raw_percent = match.groups()
        return name.strip(), int(raw_count.replace(",", "")), float(raw_percent) / 100.0
    first_word = cleaned.split()[0] if cleaned else ""
    return first_word, 0, 0.0
def _parse_time_left(text: str) -> Optional[timedelta]:
    """Convert eBay "time left" text into a timedelta.

    Understands forms such as '3d 14h left', '14h 23m left' and '23m 45s left'.
    Returns None for empty input, for text that does not match (i.e. a
    fixed-price listing), and when every matched component is zero or absent.
    """
    if not text:
        return None
    match = _TIME_LEFT_RE.search(text)
    if match is None:
        return None
    # Absent components come back as None from the regex; treat them as zero.
    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    if not any((days, hours, minutes, seconds)):
        return None
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
def _extract_seller_from_card(card) -> tuple[str, int, float]:
    """Pull (username, feedback_count, feedback_ratio) out of one s-card element.

    The new eBay layout puts the seller username and the feedback summary in
    separate ``span.su-styled-text`` elements. The first span whose text matches
    the feedback pattern supplies the count and ratio; the non-empty span
    immediately before it is taken as the username. If the feedback span is
    the first one, the username is ''. If no span matches, ('', 0, 0.0) is
    returned.
    """
    span_texts = []
    for span in card.select("span.su-styled-text"):
        content = span.get_text(strip=True)
        if content:
            span_texts.append(content)

    for index, content in enumerate(span_texts):
        hit = _FEEDBACK_RE.search(content)
        if hit is None:
            continue
        raw_percent, raw_count = hit.groups()
        ratio = float(raw_percent) / 100.0
        count = int(raw_count.replace(",", ""))
        # Username is the span just before the feedback span, when there is one.
        username = span_texts[index - 1].strip() if index > 0 else ""
        return username, count, ratio

    return "", 0, 0.0
def scrape_listings(html: str) -> list[Listing]:
    """Turn eBay search-results HTML into a list of Listing objects.

    Each ``li.s-card`` element becomes one Listing. Cards without a
    ``data-listingid`` attribute, without a title element, or whose title
    contains "Shop on eBay" are skipped as promos. A card is classified as an
    auction when one of its styled-text spans parses as a time-left string;
    otherwise it is treated as fixed price and ``ends_at`` is None.
    """
    soup = BeautifulSoup(html, "lxml")
    listings = []
    for card in soup.select("li.s-card"):
        # Skip promos: no data-listingid or title is "Shop on eBay"
        listing_id = card.get("data-listingid", "")
        if not listing_id:
            continue
        title_node = card.select_one("div.s-card__title")
        if not title_node or "Shop on eBay" in title_node.get_text():
            continue

        # Item URL with the tracking query string dropped.
        link_node = card.select_one('a.s-card__link[href*="/itm/"]')
        url = ""
        if link_node:
            url = link_node["href"].split("?")[0]

        price_node = card.select_one("span.s-card__price")
        price = 0.0
        if price_node:
            price = _parse_price(price_node.get_text())

        # Condition is the part of the subtitle before the first "·" separator.
        subtitle_node = card.select_one("div.s-card__subtitle")
        condition = ""
        if subtitle_node:
            subtitle = subtitle_node.get_text(strip=True)
            condition = subtitle.split("·")[0].strip().lower()

        seller_username = _extract_seller_from_card(card)[0]

        image_node = card.select_one("img.s-card__image")
        photo_url = ""
        if image_node:
            photo_url = image_node.get("src") or image_node.get("data-src") or ""

        # Auction detection: the first span that parses as a time-left string wins.
        parsed_spans = (
            _parse_time_left(span.get_text(strip=True))
            for span in card.select("span.su-styled-text")
        )
        time_remaining = next((delta for delta in parsed_spans if delta), None)
        if time_remaining is not None:
            buying_format = "auction"
        else:
            buying_format = "fixed_price"
        if time_remaining:
            ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat()
        else:
            ends_at = None

        # get_text() is CSS-blind and picks up eBay's visually-hidden
        # screen-reader span inside the title link, so strip that phrase.
        raw_title = title_node.get_text(separator=" ", strip=True)
        title = re.sub(
            r"\s*Opens in a new window or tab\s*", "", raw_title, flags=re.IGNORECASE
        ).strip()

        listings.append(
            Listing(
                platform="ebay",
                platform_listing_id=listing_id,
                title=title,
                price=price,
                currency="USD",
                condition=condition,
                seller_platform_id=seller_username,
                url=url,
                photo_urls=[photo_url] if photo_url else [],
                listing_age_days=0,
                buying_format=buying_format,
                ends_at=ends_at,
            )
        )
    return listings
def scrape_sellers(html: str) -> dict[str, Seller]:
    """Build Seller objects from search-results HTML, keyed by username.

    Only the first card seen for each username contributes; cards without a
    ``data-listingid`` attribute or without a detectable username are ignored.

    ``account_age_days`` and ``category_history_json`` are left empty — they
    require a separate seller profile page fetch, which would mean one extra
    HTTP request per seller. That data gap is what separates the free
    (scraper) tier from the paid (API) tier.
    """
    soup = BeautifulSoup(html, "lxml")
    found: dict[str, Seller] = {}
    for card in soup.select("li.s-card"):
        if not card.get("data-listingid"):
            continue
        name, feedback_count, feedback_ratio = _extract_seller_from_card(card)
        if not name or name in found:
            continue
        found[name] = Seller(
            platform="ebay",
            platform_seller_id=name,
            username=name,
            account_age_days=None,  # not fetched at scraper tier
            feedback_count=feedback_count,
            feedback_ratio=feedback_ratio,
            category_history_json="{}",  # not available from search HTML
        )
    return found
# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------
class ScrapedEbayAdapter(PlatformAdapter):
    """Scraper-based eBay adapter implementing PlatformAdapter with no API key.

    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True.
    """

    # Values sent in the LH_ItemCondition parameter, keyed by the condition
    # names accepted in SearchFilters.condition. Unknown names are ignored.
    _CONDITION_CODES = {
        "new": "1000",
        "used": "3000",
        "open box": "2500",
        "for parts": "7000",
    }
    # Cap on simultaneous Playwright/Xvfb sessions started by a single call.
    _MAX_PARALLEL_SESSIONS = 3

    _log = logging.getLogger(__name__)

    def __init__(self, store: Store, delay: float = 1.0):
        """Create an adapter.

        Args:
            store: Persistence layer used for sellers and market comps.
            delay: Seconds to sleep before every non-cached page fetch.
        """
        self._store = store
        self._delay = delay

    def _get(self, params: dict) -> str:
        """Fetch eBay search HTML via a stealthed Playwright Chromium instance.

        Uses an Xvfb virtual display (headless=False) to avoid Kasada's headless
        detection — same pattern as other CF scrapers that face JS challenges.
        Results are cached for _HTML_CACHE_TTL seconds so repeated searches
        for the same query return immediately without re-scraping.

        Args:
            params: Query-string parameters for the search URL.

        Returns:
            The rendered page HTML.

        Raises:
            Whatever Xvfb start-up, the Playwright imports, or the page load
            raise. The Xvfb process is terminated in every case.
        """
        # urlencode() escapes spaces, '&', '#', '=' and similar characters in
        # the user's query so they cannot corrupt the parameter list or
        # truncate the URL (and so the cache key is unambiguous).
        url = f"{EBAY_SEARCH_URL}?{urlencode(params)}"
        cached = _html_cache.get(url)
        if cached and time.time() < cached[1]:
            return cached[0]

        time.sleep(self._delay)

        display = f":{next(_display_counter)}"
        xvfb = subprocess.Popen(
            ["Xvfb", display, "-screen", "0", "1280x800x24"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        env = os.environ.copy()
        env["DISPLAY"] = display
        try:
            from playwright.sync_api import sync_playwright  # noqa: PLC0415 — lazy: only needed in Docker
            from playwright_stealth import Stealth  # noqa: PLC0415

            with sync_playwright() as pw:
                browser = pw.chromium.launch(
                    headless=False,
                    env=env,
                    args=["--no-sandbox", "--disable-dev-shm-usage"],
                )
                try:
                    ctx = browser.new_context(
                        user_agent=_HEADERS["User-Agent"],
                        viewport={"width": 1280, "height": 800},
                    )
                    page = ctx.new_page()
                    Stealth().apply_stealth_sync(page)
                    page.goto(url, wait_until="domcontentloaded", timeout=30_000)
                    page.wait_for_timeout(2000)  # let any JS challenges resolve
                    html = page.content()
                finally:
                    # Close the browser even when navigation times out.
                    browser.close()
        finally:
            xvfb.terminate()
            xvfb.wait()

        _html_cache[url] = (html, time.time() + _HTML_CACHE_TTL)
        return html

    def _fetch_pages(self, base_params: dict, pages: int) -> list[str]:
        """Fetch result pages 1..pages concurrently and return their HTML in page order.

        A ``pages`` value below 1 (or None) is treated as 1. At most
        _MAX_PARALLEL_SESSIONS fetches run at the same time.
        """
        page_count = max(1, pages or 1)
        page_params = [
            {**base_params, "_pgn": str(number)}
            for number in range(1, page_count + 1)
        ]
        workers = min(page_count, self._MAX_PARALLEL_SESSIONS)
        with ThreadPoolExecutor(max_workers=workers) as executor:
            return list(executor.map(self._get, page_params))

    @staticmethod
    def _unique_listings(htmls: list[str]) -> list[Listing]:
        """Parse every page and drop listings whose id already appeared on an earlier page."""
        seen_ids: set[str] = set()
        unique: list[Listing] = []
        for html in htmls:
            for listing in scrape_listings(html):
                if listing.platform_listing_id in seen_ids:
                    continue
                seen_ids.add(listing.platform_listing_id)
                unique.append(listing)
        return unique

    @staticmethod
    def _median(sorted_prices: list[float]) -> float:
        """Return the median of a non-empty ascending list (mean of the two middle values when even)."""
        mid = len(sorted_prices) // 2
        if len(sorted_prices) % 2 == 0:
            return (sorted_prices[mid - 1] + sorted_prices[mid]) / 2
        return sorted_prices[mid]

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Search active listings and persist the sellers found on the result pages.

        Args:
            query: Free-text search keywords.
            filters: Optional price bounds, condition names and page count.

        Returns:
            De-duplicated listings from all fetched pages, in page order.

        Raises:
            Any exception raised while fetching or parsing propagates to the caller.
        """
        # NOTE(review): "_sop" selects eBay's sort order; 15 is presumably
        # "price + shipping: lowest first" — confirm against eBay.
        base_params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
        if filters.max_price:
            base_params["_udhi"] = str(filters.max_price)
        if filters.min_price:
            base_params["_udlo"] = str(filters.min_price)
        if filters.condition:
            codes = [
                self._CONDITION_CODES[name]
                for name in filters.condition
                if name in self._CONDITION_CODES
            ]
            if codes:
                base_params["LH_ItemCondition"] = "|".join(codes)

        htmls = self._fetch_pages(base_params, filters.pages)
        listings = self._unique_listings(htmls)

        # Later pages overwrite earlier entries for the same username.
        sellers: dict[str, Seller] = {}
        for html in htmls:
            sellers.update(scrape_sellers(html))
        self._store.save_sellers(list(sellers.values()))
        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Return the stored seller record, or whatever the store returns when it is unknown."""
        # Sellers are pre-populated during search(); no extra fetch needed
        return self._store.get_seller("ebay", seller_platform_id)

    def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
        """Scrape sold/completed listings and store their median price as a market comp.

        Args:
            query: Free-text search keywords.
            pages: Number of result pages to fetch (values below 1 mean 1).

        Returns:
            The de-duplicated sold listings. An empty list is returned when a
            comp for this query is already stored (nothing is fetched) and when
            scraping fails (best effort; the failure is logged).
        """
        # md5 is used only to derive a compact lookup key, not for security.
        query_hash = hashlib.md5(query.encode()).hexdigest()
        if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored

        base_params = {
            "_nkw": query,
            "LH_Sold": "1",
            "LH_Complete": "1",
            # NOTE(review): the previous comment said 13 sorts by
            # price+shipping lowest first, but search() uses 15 for that; 13 is
            # believed to be "end date: recent first" — confirm against eBay.
            "_sop": "13",
            "_ipg": "48",
        }
        try:
            htmls = self._fetch_pages(base_params, pages)
            all_listings = self._unique_listings(htmls)
            prices = sorted(item.price for item in all_listings if item.price > 0)
            if prices:
                expires_at = datetime.now(timezone.utc) + timedelta(hours=6)
                self._store.save_market_comp(MarketComp(
                    platform="ebay",
                    query_hash=query_hash,
                    median_price=self._median(prices),
                    sample_count=len(prices),
                    expires_at=expires_at.isoformat(),
                ))
            return all_listings
        except Exception:
            # Comps are best effort: keep returning [] so the caller's search
            # still succeeds, but record the traceback instead of hiding it.
            self._log.exception("eBay sold-comps scrape failed for query %r", query)
            return []