snipe/app/platforms/ebay/scraper.py
pyr0ball 6ec0f957b9 feat(snipe): auction support + easter eggs (Konami, The Steal, de-emphasis)
Auction metadata:
- Listing model gains buying_format + ends_at fields
- Migration 002 adds columns to existing databases
- scraper.py: parse s-item__time-left → absolute ends_at ISO timestamp
- normaliser.py: extract buyingOptions + itemEndDate from Browse API
- store.py: save/get updated for new fields

Easter eggs (app/ui/components/easter_eggs.py):
- Konami code detector (JS → URL param → Streamlit rerun)
- Web Audio API snipe call synthesis, gated behind sidebar checkbox
  (disabled by default for safety/accessibility)
- "The Steal" gold shimmer: trust ≥ 90, price 15–30% below market,
  no suspicious_price flag
- Auction de-emphasis: soft caption when > 1h remaining

UI updates:
- listing_row: steal banner + auction notice per row
- Search: inject CSS, check snipe mode, "Ending soon" sort option,
  pass market_price from comp cache to row renderer
- app.py: Konami detector + audio enable/disable sidebar toggle

Tests: 22 new tests (72 total, all green)
2026-03-25 14:27:02 -07:00

265 lines
9.8 KiB
Python

"""Scraper-based eBay adapter — free tier, no API key required.
Data available from search results HTML (single page load):
✅ title, price, condition, photos, URL
✅ seller username, feedback count, feedback ratio
❌ account registration date → account_age_score = None (score_is_partial)
❌ category history → category_history_score = None (score_is_partial)
This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores.
"""
from __future__ import annotations

import hashlib
import logging
import re
import time
from datetime import datetime, timedelta, timezone
from typing import Optional

import requests
from bs4 import BeautifulSoup

from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
from app.platforms import PlatformAdapter, SearchFilters
# Public eBay HTML search endpoint — no API key required.
EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html"

# Browser-like request headers (presumably to avoid bot filtering of the
# default requests user agent — TODO confirm against current eBay behavior).
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# 'tech_seller (1,234) 99.1% positive feedback' -> (name, count, percent)
_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%")
# First numeric token in a price string, e.g. '950.00' in '$950.00'.
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
# eBay item URLs embed the numeric listing id as /itm/<digits>.
_ITEM_ID_RE = re.compile(r"/itm/(\d+)")
# '3d 14h left' / '14h 23m left' / '23m 45s left' — every unit optional.
_TIME_LEFT_RE = re.compile(r"(?:(\d+)d\s*)?(?:(\d+)h\s*)?(?:(\d+)m\s*)?(?:(\d+)s\s*)?left", re.I)
# ---------------------------------------------------------------------------
# Pure HTML parsing functions (unit-testable, no HTTP)
# ---------------------------------------------------------------------------
def _parse_price(text: str) -> float:
    """Return the first numeric value found in price text.

    Handles '$950.00', '$900.00 to $1,050.00', '$1,234.56/ea'. For a
    range the first (lower) figure wins — conservative for trust scoring.
    Returns 0.0 when no number is present at all.
    """
    match = _PRICE_RE.search(text.replace(",", ""))
    if match is None:
        return 0.0
    return float(match.group())
def _parse_seller(text: str) -> tuple[str, int, float]:
    """Split eBay seller-info text into (username, feedback_count, feedback_ratio).

    'tech_seller (1,234) 99.1% positive feedback' -> ('tech_seller', 1234, 0.991).
    On an unrecognised format, falls back to the first whitespace-delimited
    token (or '') with zeroed feedback stats.
    """
    stripped = text.strip()
    match = _SELLER_RE.match(stripped)
    if match is None:
        fallback_name = stripped.split()[0] if stripped else ""
        return fallback_name, 0, 0.0
    username, raw_count, raw_pct = match.groups()
    return username.strip(), int(raw_count.replace(",", "")), float(raw_pct) / 100.0
def _parse_time_left(text: str) -> Optional[timedelta]:
    """Convert eBay time-left text into a timedelta.

    Understands '3d 14h left', '14h 23m left', '23m 45s left'. Returns None
    for empty or unmatched text (i.e. a fixed-price listing), and also for a
    matched but all-zero duration.
    """
    if not text:
        return None
    match = _TIME_LEFT_RE.search(text)
    if match is None or not any(match.groups()):
        return None
    days, hours, minutes, seconds = (int(g or 0) for g in match.groups())
    if not (days or hours or minutes or seconds):
        return None
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
def scrape_listings(html: str) -> list[Listing]:
    """Parse eBay search results HTML into Listing objects.

    Pure function over the HTML string — no HTTP — so it is unit-testable
    with fixture pages. Cards without a title element (and the injected
    "Shop on eBay" promo card) are skipped entirely.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    for item in soup.select("li.s-item"):
        # eBay injects a ghost "Shop on eBay" promo as the first item — skip it
        title_el = item.select_one("h3.s-item__title span, div.s-item__title span")
        if not title_el or "Shop on eBay" in title_el.text:
            continue
        link_el = item.select_one("a.s-item__link")
        # Strip the query string so the URL (and any id derived from it) is stable.
        url = link_el["href"].split("?")[0] if link_el else ""
        id_match = _ITEM_ID_RE.search(url)
        # Prefer the numeric /itm/<id>; otherwise fall back to a short URL
        # hash so the listing still gets a deterministic key.
        platform_listing_id = (
            id_match.group(1) if id_match else hashlib.md5(url.encode()).hexdigest()[:12]
        )
        price_el = item.select_one("span.s-item__price")
        price = _parse_price(price_el.text) if price_el else 0.0
        condition_el = item.select_one("span.SECONDARY_INFO")
        condition = condition_el.text.strip().lower() if condition_el else ""
        seller_el = item.select_one("span.s-item__seller-info-text")
        seller_username = _parse_seller(seller_el.text)[0] if seller_el else ""
        # Images are lazy-loaded — check data-src before src
        img_el = item.select_one("div.s-item__image-wrapper img, .s-item__image img")
        photo_url = ""
        if img_el:
            photo_url = img_el.get("data-src") or img_el.get("src") or ""
        # Auction detection: presence of s-item__time-left means auction format
        time_el = item.select_one("span.s-item__time-left")
        time_remaining = _parse_time_left(time_el.text) if time_el else None
        buying_format = "auction" if time_remaining is not None else "fixed_price"
        ends_at = None
        if time_remaining is not None:
            # Store an absolute UTC ISO timestamp — "time left" is only
            # meaningful relative to scrape time.
            ends_at = (datetime.now(timezone.utc) + time_remaining).isoformat()
        results.append(Listing(
            platform="ebay",
            platform_listing_id=platform_listing_id,
            title=title_el.text.strip(),
            price=price,
            currency="USD",  # assumes US-site results — TODO confirm for other locales
            condition=condition,
            seller_platform_id=seller_username,
            url=url,
            photo_urls=[photo_url] if photo_url else [],
            listing_age_days=0,  # not reliably in search HTML
            buying_format=buying_format,
            ends_at=ends_at,
        ))
    return results
def scrape_sellers(html: str) -> dict[str, Seller]:
    """Collect unique Seller objects from search results HTML, keyed by username.

    account_age_days and category_history_json are left empty — filling them
    would need one extra seller-profile request per seller, and that data gap
    is exactly what separates the free (scraper) tier from the paid (API) tier.
    """
    soup = BeautifulSoup(html, "lxml")
    found: dict[str, Seller] = {}
    for card in soup.select("li.s-item"):
        info_el = card.select_one("span.s-item__seller-info-text")
        if info_el is None:
            continue
        username, feedback_count, feedback_ratio = _parse_seller(info_el.text)
        # First occurrence wins; skip cards with no parseable username.
        if not username or username in found:
            continue
        found[username] = Seller(
            platform="ebay",
            platform_seller_id=username,
            username=username,
            account_age_days=0,  # not available from search HTML
            feedback_count=feedback_count,
            feedback_ratio=feedback_ratio,
            category_history_json="{}",  # not available from search HTML
        )
    return found
# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------
class ScrapedEbayAdapter(PlatformAdapter):
    """
    Scraper-based eBay adapter implementing PlatformAdapter with no API key.

    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True.
    """

    def __init__(self, store: Store, delay: float = 0.5):
        """
        Args:
            store: persistence layer for sellers and market comps.
            delay: seconds slept before every request (crude rate limiting).
        """
        self._store = store
        self._delay = delay
        # One session for connection reuse, carrying the browser-like headers.
        self._session = requests.Session()
        self._session.headers.update(_HEADERS)

    def _get(self, params: dict) -> str:
        """Fetch the search page with *params*; raises on HTTP error status."""
        time.sleep(self._delay)  # politeness delay before every hit
        resp = self._session.get(EBAY_SEARCH_URL, params=params, timeout=15)
        resp.raise_for_status()
        return resp.text

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Run an eBay search and return parsed listings.

        Side effect: sellers scraped from the same results page are saved to
        the store, so get_seller() needs no extra fetch afterwards.
        """
        # _sop=15: price + shipping lowest first; _ipg=48: results per page.
        params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
        if filters.max_price:
            params["_udhi"] = str(filters.max_price)
        if filters.min_price:
            params["_udlo"] = str(filters.min_price)
        if filters.condition:
            # eBay condition-id codes used by the LH_ItemCondition filter.
            cond_map = {
                "new": "1000", "used": "3000",
                "open box": "2500", "for parts": "7000",
            }
            codes = [cond_map[c] for c in filters.condition if c in cond_map]
            if codes:
                params["LH_ItemCondition"] = "|".join(codes)
        html = self._get(params)
        listings = scrape_listings(html)
        # Cache seller objects extracted from the same page
        for seller in scrape_sellers(html).values():
            self._store.save_seller(seller)
        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Return the cached Seller, or None if search() hasn't seen them."""
        # Sellers are pre-populated during search(); no extra fetch needed
        return self._store.get_seller("ebay", seller_platform_id)

    def get_completed_sales(self, query: str) -> list[Listing]:
        """Scrape sold/completed listings for *query* and cache a market comp.

        Returns [] on a cache hit (comp already stored) or on any scrape
        failure — comps are best-effort and must never break the caller.
        """
        query_hash = hashlib.md5(query.encode()).hexdigest()
        if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored
        params = {
            "_nkw": query,
            "LH_Sold": "1",      # sold items only
            "LH_Complete": "1",  # completed listings only
            "_sop": "13",        # price + shipping: lowest first
            "_ipg": "48",
        }
        try:
            html = self._get(params)
            listings = scrape_listings(html)
            prices = sorted(l.price for l in listings if l.price > 0)
            if prices:
                # Upper-middle element for even-length samples — an
                # approximation that is stable and close enough for comps.
                median = prices[len(prices) // 2]
                self._store.save_market_comp(MarketComp(
                    platform="ebay",
                    query_hash=query_hash,
                    median_price=median,
                    sample_count=len(prices),
                    expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
                ))
            return listings
        except Exception as exc:
            # Best-effort by design, but a silent swallow hides network and
            # parse failures from operators — log before bailing out.
            logging.getLogger(__name__).warning(
                "completed-sales scrape failed for %r: %s", query, exc
            )
            return []