Auction metadata: - Listing model gains buying_format + ends_at fields - Migration 002 adds columns to existing databases - scraper.py: parse s-item__time-left → absolute ends_at ISO timestamp - normaliser.py: extract buyingOptions + itemEndDate from Browse API - store.py: save/get updated for new fields Easter eggs (app/ui/components/easter_eggs.py): - Konami code detector (JS → URL param → Streamlit rerun) - Web Audio API snipe call synthesis, gated behind sidebar checkbox (disabled by default for safety/accessibility) - "The Steal" gold shimmer: trust ≥ 90, price 15–30% below market, no suspicious_price flag - Auction de-emphasis: soft caption when > 1h remaining UI updates: - listing_row: steal banner + auction notice per row - Search: inject CSS, check snipe mode, "Ending soon" sort option, pass market_price from comp cache to row renderer - app.py: Konami detector + audio enable/disable sidebar toggle Tests: 22 new tests (72 total, all green)
265 lines
9.8 KiB
Python
"""Scraper-based eBay adapter — free tier, no API key required.
|
|
|
|
Data available from search results HTML (single page load):
|
|
✅ title, price, condition, photos, URL
|
|
✅ seller username, feedback count, feedback ratio
|
|
❌ account registration date → account_age_score = None (score_is_partial)
|
|
❌ category history → category_history_score = None (score_is_partial)
|
|
|
|
This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores.
|
|
"""
|
|
from __future__ import annotations

import hashlib
import re
import statistics
import time
from datetime import datetime, timedelta, timezone
from typing import Optional

import requests
from bs4 import BeautifulSoup

from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
from app.platforms import PlatformAdapter, SearchFilters
|
|
|
|
# Public eBay search results endpoint (plain HTML page — no API key required).
EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html"


# Browser-like headers so eBay serves the normal desktop results markup
# rather than a bot-challenge or stripped-down page.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


# Seller info text, e.g. 'tech_seller (1,234) 99.1% positive feedback'
# -> groups: (username, comma-grouped feedback count, percentage).
_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%")
# First numeric token in a price string (commas stripped before matching).
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
# Numeric item id embedded in a listing URL: /itm/<digits>.
_ITEM_ID_RE = re.compile(r"/itm/(\d+)")
# Auction countdown text like '3d 14h left' — all components optional.
_TIME_LEFT_RE = re.compile(r"(?:(\d+)d\s*)?(?:(\d+)h\s*)?(?:(\d+)m\s*)?(?:(\d+)s\s*)?left", re.I)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pure HTML parsing functions (unit-testable, no HTTP)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_price(text: str) -> float:
    """Extract the first numeric value from price text.

    Handles '$950.00', '$900.00 to $1,050.00', '$1,234.56/ea'. For a
    price range the first (lower) bound is taken — conservative for
    trust scoring. Returns 0.0 when no number is present.
    """
    cleaned = text.replace(",", "")
    match = _PRICE_RE.search(cleaned)
    if match is None:
        return 0.0
    return float(match.group())
|
|
|
|
|
|
def _parse_seller(text: str) -> tuple[str, int, float]:
    """Parse eBay seller-info text into (username, feedback_count, feedback_ratio).

    Input format: 'tech_seller (1,234) 99.1% positive feedback'
    Returns ('tech_seller', 1234, 0.991). If the text does not match the
    expected format, the first whitespace-separated token is returned as
    the username with zeroed feedback stats.
    """
    stripped = text.strip()
    match = _SELLER_RE.match(stripped)
    if match is None:
        # Graceful fallback: best-guess username, no feedback data.
        fallback = stripped.split()[0] if stripped else ""
        return fallback, 0, 0.0
    username, raw_count, raw_percent = match.groups()
    return username.strip(), int(raw_count.replace(",", "")), float(raw_percent) / 100.0
|
|
|
|
|
|
def _parse_time_left(text: str) -> Optional[timedelta]:
    """Parse eBay time-left text into a timedelta.

    Handles '3d 14h left', '14h 23m left', '23m 45s left'. Returns None
    when the text is empty, doesn't match, or sums to zero time — i.e.
    the card is a fixed-price listing.
    """
    if not text:
        return None
    match = _TIME_LEFT_RE.search(text)
    if match is None or not any(match.groups()):
        return None
    days, hours, minutes, seconds = (int(group or 0) for group in match.groups())
    remaining = timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    # timedelta(0) is falsy — an all-zero countdown is treated as no auction.
    return remaining if remaining else None
|
|
|
|
|
|
def scrape_listings(html: str) -> list[Listing]:
    """Parse eBay search results HTML into Listing objects."""
    soup = BeautifulSoup(html, "lxml")
    listings: list[Listing] = []

    for card in soup.select("li.s-item"):
        # eBay injects a ghost "Shop on eBay" promo as the first item — skip it.
        title_node = card.select_one("h3.s-item__title span, div.s-item__title span")
        if title_node is None or "Shop on eBay" in title_node.text:
            continue

        anchor = card.select_one("a.s-item__link")
        url = anchor["href"].split("?")[0] if anchor else ""
        id_match = _ITEM_ID_RE.search(url)
        if id_match:
            platform_listing_id = id_match.group(1)
        else:
            # No /itm/<digits> segment — fall back to a stable hash of the URL.
            platform_listing_id = hashlib.md5(url.encode()).hexdigest()[:12]

        price_node = card.select_one("span.s-item__price")
        price = _parse_price(price_node.text) if price_node else 0.0

        condition_node = card.select_one("span.SECONDARY_INFO")
        condition = condition_node.text.strip().lower() if condition_node else ""

        seller_node = card.select_one("span.s-item__seller-info-text")
        seller_username = _parse_seller(seller_node.text)[0] if seller_node else ""

        # Images are lazy-loaded — check data-src before src.
        img_node = card.select_one("div.s-item__image-wrapper img, .s-item__image img")
        photo_url = ""
        if img_node is not None:
            photo_url = img_node.get("data-src") or img_node.get("src") or ""

        # Auction detection: presence of s-item__time-left means auction format.
        time_node = card.select_one("span.s-item__time-left")
        remaining = _parse_time_left(time_node.text) if time_node else None
        if remaining is None:
            buying_format, ends_at = "fixed_price", None
        else:
            buying_format = "auction"
            ends_at = (datetime.now(timezone.utc) + remaining).isoformat()

        listings.append(Listing(
            platform="ebay",
            platform_listing_id=platform_listing_id,
            title=title_node.text.strip(),
            price=price,
            currency="USD",
            condition=condition,
            seller_platform_id=seller_username,
            url=url,
            photo_urls=[photo_url] if photo_url else [],
            listing_age_days=0,  # not reliably in search HTML
            buying_format=buying_format,
            ends_at=ends_at,
        ))

    return listings
|
|
|
|
|
|
def scrape_sellers(html: str) -> dict[str, Seller]:
    """Extract Seller objects from search results HTML, keyed by username.

    account_age_days and category_history_json are left empty — they
    require a separate seller profile page fetch, which would mean one
    extra HTTP request per seller. That data gap is what separates the
    free (scraper) tier from the paid (API) tier.
    """
    soup = BeautifulSoup(html, "lxml")
    found: dict[str, Seller] = {}

    for card in soup.select("li.s-item"):
        info_node = card.select_one("span.s-item__seller-info-text")
        if info_node is None:
            continue
        username, feedback_count, feedback_ratio = _parse_seller(info_node.text)
        if not username or username in found:
            continue  # blank name, or first occurrence already recorded
        found[username] = Seller(
            platform="ebay",
            platform_seller_id=username,
            username=username,
            account_age_days=0,  # not available from search HTML
            feedback_count=feedback_count,
            feedback_ratio=feedback_ratio,
            category_history_json="{}",  # not available from search HTML
        )

    return found
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Adapter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ScrapedEbayAdapter(PlatformAdapter):
    """
    Scraper-based eBay adapter implementing PlatformAdapter with no API key.

    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True.
    """

    # Human-readable condition filter -> eBay LH_ItemCondition code.
    _CONDITION_CODES = {
        "new": "1000",
        "used": "3000",
        "open box": "2500",
        "for parts": "7000",
    }

    def __init__(self, store: Store, delay: float = 0.5):
        """Create the adapter.

        Args:
            store: Persistence layer for cached sellers and market comps.
            delay: Seconds slept before every HTTP request — a politeness
                throttle so we don't hammer eBay.
        """
        self._store = store
        self._delay = delay
        self._session = requests.Session()
        self._session.headers.update(_HEADERS)

    def _get(self, params: dict) -> str:
        """Fetch the eBay search page with *params* and return the HTML.

        Sleeps self._delay first (rate limit); raises requests.HTTPError
        on a non-2xx response.
        """
        time.sleep(self._delay)
        resp = self._session.get(EBAY_SEARCH_URL, params=params, timeout=15)
        resp.raise_for_status()
        return resp.text

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Search active eBay listings for *query*, applying *filters*.

        Side effect: sellers parsed from the same results page are saved
        to the store, so get_seller() needs no additional fetch.
        """
        params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}

        if filters.max_price:
            params["_udhi"] = str(filters.max_price)
        if filters.min_price:
            params["_udlo"] = str(filters.min_price)
        if filters.condition:
            # Silently drop condition labels eBay has no code for.
            codes = [
                self._CONDITION_CODES[c]
                for c in filters.condition
                if c in self._CONDITION_CODES
            ]
            if codes:
                params["LH_ItemCondition"] = "|".join(codes)

        html = self._get(params)
        listings = scrape_listings(html)

        # Cache seller objects extracted from the same page
        for seller in scrape_sellers(html).values():
            self._store.save_seller(seller)

        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Return a cached Seller, or None if search() hasn't seen them."""
        # Sellers are pre-populated during search(); no extra fetch needed
        return self._store.get_seller("ebay", seller_platform_id)

    def get_completed_sales(self, query: str) -> list[Listing]:
        """Fetch sold/completed listings for *query* and cache a MarketComp.

        Returns [] on a cache hit (comp already stored) or on any scrape
        failure — comps are best-effort and must never break a search.
        """
        query_hash = hashlib.md5(query.encode()).hexdigest()
        if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored

        params = {
            "_nkw": query,
            "LH_Sold": "1",
            "LH_Complete": "1",
            "_sop": "13",  # price + shipping: lowest first
            "_ipg": "48",
        }
        try:
            html = self._get(params)
            listings = scrape_listings(html)
            prices = sorted(l.price for l in listings if l.price > 0)
            if prices:
                # BUGFIX: the previous prices[len(prices) // 2] picked the
                # upper-middle element for even-sized samples, biasing the
                # comp high. statistics.median averages the two middle
                # values, giving the true median.
                self._store.save_market_comp(MarketComp(
                    platform="ebay",
                    query_hash=query_hash,
                    median_price=statistics.median(prices),
                    sample_count=len(prices),
                    expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
                ))
            return listings
        except Exception:
            # Deliberate best-effort swallow: market comps are an
            # enhancement; any network/parse failure degrades to "no comp".
            return []
|