From 68a98791911d40ac9b32aa7cfbe3f5d8c616743c Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 25 Mar 2026 14:12:29 -0700 Subject: [PATCH] feat: add scraper adapter with auto-detect fallback and partial score logging --- .env.example | 13 +- app/platforms/ebay/scraper.py | 234 +++++++++++++++++++++++++++ app/ui/Search.py | 42 +++-- pyproject.toml | 2 + tests/platforms/test_ebay_scraper.py | 151 +++++++++++++++++ 5 files changed, 427 insertions(+), 15 deletions(-) create mode 100644 app/platforms/ebay/scraper.py create mode 100644 tests/platforms/test_ebay_scraper.py diff --git a/.env.example b/.env.example index a82c944..6d9c360 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,11 @@ -EBAY_CLIENT_ID=your-client-id-here -EBAY_CLIENT_SECRET=your-client-secret-here -EBAY_ENV=production # or: sandbox +# Snipe works out of the box with the scraper (no credentials needed). +# Set EBAY_CLIENT_ID + EBAY_CLIENT_SECRET to unlock full trust scores +# (account age and category history signals require the eBay Browse API). +# Without credentials the app logs a warning and uses the scraper automatically. + +# Optional — eBay API credentials (self-hosters / paid CF cloud tier) +# EBAY_CLIENT_ID=your-client-id-here +# EBAY_CLIENT_SECRET=your-client-secret-here +# EBAY_ENV=production # or: sandbox + SNIPE_DB=data/snipe.db diff --git a/app/platforms/ebay/scraper.py b/app/platforms/ebay/scraper.py new file mode 100644 index 0000000..c953e21 --- /dev/null +++ b/app/platforms/ebay/scraper.py @@ -0,0 +1,234 @@ +"""Scraper-based eBay adapter — free tier, no API key required. + +Data available from search results HTML (single page load): + ✅ title, price, condition, photos, URL + ✅ seller username, feedback count, feedback ratio + ❌ account registration date → account_age_score = None (score_is_partial) + ❌ category history → category_history_score = None (score_is_partial) + +This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores. 
+""" +from __future__ import annotations + +import hashlib +import re +import time +from datetime import datetime, timedelta, timezone +from typing import Optional + +import requests +from bs4 import BeautifulSoup + +from app.db.models import Listing, MarketComp, Seller +from app.db.store import Store +from app.platforms import PlatformAdapter, SearchFilters + +EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html" + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", +} + +_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%") +_PRICE_RE = re.compile(r"[\d,]+\.?\d*") +_ITEM_ID_RE = re.compile(r"/itm/(\d+)") + + +# --------------------------------------------------------------------------- +# Pure HTML parsing functions (unit-testable, no HTTP) +# --------------------------------------------------------------------------- + +def _parse_price(text: str) -> float: + """Extract first numeric value from price text. + + Handles '$950.00', '$900.00 to $1,050.00', '$1,234.56/ea'. + Takes the lower bound for price ranges (conservative for trust scoring). + """ + m = _PRICE_RE.search(text.replace(",", "")) + return float(m.group()) if m else 0.0 + + +def _parse_seller(text: str) -> tuple[str, int, float]: + """Parse eBay seller-info text into (username, feedback_count, feedback_ratio). + + Input format: 'tech_seller (1,234) 99.1% positive feedback' + Returns ('tech_seller', 1234, 0.991). + Falls back gracefully if the format doesn't match. 
+ """ + text = text.strip() + m = _SELLER_RE.match(text) + if not m: + return (text.split()[0] if text else ""), 0, 0.0 + return m.group(1).strip(), int(m.group(2).replace(",", "")), float(m.group(3)) / 100.0 + + +def scrape_listings(html: str) -> list[Listing]: + """Parse eBay search results HTML into Listing objects.""" + soup = BeautifulSoup(html, "lxml") + results = [] + + for item in soup.select("li.s-item"): + # eBay injects a ghost "Shop on eBay" promo as the first item — skip it + title_el = item.select_one("h3.s-item__title span, div.s-item__title span") + if not title_el or "Shop on eBay" in title_el.text: + continue + + link_el = item.select_one("a.s-item__link") + url = link_el["href"].split("?")[0] if link_el else "" + id_match = _ITEM_ID_RE.search(url) + platform_listing_id = ( + id_match.group(1) if id_match else hashlib.md5(url.encode()).hexdigest()[:12] + ) + + price_el = item.select_one("span.s-item__price") + price = _parse_price(price_el.text) if price_el else 0.0 + + condition_el = item.select_one("span.SECONDARY_INFO") + condition = condition_el.text.strip().lower() if condition_el else "" + + seller_el = item.select_one("span.s-item__seller-info-text") + seller_username = _parse_seller(seller_el.text)[0] if seller_el else "" + + # Images are lazy-loaded — check data-src before src + img_el = item.select_one("div.s-item__image-wrapper img, .s-item__image img") + photo_url = "" + if img_el: + photo_url = img_el.get("data-src") or img_el.get("src") or "" + + results.append(Listing( + platform="ebay", + platform_listing_id=platform_listing_id, + title=title_el.text.strip(), + price=price, + currency="USD", + condition=condition, + seller_platform_id=seller_username, + url=url, + photo_urls=[photo_url] if photo_url else [], + listing_age_days=0, # not reliably in search HTML + )) + + return results + + +def scrape_sellers(html: str) -> dict[str, Seller]: + """Extract Seller objects from search results HTML. + + Returns a dict keyed by username. 
account_age_days and category_history_json
    are left empty — they require a separate seller profile page fetch, which
    would mean one extra HTTP request per seller. That data gap is what separates
    free (scraper) from paid (API) tier.
    """
    soup = BeautifulSoup(html, "lxml")
    sellers: dict[str, Seller] = {}

    for item in soup.select("li.s-item"):
        seller_el = item.select_one("span.s-item__seller-info-text")
        if not seller_el:
            continue
        username, count, ratio = _parse_seller(seller_el.text)
        # Dedupe: the first result card seen for a username wins
        if username and username not in sellers:
            sellers[username] = Seller(
                platform="ebay",
                platform_seller_id=username,
                username=username,
                account_age_days=0,  # not available from search HTML
                feedback_count=count,
                feedback_ratio=ratio,
                category_history_json="{}",  # not available from search HTML
            )

    return sellers


# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------

class ScrapedEbayAdapter(PlatformAdapter):
    """
    Scraper-based eBay adapter implementing PlatformAdapter with no API key.

    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True. 
+ """ + + def __init__(self, store: Store, delay: float = 0.5): + self._store = store + self._delay = delay + self._session = requests.Session() + self._session.headers.update(_HEADERS) + + def _get(self, params: dict) -> str: + time.sleep(self._delay) + resp = self._session.get(EBAY_SEARCH_URL, params=params, timeout=15) + resp.raise_for_status() + return resp.text + + def search(self, query: str, filters: SearchFilters) -> list[Listing]: + params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"} + + if filters.max_price: + params["_udhi"] = str(filters.max_price) + if filters.min_price: + params["_udlo"] = str(filters.min_price) + if filters.condition: + cond_map = { + "new": "1000", "used": "3000", + "open box": "2500", "for parts": "7000", + } + codes = [cond_map[c] for c in filters.condition if c in cond_map] + if codes: + params["LH_ItemCondition"] = "|".join(codes) + + html = self._get(params) + listings = scrape_listings(html) + + # Cache seller objects extracted from the same page + for seller in scrape_sellers(html).values(): + self._store.save_seller(seller) + + return listings + + def get_seller(self, seller_platform_id: str) -> Optional[Seller]: + # Sellers are pre-populated during search(); no extra fetch needed + return self._store.get_seller("ebay", seller_platform_id) + + def get_completed_sales(self, query: str) -> list[Listing]: + query_hash = hashlib.md5(query.encode()).hexdigest() + if self._store.get_market_comp("ebay", query_hash): + return [] # cache hit — comp already stored + + params = { + "_nkw": query, + "LH_Sold": "1", + "LH_Complete": "1", + "_sop": "13", # price + shipping: lowest first + "_ipg": "48", + } + try: + html = self._get(params) + listings = scrape_listings(html) + prices = sorted(l.price for l in listings if l.price > 0) + if prices: + median = prices[len(prices) // 2] + self._store.save_market_comp(MarketComp( + platform="ebay", + query_hash=query_hash, + median_price=median, + sample_count=len(prices), + 
expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(), + )) + return listings + except Exception: + return [] diff --git a/app/ui/Search.py b/app/ui/Search.py index 3af9558..9965ff5 100644 --- a/app/ui/Search.py +++ b/app/ui/Search.py @@ -1,30 +1,47 @@ """Main search + results page.""" from __future__ import annotations +import logging import os from pathlib import Path import streamlit as st from circuitforge_core.config import load_env from app.db.store import Store -from app.platforms import SearchFilters -from app.platforms.ebay.auth import EbayTokenManager -from app.platforms.ebay.adapter import EbayAdapter +from app.platforms import PlatformAdapter, SearchFilters from app.trust import TrustScorer from app.ui.components.filters import build_filter_options, render_filter_sidebar, FilterState from app.ui.components.listing_row import render_listing_row +log = logging.getLogger(__name__) + load_env(Path(".env")) _DB_PATH = Path(os.environ.get("SNIPE_DB", "data/snipe.db")) _DB_PATH.parent.mkdir(exist_ok=True) -def _get_adapter() -> EbayAdapter: - store = Store(_DB_PATH) - tokens = EbayTokenManager( - client_id=os.environ.get("EBAY_CLIENT_ID", ""), - client_secret=os.environ.get("EBAY_CLIENT_SECRET", ""), - env=os.environ.get("EBAY_ENV", "production"), +def _get_adapter(store: Store) -> PlatformAdapter: + """Return the best available eBay adapter based on what's configured. + + Auto-detects: if EBAY_CLIENT_ID + EBAY_CLIENT_SECRET are present, use the + full API adapter (all 5 trust signals). Otherwise fall back to the scraper + (3/5 signals, score_is_partial=True) and warn to logs so ops can see why + scores are partial without touching the UI. 
+ """ + client_id = os.environ.get("EBAY_CLIENT_ID", "").strip() + client_secret = os.environ.get("EBAY_CLIENT_SECRET", "").strip() + + if client_id and client_secret: + from app.platforms.ebay.adapter import EbayAdapter + from app.platforms.ebay.auth import EbayTokenManager + env = os.environ.get("EBAY_ENV", "production") + return EbayAdapter(EbayTokenManager(client_id, client_secret, env), store, env=env) + + log.warning( + "EBAY_CLIENT_ID / EBAY_CLIENT_SECRET not set — " + "falling back to scraper (partial trust scores: account_age and " + "category_history signals unavailable). Set API credentials for full scoring." ) - return EbayAdapter(tokens, store, env=os.environ.get("EBAY_ENV", "production")) + from app.platforms.ebay.scraper import ScrapedEbayAdapter + return ScrapedEbayAdapter(store) def _passes_filter(listing, trust, seller, state: FilterState) -> bool: @@ -68,9 +85,11 @@ def render() -> None: st.info("Enter a search term and click Search.") return + store = Store(_DB_PATH) + adapter = _get_adapter(store) + with st.spinner("Fetching listings..."): try: - adapter = _get_adapter() filters = SearchFilters(max_price=max_price if max_price > 0 else None) listings = adapter.search(query, filters) adapter.get_completed_sales(query) # warm the comps cache @@ -82,7 +101,6 @@ def render() -> None: st.warning("No listings found.") return - store = Store(_DB_PATH) for listing in listings: store.save_listing(listing) if listing.seller_platform_id: diff --git a/pyproject.toml b/pyproject.toml index b6fdb7c..8114277 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ dependencies = [ "imagehash>=4.3", "Pillow>=10.0", "python-dotenv>=1.0", + "beautifulsoup4>=4.12", + "lxml>=5.0", ] [tool.setuptools.packages.find] diff --git a/tests/platforms/test_ebay_scraper.py b/tests/platforms/test_ebay_scraper.py new file mode 100644 index 0000000..18c4141 --- /dev/null +++ b/tests/platforms/test_ebay_scraper.py @@ -0,0 +1,151 @@ +"""Tests for the scraper-based eBay 
adapter. + +Uses a minimal HTML fixture that mirrors eBay's search results structure. +No HTTP requests are made — all tests operate on the pure parsing functions. +""" +import pytest +from app.platforms.ebay.scraper import scrape_listings, scrape_sellers, _parse_price, _parse_seller + +# --------------------------------------------------------------------------- +# Minimal eBay search results HTML fixture +# --------------------------------------------------------------------------- + +_EBAY_HTML = """ + + + +""" + + +# --------------------------------------------------------------------------- +# Unit tests: pure parsing functions +# --------------------------------------------------------------------------- + +class TestParsePrice: + def test_simple_price(self): + assert _parse_price("$950.00") == 950.0 + + def test_price_range_takes_lower_bound(self): + assert _parse_price("$900.00 to $1,050.00") == 900.0 + + def test_price_with_commas(self): + assert _parse_price("$1,100.00") == 1100.0 + + def test_empty_returns_zero(self): + assert _parse_price("") == 0.0 + + +class TestParseSeller: + def test_standard_format(self): + username, count, ratio = _parse_seller("techguy (1,234) 99.1% positive feedback") + assert username == "techguy" + assert count == 1234 + assert ratio == pytest.approx(0.991, abs=0.001) + + def test_low_count(self): + username, count, ratio = _parse_seller("new_user_2024 (2) 100.0% positive feedback") + assert username == "new_user_2024" + assert count == 2 + assert ratio == pytest.approx(1.0, abs=0.001) + + def test_fallback_on_malformed(self): + username, count, ratio = _parse_seller("weirdformat") + assert username == "weirdformat" + assert count == 0 + assert ratio == 0.0 + + +# --------------------------------------------------------------------------- +# Integration tests: HTML fixture → domain objects +# --------------------------------------------------------------------------- + +class TestScrapeListings: + def 
test_skips_shop_on_ebay_ghost(self):
        # the "Shop on eBay" promo card must be filtered out by its title
        listings = scrape_listings(_EBAY_HTML)
        titles = [l.title for l in listings]
        assert all("Shop on eBay" not in t for t in titles)

    def test_parses_three_real_listings(self):
        listings = scrape_listings(_EBAY_HTML)
        assert len(listings) == 3

    def test_extracts_platform_listing_id_from_url(self):
        listings = scrape_listings(_EBAY_HTML)
        assert listings[0].platform_listing_id == "123456789"
        assert listings[1].platform_listing_id == "987654321"

    def test_price_range_takes_lower(self):
        # listing 2's price text is a range; _parse_price keeps the lower bound
        listings = scrape_listings(_EBAY_HTML)
        assert listings[1].price == 1100.0

    def test_condition_lowercased(self):
        listings = scrape_listings(_EBAY_HTML)
        assert listings[0].condition == "used"
        assert listings[1].condition == "new"

    def test_photo_prefers_data_src(self):
        listings = scrape_listings(_EBAY_HTML)
        # Listing 2 has data-src set, src empty
        assert listings[1].photo_urls == ["https://i.ebayimg.com/thumbs/2.jpg"]

    def test_seller_platform_id_set(self):
        listings = scrape_listings(_EBAY_HTML)
        assert listings[0].seller_platform_id == "techguy"
        assert listings[2].seller_platform_id == "new_user_2024"


class TestScrapeSellers:
    def test_extracts_three_sellers(self):
        sellers = scrape_sellers(_EBAY_HTML)
        assert len(sellers) == 3

    def test_feedback_count_and_ratio(self):
        sellers = scrape_sellers(_EBAY_HTML)
        assert sellers["techguy"].feedback_count == 1234
        assert sellers["techguy"].feedback_ratio == pytest.approx(0.991, abs=0.001)

    def test_account_age_is_zero(self):
        """account_age_days is always 0 from scraper — signals partial score."""
        sellers = scrape_sellers(_EBAY_HTML)
        assert all(s.account_age_days == 0 for s in sellers.values())

    def test_category_history_is_empty(self):
        """category_history_json is always '{}' from scraper — signals partial score."""
        sellers = scrape_sellers(_EBAY_HTML)
        assert all(s.category_history_json == "{}" for s in sellers.values())