feat: add scraper adapter with auto-detect fallback and partial score logging

This commit is contained in:
pyr0ball 2026-03-25 14:12:29 -07:00
parent 4977e517fe
commit 68a9879191
5 changed files with 427 additions and 15 deletions

View file

@ -1,4 +1,11 @@
EBAY_CLIENT_ID=your-client-id-here
EBAY_CLIENT_SECRET=your-client-secret-here
EBAY_ENV=production # or: sandbox
# Snipe works out of the box with the scraper (no credentials needed).
# Set EBAY_CLIENT_ID + EBAY_CLIENT_SECRET to unlock full trust scores
# (account age and category history signals require the eBay Browse API).
# Without credentials the app logs a warning and uses the scraper automatically.
# Optional — eBay API credentials (self-hosters / paid CF cloud tier)
# EBAY_CLIENT_ID=your-client-id-here
# EBAY_CLIENT_SECRET=your-client-secret-here
# EBAY_ENV=production # or: sandbox
SNIPE_DB=data/snipe.db

View file

@ -0,0 +1,234 @@
"""Scraper-based eBay adapter — free tier, no API key required.
Data available from search results HTML (single page load):
title, price, condition, photos, URL
seller username, feedback count, feedback ratio
account registration date -> account_age_score = None (score_is_partial)
category history -> category_history_score = None (score_is_partial)
This is the MIT discovery layer. EbayAdapter (paid/CF proxy) unlocks full trust scores.
"""
from __future__ import annotations

import hashlib
import logging
import re
import time
from datetime import datetime, timedelta, timezone
from typing import Optional

import requests
from bs4 import BeautifulSoup

from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
from app.platforms import PlatformAdapter, SearchFilters
# Public eBay search endpoint used for both live and completed-sales queries.
EBAY_SEARCH_URL = "https://www.ebay.com/sch/i.html"

# Browser-like request headers so eBay serves the normal desktop HTML
# instead of a stripped/bot-challenge page.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# 'seller_name (1,234) 99.1% ...' -> (username, feedback count, feedback percent)
_SELLER_RE = re.compile(r"^(.+?)\s+\(([0-9,]+)\)\s+([\d.]+)%")
# First numeric token of a price string (commas are stripped before matching).
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
# Item id is the digit run in an /itm/<digits> URL segment.
_ITEM_ID_RE = re.compile(r"/itm/(\d+)")
# ---------------------------------------------------------------------------
# Pure HTML parsing functions (unit-testable, no HTTP)
# ---------------------------------------------------------------------------
def _parse_price(text: str) -> float:
"""Extract first numeric value from price text.
Handles '$950.00', '$900.00 to $1,050.00', '$1,234.56/ea'.
Takes the lower bound for price ranges (conservative for trust scoring).
"""
m = _PRICE_RE.search(text.replace(",", ""))
return float(m.group()) if m else 0.0
def _parse_seller(text: str) -> tuple[str, int, float]:
"""Parse eBay seller-info text into (username, feedback_count, feedback_ratio).
Input format: 'tech_seller (1,234) 99.1% positive feedback'
Returns ('tech_seller', 1234, 0.991).
Falls back gracefully if the format doesn't match.
"""
text = text.strip()
m = _SELLER_RE.match(text)
if not m:
return (text.split()[0] if text else ""), 0, 0.0
return m.group(1).strip(), int(m.group(2).replace(",", "")), float(m.group(3)) / 100.0
def scrape_listings(html: str) -> list[Listing]:
    """Turn eBay search-results HTML into a list of Listing objects.

    Skips the "Shop on eBay" promo card eBay injects as the first result.
    The listing id is taken from the /itm/<digits> URL segment when present,
    otherwise from a short md5 of the URL. listing_age_days is always 0:
    posting dates are not reliably present in search-results HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    listings: list[Listing] = []
    for card in soup.select("li.s-item"):
        title_node = card.select_one("h3.s-item__title span, div.s-item__title span")
        # eBay injects a ghost "Shop on eBay" promo as the first card — drop it.
        if title_node is None or "Shop on eBay" in title_node.text:
            continue

        anchor = card.select_one("a.s-item__link")
        url = anchor["href"].split("?")[0] if anchor else ""
        id_match = _ITEM_ID_RE.search(url)
        if id_match:
            listing_id = id_match.group(1)
        else:
            listing_id = hashlib.md5(url.encode()).hexdigest()[:12]

        price_node = card.select_one("span.s-item__price")
        condition_node = card.select_one("span.SECONDARY_INFO")
        seller_node = card.select_one("span.s-item__seller-info-text")

        # Thumbnails are lazy-loaded, so data-src is preferred over src.
        image_node = card.select_one("div.s-item__image-wrapper img, .s-item__image img")
        thumb = ""
        if image_node is not None:
            thumb = image_node.get("data-src") or image_node.get("src") or ""

        listings.append(Listing(
            platform="ebay",
            platform_listing_id=listing_id,
            title=title_node.text.strip(),
            price=_parse_price(price_node.text) if price_node else 0.0,
            currency="USD",
            condition=condition_node.text.strip().lower() if condition_node else "",
            seller_platform_id=_parse_seller(seller_node.text)[0] if seller_node else "",
            url=url,
            photo_urls=[thumb] if thumb else [],
            listing_age_days=0,  # not reliably in search HTML
        ))
    return listings
def scrape_sellers(html: str) -> dict[str, Seller]:
    """Collect Seller objects from search-results HTML, keyed by username.

    account_age_days and category_history_json are left empty — filling them
    would require one extra seller-profile fetch per seller. That data gap is
    what separates the free (scraper) tier from the paid (API) tier.
    """
    soup = BeautifulSoup(html, "lxml")
    found: dict[str, Seller] = {}
    for card in soup.select("li.s-item"):
        info_node = card.select_one("span.s-item__seller-info-text")
        if info_node is None:
            continue
        username, feedback_count, feedback_ratio = _parse_seller(info_node.text)
        if not username or username in found:
            continue
        found[username] = Seller(
            platform="ebay",
            platform_seller_id=username,
            username=username,
            account_age_days=0,  # not available from search HTML
            feedback_count=feedback_count,
            feedback_ratio=feedback_ratio,
            category_history_json="{}",  # not available from search HTML
        )
    return found
# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------
class ScrapedEbayAdapter(PlatformAdapter):
    """
    Scraper-based eBay adapter implementing PlatformAdapter with no API key.
    Extracts seller feedback directly from search result cards — no extra
    per-seller page requests. The two unavailable signals (account_age,
    category_history) cause TrustScorer to set score_is_partial=True.
    """

    def __init__(self, store: Store, delay: float = 0.5):
        """store: persistence layer for listings/sellers/comps.
        delay: seconds slept before every request (politeness throttle)."""
        self._store = store
        self._delay = delay
        # One Session reuses connections and keeps headers/cookies stable.
        self._session = requests.Session()
        self._session.headers.update(_HEADERS)

    def _get(self, params: dict) -> str:
        """Fetch one search-results page; raises on HTTP errors/timeouts."""
        time.sleep(self._delay)  # politeness throttle between requests
        resp = self._session.get(EBAY_SEARCH_URL, params=params, timeout=15)
        resp.raise_for_status()
        return resp.text

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Run a live search and return the parsed listings.

        Sellers found on the same page are persisted so get_seller() can
        answer later without an extra fetch.
        """
        # _sop=15: price + shipping lowest first; _ipg=48: results per page.
        params: dict = {"_nkw": query, "_sop": "15", "_ipg": "48"}
        if filters.max_price:
            params["_udhi"] = str(filters.max_price)
        if filters.min_price:
            params["_udlo"] = str(filters.min_price)
        if filters.condition:
            # eBay numeric condition codes for the LH_ItemCondition filter.
            cond_map = {
                "new": "1000", "used": "3000",
                "open box": "2500", "for parts": "7000",
            }
            codes = [cond_map[c] for c in filters.condition if c in cond_map]
            if codes:
                params["LH_ItemCondition"] = "|".join(codes)
        html = self._get(params)
        listings = scrape_listings(html)
        # Cache seller objects extracted from the same page
        for seller in scrape_sellers(html).values():
            self._store.save_seller(seller)
        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Return a cached Seller (pre-populated by search()); no extra fetch."""
        return self._store.get_seller("ebay", seller_platform_id)

    def get_completed_sales(self, query: str) -> list[Listing]:
        """Scrape sold/completed listings and cache a median-price comp.

        Returns [] on a comp-cache hit (the comp is already stored) and on
        any scrape failure. Failures are logged rather than silently
        swallowed so ops can see why a comp is missing.
        """
        query_hash = hashlib.md5(query.encode()).hexdigest()
        if self._store.get_market_comp("ebay", query_hash):
            return []  # cache hit — comp already stored
        params = {
            "_nkw": query,
            "LH_Sold": "1",
            "LH_Complete": "1",
            "_sop": "13",  # price + shipping: lowest first
            "_ipg": "48",
        }
        try:
            html = self._get(params)
            listings = scrape_listings(html)
            prices = sorted(l.price for l in listings if l.price > 0)
            if prices:
                median = prices[len(prices) // 2]
                self._store.save_market_comp(MarketComp(
                    platform="ebay",
                    query_hash=query_hash,
                    median_price=median,
                    sample_count=len(prices),
                    expires_at=(datetime.now(timezone.utc) + timedelta(hours=6)).isoformat(),
                ))
            return listings
        except Exception:
            # Comps are best-effort by design, but a bare `return []` made
            # failures undiagnosable — keep the empty result, log the cause.
            logging.getLogger(__name__).exception(
                "completed-sales scrape failed for query %r", query
            )
            return []

View file

@ -1,30 +1,47 @@
"""Main search + results page."""
from __future__ import annotations
import logging
import os
from pathlib import Path
import streamlit as st
from circuitforge_core.config import load_env
from app.db.store import Store
from app.platforms import SearchFilters
from app.platforms.ebay.auth import EbayTokenManager
from app.platforms.ebay.adapter import EbayAdapter
from app.platforms import PlatformAdapter, SearchFilters
from app.trust import TrustScorer
from app.ui.components.filters import build_filter_options, render_filter_sidebar, FilterState
from app.ui.components.listing_row import render_listing_row
log = logging.getLogger(__name__)
load_env(Path(".env"))
_DB_PATH = Path(os.environ.get("SNIPE_DB", "data/snipe.db"))
_DB_PATH.parent.mkdir(exist_ok=True)
def _get_adapter() -> EbayAdapter:
store = Store(_DB_PATH)
tokens = EbayTokenManager(
client_id=os.environ.get("EBAY_CLIENT_ID", ""),
client_secret=os.environ.get("EBAY_CLIENT_SECRET", ""),
env=os.environ.get("EBAY_ENV", "production"),
def _get_adapter(store: Store) -> PlatformAdapter:
"""Return the best available eBay adapter based on what's configured.
Auto-detects: if EBAY_CLIENT_ID + EBAY_CLIENT_SECRET are present, use the
full API adapter (all 5 trust signals). Otherwise fall back to the scraper
(3/5 signals, score_is_partial=True) and warn to logs so ops can see why
scores are partial without touching the UI.
"""
client_id = os.environ.get("EBAY_CLIENT_ID", "").strip()
client_secret = os.environ.get("EBAY_CLIENT_SECRET", "").strip()
if client_id and client_secret:
from app.platforms.ebay.adapter import EbayAdapter
from app.platforms.ebay.auth import EbayTokenManager
env = os.environ.get("EBAY_ENV", "production")
return EbayAdapter(EbayTokenManager(client_id, client_secret, env), store, env=env)
log.warning(
"EBAY_CLIENT_ID / EBAY_CLIENT_SECRET not set — "
"falling back to scraper (partial trust scores: account_age and "
"category_history signals unavailable). Set API credentials for full scoring."
)
return EbayAdapter(tokens, store, env=os.environ.get("EBAY_ENV", "production"))
from app.platforms.ebay.scraper import ScrapedEbayAdapter
return ScrapedEbayAdapter(store)
def _passes_filter(listing, trust, seller, state: FilterState) -> bool:
@ -68,9 +85,11 @@ def render() -> None:
st.info("Enter a search term and click Search.")
return
store = Store(_DB_PATH)
adapter = _get_adapter(store)
with st.spinner("Fetching listings..."):
try:
adapter = _get_adapter()
filters = SearchFilters(max_price=max_price if max_price > 0 else None)
listings = adapter.search(query, filters)
adapter.get_completed_sales(query) # warm the comps cache
@ -82,7 +101,6 @@ def render() -> None:
st.warning("No listings found.")
return
store = Store(_DB_PATH)
for listing in listings:
store.save_listing(listing)
if listing.seller_platform_id:

View file

@ -14,6 +14,8 @@ dependencies = [
"imagehash>=4.3",
"Pillow>=10.0",
"python-dotenv>=1.0",
"beautifulsoup4>=4.12",
"lxml>=5.0",
]
[tool.setuptools.packages.find]

View file

@ -0,0 +1,151 @@
"""Tests for the scraper-based eBay adapter.
Uses a minimal HTML fixture that mirrors eBay's search results structure.
No HTTP requests are made — all tests operate on the pure parsing functions.
"""
import pytest
from app.platforms.ebay.scraper import scrape_listings, scrape_sellers, _parse_price, _parse_seller
# ---------------------------------------------------------------------------
# Minimal eBay search results HTML fixture
# ---------------------------------------------------------------------------
# Mirrors the structure scrape_listings()/scrape_sellers() select on:
# li.s-item cards with title span, s-item__link, s-item__price,
# SECONDARY_INFO condition, image wrapper, and seller-info-text.
# Card 1 is the injected promo ghost; cards 2-4 cover an established seller,
# a price range with a lazy-loaded data-src image, and a low-feedback seller
# at a suspiciously low price.
_EBAY_HTML = """
<html><body>
<ul class="srp-results">
<!-- eBay injects this ghost item first should be skipped -->
<li class="s-item">
<div class="s-item__title"><span>Shop on eBay</span></div>
<a class="s-item__link" href="https://ebay.com/shop"></a>
</li>
<!-- Real listing 1: established seller, normal price -->
<li class="s-item">
<h3 class="s-item__title"><span>RTX 4090 Founders Edition GPU</span></h3>
<a class="s-item__link" href="https://www.ebay.com/itm/123456789"></a>
<span class="s-item__price">$950.00</span>
<span class="SECONDARY_INFO">Used</span>
<div class="s-item__image-wrapper"><img src="https://i.ebayimg.com/thumbs/1.jpg"/></div>
<span class="s-item__seller-info-text">techguy (1,234) 99.1% positive feedback</span>
</li>
<!-- Real listing 2: price range, new condition -->
<li class="s-item">
<h3 class="s-item__title"><span>RTX 4090 Gaming OC 24GB</span></h3>
<a class="s-item__link" href="https://www.ebay.com/itm/987654321"></a>
<span class="s-item__price">$1,100.00 to $1,200.00</span>
<span class="SECONDARY_INFO">New</span>
<div class="s-item__image-wrapper"><img data-src="https://i.ebayimg.com/thumbs/2.jpg" src=""/></div>
<span class="s-item__seller-info-text">gpu_warehouse (450) 98.7% positive feedback</span>
</li>
<!-- Real listing 3: low feedback seller, suspicious price -->
<li class="s-item">
<h3 class="s-item__title"><span>RTX 4090 BNIB Sealed</span></h3>
<a class="s-item__link" href="https://www.ebay.com/itm/555000111"></a>
<span class="s-item__price">$499.00</span>
<span class="SECONDARY_INFO">New</span>
<div class="s-item__image-wrapper"><img src="https://i.ebayimg.com/thumbs/3.jpg"/></div>
<span class="s-item__seller-info-text">new_user_2024 (2) 100.0% positive feedback</span>
</li>
</ul>
</body></html>
"""
# ---------------------------------------------------------------------------
# Unit tests: pure parsing functions
# ---------------------------------------------------------------------------
class TestParsePrice:
    """_parse_price: first numeric token wins; no digits yields 0.0."""

    @pytest.mark.parametrize(
        ("raw", "expected"),
        [
            ("$950.00", 950.0),               # plain price
            ("$900.00 to $1,050.00", 900.0),  # range -> lower bound
            ("$1,100.00", 1100.0),            # thousands separator
            ("", 0.0),                        # empty input
        ],
    )
    def test_parse_price(self, raw, expected):
        assert _parse_price(raw) == expected
class TestParseSeller:
    """_parse_seller: '(count) pct%' format plus graceful fallback."""

    def test_standard_format(self):
        parsed = _parse_seller("techguy (1,234) 99.1% positive feedback")
        assert parsed[0] == "techguy"
        assert parsed[1] == 1234
        assert parsed[2] == pytest.approx(0.991, abs=0.001)

    def test_low_count(self):
        parsed = _parse_seller("new_user_2024 (2) 100.0% positive feedback")
        assert parsed[0] == "new_user_2024"
        assert parsed[1] == 2
        assert parsed[2] == pytest.approx(1.0, abs=0.001)

    def test_fallback_on_malformed(self):
        # Unrecognized format: first token kept, count and ratio zeroed.
        assert _parse_seller("weirdformat") == ("weirdformat", 0, 0.0)
# ---------------------------------------------------------------------------
# Integration tests: HTML fixture → domain objects
# ---------------------------------------------------------------------------
class TestScrapeListings:
    """scrape_listings() on the fixture: ghost card dropped, three real
    cards mapped onto Listing fields."""

    def test_skips_shop_on_ebay_ghost(self):
        parsed = scrape_listings(_EBAY_HTML)
        assert not any("Shop on eBay" in item.title for item in parsed)

    def test_parses_three_real_listings(self):
        assert len(scrape_listings(_EBAY_HTML)) == 3

    def test_extracts_platform_listing_id_from_url(self):
        ids = [item.platform_listing_id for item in scrape_listings(_EBAY_HTML)]
        assert ids[:2] == ["123456789", "987654321"]

    def test_price_range_takes_lower(self):
        # '$1,100.00 to $1,200.00' parses to its lower bound.
        assert scrape_listings(_EBAY_HTML)[1].price == 1100.0

    def test_condition_lowercased(self):
        conditions = [item.condition for item in scrape_listings(_EBAY_HTML)]
        assert conditions[:2] == ["used", "new"]

    def test_photo_prefers_data_src(self):
        # Listing 2 carries data-src with an empty src attribute.
        second = scrape_listings(_EBAY_HTML)[1]
        assert second.photo_urls == ["https://i.ebayimg.com/thumbs/2.jpg"]

    def test_seller_platform_id_set(self):
        parsed = scrape_listings(_EBAY_HTML)
        assert parsed[0].seller_platform_id == "techguy"
        assert parsed[2].seller_platform_id == "new_user_2024"
class TestScrapeSellers:
    """scrape_sellers() on the fixture: three sellers, partial-score fields."""

    def test_extracts_three_sellers(self):
        assert len(scrape_sellers(_EBAY_HTML)) == 3

    def test_feedback_count_and_ratio(self):
        techguy = scrape_sellers(_EBAY_HTML)["techguy"]
        assert techguy.feedback_count == 1234
        assert techguy.feedback_ratio == pytest.approx(0.991, abs=0.001)

    def test_account_age_is_zero(self):
        """account_age_days is always 0 from scraper — signals partial score."""
        for seller in scrape_sellers(_EBAY_HTML).values():
            assert seller.account_age_days == 0

    def test_category_history_is_empty(self):
        """category_history_json is always '{}' from scraper — signals partial score."""
        for seller in scrape_sellers(_EBAY_HTML).values():
            assert seller.category_history_json == "{}"