snipe/app/platforms/mercari/scraper.py
pyr0ball 15996472b7 feat(mercari): Phase 2 — MercariAdapter with Xvfb stability fixes
Implements full Mercari scraping support for the trust-scoring pipeline:

- `app/platforms/mercari/` — new MercariAdapter (scraper-based), scraper
  (parse_search_html / parse_listing_html), and __init__
- `app/platforms/__init__.py` — adds "mercari" to SUPPORTED_PLATFORMS
- `api/main.py` — platform routing: _make_adapter, OR-group guard, seller
  lookup, BTF/Trading API guards all parameterised by platform
- `web/src/views/SearchView.vue` — enables Mercari tab in platform picker

BrowserPool stability fixes (browser_pool.py):
- Add -ac flag to Xvfb (disables X11 auth requirement in Docker containers)
- Shift display counter from :100-:199 to :200-:399 (avoids ghost kernel
  socket conflicts with low-numbered displays)
- Add wait_for_selector / wait_for_timeout_ms params to fetch_html,
  _fetch_with_slot, _fetch_fresh
- Add time.sleep(0.3) in _fetch_fresh after Xvfb start (was missing)

Mercari scraper fix:
- Remove sortBy=SORT_SCORE from build_search_url — that param is deprecated
  on Mercari and causes an empty 85KB response instead of search results

Probe + debug scripts in scripts/:
- probe_mercari.py — standalone Cloudflare bypass test
- debug_fetch_fresh.py — pool simulation diagnostic

Trust signal coverage: feedback_count, feedback_ratio partial score
(account_age_days, category_history absent = score_is_partial=True).
get_completed_sales stubbed for Phase 3.
Tracks: snipe#53 (pool thread-safety fix, follow-up)
2026-05-03 18:39:25 -07:00

165 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Mercari search + listing page scraper.
Uses the shared eBay browser pool (headed Chromium + Xvfb + playwright-stealth)
which already bypasses Cloudflare Turnstile. Import the pool singleton from
ebay.browser_pool so both platforms share the same warm Chromium instances.
Seller data is NOT available from search results HTML — only from individual
listing pages. The adapter lazily fetches listing pages in get_seller().
"""
from __future__ import annotations
import logging
import re
from typing import Optional
from urllib.parse import urlencode
from bs4 import BeautifulSoup, NavigableString
log = logging.getLogger(__name__)
# Mercari site roots; synthesized listing URLs are {_BASE}{_ITEM_PATH}{product_id}/.
_BASE = "https://www.mercari.com"
_SEARCH_PATH = "/search/"
_ITEM_PATH = "/us/item/"
# Dollar amount with optional thousands separators, e.g. "1,234.56" or "950".
_PRICE_RE = re.compile(r"[\d,]+\.?\d*")
_POSTED_RE = re.compile(r"(\d{2})/(\d{2})/(\d{2,4})") # MM/DD/YY or MM/DD/YYYY
def build_search_url(query: str, max_price: Optional[float] = None, min_price: Optional[float] = None) -> str:
    """Build a Mercari search URL for *query* with optional price bounds.

    Mercari accepts priceMin/priceMax as whole-dollar strings (not cents),
    so float bounds are truncated to ints; a bound that is None or <= 0 is
    omitted. No explicit sortBy is sent — Mercari's default (relevance) is
    the most useful order, and the deprecated "sortBy=SORT_SCORE" value
    returns an empty results page.
    """
    pairs: list[tuple[str, str]] = [("keyword", query)]
    for name, bound in (("priceMin", min_price), ("priceMax", max_price)):
        if bound is not None and bound > 0:
            pairs.append((name, str(int(bound))))
    return f"{_BASE}{_SEARCH_PATH}?{urlencode(pairs)}"
def parse_search_html(html: str) -> list[dict]:
    """Parse Mercari search-results HTML into a list of raw listing dicts.

    Each dict carries: product_id, url, title, price, photo_url, brand,
    is_on_sale. Item containers without a data-productid are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    listings: list[dict] = []
    for container in soup.find_all(attrs={"data-testid": "ItemContainer"}):
        product_id = container.get("data-productid", "")
        if not product_id:
            continue
        # The wrapping <a> (when present) carries the canonical listing href;
        # otherwise synthesize the URL from the product id.
        wrapper = container.parent
        href = wrapper.get("href") if wrapper and wrapper.name == "a" else None
        listing_url = f"{_BASE}{href}" if href else f"{_BASE}{_ITEM_PATH}{product_id}/"
        name_node = container.find(attrs={"data-testid": "ItemName"})
        thumb = container.find("img")
        listings.append(
            {
                "product_id": product_id,
                "url": listing_url,
                "title": name_node.get_text(strip=True) if name_node else "",
                "price": _extract_current_price(container),
                "photo_url": thumb.get("src", "") if thumb else "",
                "brand": container.get("data-brand", ""),
                "is_on_sale": container.get("data-is-on-sale") == "true",
            }
        )
    return listings
def _extract_current_price(item: BeautifulSoup) -> float:
    """Return the current (non-strikethrough) price from an ItemContainer.

    The price element's direct text nodes hold the current price; the
    nested span (deliberately excluded here) holds the struck-through
    original price. Returns 0.0 when no price can be extracted.
    """
    node = item.find(attrs={"data-testid": "ProductThumbItemPrice"})
    if not node:
        return 0.0
    direct_text = "".join(
        str(child) for child in node.children if isinstance(child, NavigableString)
    ).strip()
    match = _PRICE_RE.search(direct_text)
    if match is None:
        return 0.0
    try:
        return float(match.group().replace(",", ""))
    except ValueError:
        return 0.0
def parse_listing_html(html: str, product_id: str) -> dict:
    """Parse a Mercari listing page into a raw seller dict.

    Extracts the seller's username, sales/review counts, star rating,
    item condition, listing age and price from the page's data-testid
    elements; absent elements fall back to empty/zero values.
    """
    soup = BeautifulSoup(html, "html.parser")

    def text_for(testid: str) -> str:
        # Stripped text of the first element carrying this data-testid, or "".
        node = soup.find(attrs={"data-testid": testid})
        return node.get_text(strip=True) if node else ""

    raw_username = text_for("ItemDetailsSellerUserName")

    stars = 0.0
    stars_wrapper = soup.find(attrs={"data-testid": "ReviewStarsWrapper"})
    if stars_wrapper:
        try:
            stars = float(stars_wrapper.get("data-stars", 0))
        except (ValueError, TypeError):
            pass

    price = 0.0
    price_match = _PRICE_RE.search(text_for("ItemPrice").replace(",", ""))
    if price_match:
        try:
            price = float(price_match.group())
        except ValueError:
            pass

    return {
        "product_id": product_id,
        "username": raw_username.lstrip("@"),
        "num_sales": _safe_int(text_for("NumSales")),  # completed sales → maps to feedback_count
        "rating_count": _safe_int(text_for("SellerRatingCount")),  # number of reviews (additional signal)
        "stars": stars,  # 0.0–5.0 → divide by 5 = feedback_ratio
        "condition": text_for("ItemDetailsCondition").lower(),
        "listing_age_days": _parse_listing_age(text_for("ItemDetailsPosted")),
        "price": price,
    }
def _safe_int(text: str) -> int:
    """Best-effort integer from a numeric string like '1,234 sales'.

    Commas are stripped, the first numeric run is parsed (via float, so
    '4.9' truncates to 4), and any failure yields 0.
    """
    match = _PRICE_RE.search(text.replace(",", ""))
    if match is None:
        return 0
    try:
        return int(float(match.group()))
    except ValueError:
        return 0
def _parse_listing_age(posted_text: str) -> int:
"""Convert a posted date like '04/10/26' to days since posted."""
from datetime import datetime, timezone
m = _POSTED_RE.search(posted_text)
if not m:
return 0
try:
month, day, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
if year < 100:
year += 2000
posted = datetime(year, month, day, tzinfo=timezone.utc)
return (datetime.now(timezone.utc) - posted).days
except (ValueError, OverflowError):
return 0