Implements full Mercari scraping support for the trust-scoring pipeline: - `app/platforms/mercari/` — new MercariAdapter (scraper-based), scraper (parse_search_html / parse_listing_html), and __init__ - `app/platforms/__init__.py` — adds "mercari" to SUPPORTED_PLATFORMS - `api/main.py` — platform routing: _make_adapter, OR-group guard, seller lookup, BTF/Trading API guards all parameterised by platform - `web/src/views/SearchView.vue` — enables Mercari tab in platform picker BrowserPool stability fixes (browser_pool.py): - Add -ac flag to Xvfb (disables X11 auth requirement in Docker containers) - Shift display counter from :100-:199 to :200-:399 (avoids ghost kernel socket conflicts with low-numbered displays) - Add wait_for_selector / wait_for_timeout_ms params to fetch_html, _fetch_with_slot, _fetch_fresh - Add time.sleep(0.3) in _fetch_fresh after Xvfb start (was missing) Mercari scraper fix: - Remove sortBy=SORT_SCORE from build_search_url — that param is deprecated on Mercari and causes an empty 85KB response instead of search results Probe + debug scripts in scripts/: - probe_mercari.py — standalone Cloudflare bypass test - debug_fetch_fresh.py — pool simulation diagnostic Trust signal coverage: feedback_count, feedback_ratio partial score (account_age_days, category_history absent = score_is_partial=True). get_completed_sales stubbed for Phase 3. Tracks: snipe#53 (pool thread-safety fix, follow-up)
113 lines
3.6 KiB
Python
"""One-shot Mercari probe using the same headed Chromium + Xvfb + stealth stack
|
|
as the eBay scraper. Run inside the snipe-api container:
|
|
|
|
docker exec -it snipe-api-1 python /app/scripts/probe_mercari.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import itertools
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
_display_counter = itertools.count(200)
|
|
_CHROMIUM_ARGS = ["--no-sandbox", "--disable-dev-shm-usage"]
|
|
_USER_AGENT = (
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
)
|
|
|
|
SEARCH_URL = "https://www.mercari.com/search/?keyword=rtx+4090"
|
|
# Give Cloudflare challenge time to resolve (if it does)
|
|
WAIT_MS = 8_000
|
|
|
|
|
|
def probe(url: str) -> str:
|
|
from playwright.sync_api import sync_playwright
|
|
from playwright_stealth import Stealth
|
|
|
|
display_num = next(_display_counter)
|
|
display = f":{display_num}"
|
|
env = os.environ.copy()
|
|
env["DISPLAY"] = display
|
|
|
|
xvfb = subprocess.Popen(
|
|
["Xvfb", display, "-screen", "0", "1280x800x24"],
|
|
stdout=subprocess.DEVNULL,
|
|
stderr=subprocess.DEVNULL,
|
|
)
|
|
time.sleep(0.5)
|
|
|
|
try:
|
|
with sync_playwright() as pw:
|
|
browser = pw.chromium.launch(
|
|
headless=False,
|
|
env=env,
|
|
args=_CHROMIUM_ARGS,
|
|
)
|
|
ctx = browser.new_context(
|
|
user_agent=_USER_AGENT,
|
|
viewport={"width": 1280, "height": 800},
|
|
)
|
|
page = ctx.new_page()
|
|
Stealth().apply_stealth_sync(page)
|
|
print(f"[probe] Navigating to {url} …", flush=True)
|
|
response = page.goto(url, wait_until="domcontentloaded", timeout=40_000)
|
|
print(f"[probe] HTTP status: {response.status if response else 'unknown'}", flush=True)
|
|
print(f"[probe] Waiting {WAIT_MS}ms for JS / Turnstile …", flush=True)
|
|
page.wait_for_timeout(WAIT_MS)
|
|
html = page.content()
|
|
title = page.title()
|
|
print(f"[probe] Page title: {title!r}", flush=True)
|
|
browser.close()
|
|
finally:
|
|
xvfb.terminate()
|
|
xvfb.wait()
|
|
|
|
return html
|
|
|
|
|
|
def analyse(html: str) -> None:
|
|
from bs4 import BeautifulSoup
|
|
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
|
|
# Cloudflare challenge indicators
|
|
if "Just a moment" in html or "cf-challenge" in html or "turnstile" in html.lower():
|
|
print("[result] BLOCKED — Cloudflare Turnstile still active")
|
|
return
|
|
|
|
print("[result] Cloudflare challenge NOT detected — page appears to have loaded")
|
|
|
|
# Try to find listing cards
|
|
# Mercari US uses data-testid or item cards in the DOM
|
|
candidates = [
|
|
soup.select("[data-testid='ItemCell']"),
|
|
soup.select("[data-testid='item-cell']"),
|
|
soup.select("li[data-testid]"),
|
|
soup.select(".merList .merListItem"),
|
|
soup.select("[class*='ItemCell']"),
|
|
soup.select("[class*='item-cell']"),
|
|
]
|
|
for sel_result in candidates:
|
|
if sel_result:
|
|
print(f"[result] Found {len(sel_result)} listing card(s) via selector")
|
|
card = sel_result[0]
|
|
print(f"[result] First card snippet:\n{card.prettify()[:800]}")
|
|
return
|
|
|
|
# Fallback: show body text summary
|
|
body = soup.find("body")
|
|
text = body.get_text(separator=" ", strip=True)[:500] if body else html[:500]
|
|
print(f"[result] No listing cards found. Body text preview:\n{text}")
|
|
# Save full HTML for manual inspection
|
|
out = "/tmp/mercari_probe.html"
|
|
with open(out, "w") as fh:
|
|
fh.write(html)
|
|
print(f"[result] Full HTML saved to {out}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
html = probe(SEARCH_URL)
|
|
analyse(html)
|