snipe/app/platforms/ebay/browser_pool.py
pyr0ball 15996472b7 feat(mercari): Phase 2 — MercariAdapter with Xvfb stability fixes
Implements full Mercari scraping support for the trust-scoring pipeline:

- `app/platforms/mercari/` — new MercariAdapter (scraper-based), scraper
  (parse_search_html / parse_listing_html), and __init__
- `app/platforms/__init__.py` — adds "mercari" to SUPPORTED_PLATFORMS
- `api/main.py` — platform routing: _make_adapter, OR-group guard, seller
  lookup, BTF/Trading API guards all parameterised by platform
- `web/src/views/SearchView.vue` — enables Mercari tab in platform picker

BrowserPool stability fixes (browser_pool.py):
- Add -ac flag to Xvfb (disables X11 auth requirement in Docker containers)
- Shift display counter from :100-:199 to :200-:399 (avoids ghost kernel
  socket conflicts with low-numbered displays)
- Add wait_for_selector / wait_for_timeout_ms params to fetch_html,
  _fetch_with_slot, _fetch_fresh
- Add time.sleep(0.3) in _fetch_fresh after Xvfb start (was missing)

Mercari scraper fix:
- Remove sortBy=SORT_SCORE from build_search_url — that param is deprecated
  on Mercari and causes an empty 85KB response instead of search results

Probe + debug scripts in scripts/:
- probe_mercari.py — standalone Cloudflare bypass test
- debug_fetch_fresh.py — pool simulation diagnostic

Trust signal coverage: feedback_count, feedback_ratio partial score
(account_age_days, category_history absent = score_is_partial=True).
get_completed_sales stubbed for Phase 3.
Tracks: snipe#53 (pool thread-safety fix, follow-up)
2026-05-03 18:39:25 -07:00

444 lines
15 KiB
Python

"""Pre-warmed Chromium browser pool for the eBay scraper.
Eliminates cold-start latency (5-10s per call) by keeping a small pool of
long-lived Playwright browser instances with fresh contexts ready to serve.
Key design:
- Pool slots: ``(xvfb_proc, pw_instance, browser, context, display_num, last_used_ts)``
One headed Chromium browser per slot — keeps the Kasada fingerprint clean.
- Display numbering: :200-:399 (avoids host :0 and low-numbered kernel socket conflicts).
- Thread safety: ``queue.Queue`` with blocking get (timeout=3s before fresh fallback).
- Replenishment: after each use, the dirty context is closed and a new context is
opened on the *same* browser, then returned to the queue. Browser launch overhead
is only paid at startup and during idle-cleanup replenishment.
- Idle cleanup: daemon thread closes slots idle for >5 minutes to avoid memory leaks
when the service is quiet.
- Graceful degradation: if Playwright / Xvfb is unavailable (host-side test env),
``fetch_html`` falls back to launching a fresh browser per call — same behavior
as before this module existed.
Pool size is controlled via ``BROWSER_POOL_SIZE`` env var (default: 2).
"""
from __future__ import annotations
import itertools
import logging
import os
import queue
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Optional
log = logging.getLogger(__name__)

# Display counter shared by pool warmup and _fetch_fresh fallback.
# Range :200-:399 avoids low-numbered displays that may be pre-occupied by
# the host X server or lingering kernel sockets from previous runs.
_pool_display_counter = itertools.cycle(range(200, 400))

# A slot unused for this long is closed by the idle-cleanup daemon.
_IDLE_TIMEOUT_SECS = 300 # 5 minutes
# How often the idle-cleanup daemon wakes up to scan the queue.
_CLEANUP_INTERVAL_SECS = 60
# How long fetch_html blocks waiting for a pooled slot before falling back
# to launching a fresh browser for that call.
_QUEUE_TIMEOUT_SECS = 3.0
# Flags required to run Chromium inside a container (no user namespaces,
# /dev/shm often too small in Docker).
_CHROMIUM_ARGS = ["--no-sandbox", "--disable-dev-shm-usage"]
_XVFB_ARGS = ["-screen", "0", "1280x800x24", "-ac"] # -ac: disable X auth (safe in isolated Docker)
# Fixed desktop-Linux Chrome UA, paired with the matching viewport below so
# every pooled context presents the same fingerprint.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
_VIEWPORT = {"width": 1280, "height": 800}
@dataclass
class _PooledBrowser:
    """One slot in the browser pool.

    Bundles the OS/Playwright resources that make up a single warm browser;
    created by ``_launch_slot`` and recycled by ``_replenish_slot``.
    """

    xvfb: subprocess.Popen # Xvfb server process backing this slot's virtual display
    pw: object # playwright instance (sync_playwright().start())
    browser: object # playwright Browser (long-lived, headed Chromium)
    ctx: object # playwright BrowserContext (fresh per use)
    display_num: int # X display number, e.g. 201 for ":201"
    last_used_ts: float = field(default_factory=time.time) # read by idle cleanup
def _launch_slot() -> "_PooledBrowser":
    """Launch a new Xvfb display + headed Chromium browser + fresh context.

    Returns:
        A ready-to-use ``_PooledBrowser`` slot.

    Raises:
        Exception: on any launch failure. The Xvfb process (and, if started,
        the Playwright instance) is always torn down before re-raising, so
        error paths do not leak X servers.
    """
    from playwright.sync_api import sync_playwright
    from playwright_stealth import Stealth # noqa: F401 — imported here to confirm availability

    display_num = next(_pool_display_counter)
    display = f":{display_num}"
    env = os.environ.copy()
    env["DISPLAY"] = display
    xvfb = subprocess.Popen(
        ["Xvfb", display] + _XVFB_ARGS,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    pw = None
    try:
        # Small grace period for Xvfb to bind the display socket.
        time.sleep(0.3)
        if xvfb.poll() is not None:
            # Xvfb exited immediately (display in use, bad args, ...) —
            # fail fast instead of launching Chromium against a dead display.
            raise RuntimeError(
                f"Xvfb for display {display} exited with code {xvfb.returncode}"
            )
        # BUG FIX: previously pw was started *before* the try block, so a
        # failure inside sync_playwright().start() leaked the Xvfb process.
        pw = sync_playwright().start()
        browser = pw.chromium.launch(
            headless=False, # headed mode keeps the Kasada fingerprint clean
            env=env,
            args=_CHROMIUM_ARGS,
        )
        ctx = browser.new_context(
            user_agent=_USER_AGENT,
            viewport=_VIEWPORT,
        )
    except Exception:
        if pw is not None:
            try:
                pw.stop()
            except Exception:
                pass # best-effort: still tear down Xvfb below
        xvfb.terminate()
        xvfb.wait()
        raise
    return _PooledBrowser(
        xvfb=xvfb,
        pw=pw,
        browser=browser,
        ctx=ctx,
        display_num=display_num,
        last_used_ts=time.time(),
    )
def _close_slot(slot: _PooledBrowser) -> None:
"""Cleanly close a pool slot: context → browser → Playwright → Xvfb."""
try:
slot.ctx.close()
except Exception:
pass
try:
slot.browser.close()
except Exception:
pass
try:
slot.pw.stop()
except Exception:
pass
try:
slot.xvfb.terminate()
slot.xvfb.wait(timeout=5)
except Exception:
pass
def _replenish_slot(slot: _PooledBrowser) -> _PooledBrowser:
    """Swap the used context on *slot* for a brand-new one.

    The expensive resources (Xvfb display, Playwright instance, browser) are
    carried over unchanged; only the BrowserContext is recycled. This keeps
    the per-fetch cost at context creation instead of a full browser launch.
    The dirty context is closed best-effort before the replacement is made.
    """
    try:
        slot.ctx.close()
    except Exception:
        pass # dirty context may already be dead; replacing it regardless
    replacement_ctx = slot.browser.new_context(
        user_agent=_USER_AGENT,
        viewport=_VIEWPORT,
    )
    refreshed = _PooledBrowser(
        xvfb=slot.xvfb, pw=slot.pw, browser=slot.browser,
        ctx=replacement_ctx, display_num=slot.display_num,
        last_used_ts=time.time(),
    )
    return refreshed
class BrowserPool:
    """Thread-safe pool of pre-warmed Playwright browser contexts.

    Slots live in a ``queue.Queue``; ``fetch_html`` borrows one (blocking
    get with timeout), uses it, and returns a replenished slot. All browser
    work runs in the calling thread — the pool only manages slot lifetime.
    """

    def __init__(self, size: int = 2) -> None:
        # Target slot count; the live count can be lower after pre-warm
        # failures or idle cleanup.
        self._size = size
        # Ready-to-use slots; queue.Queue supplies the thread safety.
        self._q: queue.Queue[_PooledBrowser] = queue.Queue()
        # Guards lifecycle flags (_started/_stopped), not individual fetches.
        self._lock = threading.Lock()
        self._started = False
        self._stopped = False
        self._playwright_available: Optional[bool] = None # cached after first check

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def start(self) -> None:
        """Pre-warm N browser slots in background threads.
        Non-blocking: returns immediately; slots appear in the queue as they
        finish launching. Safe to call multiple times (no-op after first).
        """
        with self._lock:
            # Idempotency guard: only the first caller proceeds past here.
            if self._started:
                return
            self._started = True
        if not self._check_playwright():
            log.warning(
                "BrowserPool: Playwright / Xvfb not available — "
                "pool disabled, falling back to per-call fresh browser."
            )
            return

        def _warm_one(_: int) -> None:
            # Launch one slot; failures are logged rather than raised so a
            # bad slot only shrinks the pool instead of aborting startup.
            try:
                slot = _launch_slot()
                self._q.put(slot)
                log.debug("BrowserPool: slot :%d ready", slot.display_num)
            except Exception as exc:
                log.warning("BrowserPool: pre-warm failed: %s", exc)

        with ThreadPoolExecutor(max_workers=self._size) as ex:
            futures = [ex.submit(_warm_one, i) for i in range(self._size)]
            # Don't wait — executor exits after submitting, threads continue.
            # Actually ThreadPoolExecutor.__exit__ waits for completion, which
            # is fine: pre-warming completes in background relative to FastAPI
            # startup because this whole method is called from a thread.
            for f in as_completed(futures):
                pass # propagate exceptions via logging, not raises
        # Daemon thread reaps slots left idle too long (see _idle_cleanup_loop).
        _idle_cleaner = threading.Thread(
            target=self._idle_cleanup_loop, daemon=True, name="browser-pool-idle-cleaner"
        )
        _idle_cleaner.start()
        log.info("BrowserPool: started with %d slots", self._q.qsize())

    def stop(self) -> None:
        """Drain and close all pool slots. Called at FastAPI shutdown."""
        with self._lock:
            # Also signals the idle-cleanup daemon to exit on its next wake-up.
            self._stopped = True
        closed = 0
        # Drain until empty; get_nowait avoids blocking if another thread races us.
        while True:
            try:
                slot = self._q.get_nowait()
                _close_slot(slot)
                closed += 1
            except queue.Empty:
                break
        log.info("BrowserPool: stopped, closed %d slot(s)", closed)

    # ------------------------------------------------------------------
    # Core fetch
    # ------------------------------------------------------------------
    def fetch_html(
        self,
        url: str,
        delay: float = 1.0,
        wait_for_selector: Optional[str] = None,
        wait_for_timeout_ms: int = 2000,
    ) -> str:
        """Navigate to *url* and return the rendered HTML.
        Borrows a browser context from the pool (blocks up to 3s), uses it to
        fetch the page, then replenishes the slot with a fresh context.
        Falls back to a fully fresh browser if the pool is empty after the
        timeout or if Playwright is unavailable.
        Args:
            delay: politeness sleep in seconds applied before every fetch,
                pooled or fresh.
            wait_for_selector: CSS/data-testid selector to wait for before capturing
                HTML (e.g. ``"[data-testid='SearchResults']"``). When set, the fixed
                *wait_for_timeout_ms* sleep is skipped — the page is captured as soon
                as the selector appears (or after 15s timeout, whichever comes first).
            wait_for_timeout_ms: static post-navigation sleep in ms when
                *wait_for_selector* is None. Default 2000; set higher (e.g. 8000)
                for sites with JS challenge pages (Cloudflare Turnstile).
        """
        time.sleep(delay)
        slot: Optional[_PooledBrowser] = None
        try:
            slot = self._q.get(timeout=_QUEUE_TIMEOUT_SECS)
        except queue.Empty:
            log.debug("BrowserPool: pool empty after %.1fs — using fresh browser", _QUEUE_TIMEOUT_SECS)
        if slot is not None:
            try:
                html = self._fetch_with_slot(
                    slot, url,
                    wait_for_selector=wait_for_selector,
                    wait_for_timeout_ms=wait_for_timeout_ms,
                )
                # Replenish: close dirty context, open fresh one, return to queue.
                try:
                    fresh_slot = _replenish_slot(slot)
                    self._q.put(fresh_slot)
                except Exception as exc:
                    # Replenish failure discards the slot but does NOT fail the
                    # fetch — we already have the HTML in hand.
                    log.warning("BrowserPool: replenish failed, slot discarded: %s", exc)
                    _close_slot(slot)
                return html
            except Exception as exc:
                log.warning("BrowserPool: pooled fetch failed (%s) — closing slot", exc)
                _close_slot(slot)
                # Fall through to fresh browser below.
        # Fallback: fresh browser (same code as old scraper._fetch_url).
        return self._fetch_fresh(
            url,
            wait_for_selector=wait_for_selector,
            wait_for_timeout_ms=wait_for_timeout_ms,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _check_playwright(self) -> bool:
        """Return True if Playwright and Xvfb are importable/runnable.

        NOTE(review): only the Python imports are probed; the Xvfb binary
        itself is not checked — presumably its presence is guaranteed by the
        Docker image. Confirm, or an Xvfb-less host will fail at fetch time.
        """
        if self._playwright_available is not None:
            return self._playwright_available
        try:
            import playwright # noqa: F401
            from playwright_stealth import Stealth # noqa: F401
            self._playwright_available = True
        except ImportError:
            self._playwright_available = False
        return self._playwright_available

    def _fetch_with_slot(
        self,
        slot: _PooledBrowser,
        url: str,
        wait_for_selector: Optional[str] = None,
        wait_for_timeout_ms: int = 2000,
    ) -> str:
        """Open a new page on *slot.ctx*, navigate to *url*, return HTML."""
        from playwright_stealth import Stealth
        page = slot.ctx.new_page()
        try:
            # Apply stealth patches before navigation so the target page's
            # fingerprinting scripts never see the unpatched environment.
            Stealth().apply_stealth_sync(page)
            page.goto(url, wait_until="domcontentloaded", timeout=30_000)
            if wait_for_selector:
                try:
                    page.wait_for_selector(wait_for_selector, timeout=15_000)
                except Exception:
                    pass # selector didn't appear; return whatever loaded
            else:
                page.wait_for_timeout(wait_for_timeout_ms)
            return page.content()
        finally:
            # Always close the page; the context itself is recycled by the caller.
            try:
                page.close()
            except Exception:
                pass

    def _fetch_fresh(
        self,
        url: str,
        wait_for_selector: Optional[str] = None,
        wait_for_timeout_ms: int = 2000,
    ) -> str:
        """Launch a fully fresh browser, fetch *url*, close everything.

        Fallback path for an empty/unavailable pool; mirrors the pooled
        fetch behaviour but pays the full Xvfb + browser launch cost.
        """
        import subprocess as _subprocess
        try:
            from playwright.sync_api import sync_playwright
            from playwright_stealth import Stealth
        except ImportError as exc:
            raise RuntimeError(
                "Playwright not installed — cannot fetch pages. "
                "Install playwright and playwright-stealth in the Docker image."
            ) from exc
        display_num = next(_pool_display_counter)
        display = f":{display_num}"
        env = os.environ.copy()
        env["DISPLAY"] = display
        xvfb = _subprocess.Popen(
            ["Xvfb", display] + _XVFB_ARGS,
            stdout=_subprocess.DEVNULL,
            stderr=_subprocess.DEVNULL,
        )
        time.sleep(0.3) # wait for Xvfb to bind the display socket before Chromium starts
        try:
            with sync_playwright() as pw:
                browser = pw.chromium.launch(
                    headless=False,
                    env=env,
                    args=_CHROMIUM_ARGS,
                )
                ctx = browser.new_context(
                    user_agent=_USER_AGENT,
                    viewport=_VIEWPORT,
                )
                page = ctx.new_page()
                Stealth().apply_stealth_sync(page)
                page.goto(url, wait_until="domcontentloaded", timeout=30_000)
                if wait_for_selector:
                    try:
                        page.wait_for_selector(wait_for_selector, timeout=15_000)
                    except Exception:
                        pass # selector didn't appear; return whatever loaded
                else:
                    page.wait_for_timeout(wait_for_timeout_ms)
                html = page.content()
                browser.close()
        finally:
            # Xvfb is not managed by the `with` block — tear it down unconditionally.
            xvfb.terminate()
            xvfb.wait()
        return html

    def _idle_cleanup_loop(self) -> None:
        """Daemon thread: drain slots idle for >5 minutes every 60 seconds."""
        while not self._stopped:
            time.sleep(_CLEANUP_INTERVAL_SECS)
            if self._stopped:
                break
            now = time.time()
            idle_cutoff = now - _IDLE_TIMEOUT_SECS
            # Drain the entire queue, keep non-idle slots, close idle ones.
            # NOTE(review): slots currently checked out by a concurrent
            # fetch_html are not in the queue and are never reaped here —
            # their liveness depends on fetch_html returning them.
            kept: list[_PooledBrowser] = []
            closed = 0
            while True:
                try:
                    slot = self._q.get_nowait()
                except queue.Empty:
                    break
                if slot.last_used_ts < idle_cutoff:
                    _close_slot(slot)
                    closed += 1
                else:
                    kept.append(slot)
            for slot in kept:
                self._q.put(slot)
            if closed:
                log.info("BrowserPool: idle cleanup closed %d slot(s)", closed)
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
_pool: Optional[BrowserPool] = None
_pool_lock = threading.Lock()


def get_pool() -> BrowserPool:
    """Return the module-level BrowserPool singleton (creates it if needed).

    Pool size is read from ``BROWSER_POOL_SIZE`` env var (default: 2).
    Call ``get_pool().start()`` at FastAPI startup to pre-warm slots.
    """
    global _pool
    # Fast path: already constructed — skip the lock entirely.
    if _pool is not None:
        return _pool
    with _pool_lock:
        # Double-checked: another thread may have built it while we waited.
        if _pool is None:
            pool_size = int(os.environ.get("BROWSER_POOL_SIZE", "2"))
            _pool = BrowserPool(pool_size)
    return _pool