snipe/app/platforms/mercari/adapter.py
pyr0ball 15996472b7 feat(mercari): Phase 2 — MercariAdapter with Xvfb stability fixes
Implements full Mercari scraping support for the trust-scoring pipeline:

- `app/platforms/mercari/` — new MercariAdapter (scraper-based), scraper
  (parse_search_html / parse_listing_html), and __init__
- `app/platforms/__init__.py` — adds "mercari" to SUPPORTED_PLATFORMS
- `api/main.py` — platform routing: _make_adapter, OR-group guard, seller
  lookup, BTF/Trading API guards all parameterised by platform
- `web/src/views/SearchView.vue` — enables Mercari tab in platform picker

BrowserPool stability fixes (browser_pool.py):
- Add -ac flag to Xvfb (disables X11 auth requirement in Docker containers)
- Shift display counter from :100-:199 to :200-:399 (avoids ghost kernel
  socket conflicts with low-numbered displays)
- Add wait_for_selector / wait_for_timeout_ms params to fetch_html,
  _fetch_with_slot, _fetch_fresh
- Add time.sleep(0.3) in _fetch_fresh after Xvfb start (was missing)

Mercari scraper fix:
- Remove sortBy=SORT_SCORE from build_search_url — that param is deprecated
  on Mercari and causes an empty 85KB response instead of search results

Probe + debug scripts in scripts/:
- probe_mercari.py — standalone Cloudflare bypass test
- debug_fetch_fresh.py — pool simulation diagnostic

Trust signal coverage: feedback_count, feedback_ratio partial score
(account_age_days, category_history absent = score_is_partial=True).
get_completed_sales stubbed for Phase 3.
Tracks: snipe#53 (pool thread-safety fix, follow-up)
2026-05-03 18:39:25 -07:00

173 lines
6.2 KiB
Python

"""MercariAdapter — scraper-based Mercari platform adapter.
Trust signal coverage vs eBay:
✅ feedback_count (NumSales from listing page)
✅ feedback_ratio (ReviewStarsWrapper data-stars / 5)
❌ account_age_days (requires seller profile page — future work)
❌ category_history (not exposed in HTML — future work)
✅ price_vs_market (computed by trust scorer from comps, same as eBay)
Because account_age and category_history are always None, TrustScore.score_is_partial
will be True for all Mercari results. The aggregator handles this correctly
by scoring only from available signals.
seller_platform_id on Listing objects holds the product_id (e.g. "m86032668393")
rather than the seller username, because search results don't expose seller identity.
get_seller() resolves the product_id → seller by fetching the listing page.
The DB lookup key is (platform="mercari", platform_seller_id=product_id).
"""
from __future__ import annotations
import json
import logging
import time
from typing import Optional
from app.db.models import Listing, MarketComp, Seller
from app.db.store import Store
from app.platforms import PlatformAdapter, SearchFilters
from app.platforms.mercari.scraper import (
build_search_url,
parse_listing_html,
parse_search_html,
)
log = logging.getLogger(__name__)
# Intended lifetime for cached seller rows before a re-fetch is warranted.
# NOTE(review): this constant is not referenced anywhere in this module —
# confirm whether Store.get_seller applies the TTL; otherwise cached sellers
# never expire.
_SELLER_CACHE_TTL_HOURS = 6
# Pause (seconds) before each listing-page fetch in get_seller(), spacing out
# requests to the Mercari site.
_BETWEEN_LISTING_FETCH_SECS = 1.5
class MercariAdapter(PlatformAdapter):
    """Scraper-backed adapter exposing Mercari through the PlatformAdapter API.

    Search results are fetched through the shared browser pool (reused from
    the eBay integration) and normalised into the project's Listing/Seller
    models. See the module docstring for trust-signal coverage notes.
    """

    def __init__(self, store: Store) -> None:
        self._store = store

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Run a Mercari search and return normalised, keyword-filtered listings."""
        # Imported lazily to avoid pulling browser machinery in at import time.
        from app.platforms.ebay.browser_pool import get_pool

        search_url = build_search_url(query, filters.max_price, filters.min_price)
        log.info("mercari: fetching search URL: %s", search_url)
        page_html = get_pool().fetch_html(
            search_url,
            delay=1.0,
            wait_for_timeout_ms=8000,
        )
        # Deduplicate by product_id while preserving first-seen order.
        unique: dict[str, dict] = {}
        for raw in parse_search_html(page_html):
            unique.setdefault(raw["product_id"], raw)
        listings = [_normalise_listing(raw, query) for raw in unique.values()]
        log.info("mercari: parsed %d listings for %r", len(listings), query)
        # Client-side keyword filter (mirrors eBay scraper behaviour).
        if filters.must_include:
            listings = _apply_keyword_filter(listings, filters.must_include, filters.must_include_mode)
        if filters.must_exclude:
            listings = _apply_exclude_filter(listings, filters.must_exclude)
        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Fetch seller data from the listing page identified by seller_platform_id.

        For Mercari, seller_platform_id is the product_id (e.g. "m86032668393")
        because seller usernames aren't available from search results HTML.
        Returns None on any fetch/parse failure (best-effort, logged).
        """
        # NOTE(review): _SELLER_CACHE_TTL_HOURS is never applied here — confirm
        # whether Store.get_seller enforces expiry itself.
        cached = self._store.get_seller("mercari", seller_platform_id)
        if cached:
            return cached

        from app.platforms.ebay.browser_pool import get_pool

        listing_url = f"https://www.mercari.com/us/item/{seller_platform_id}/"
        try:
            # Space out listing-page fetches before hitting the site again.
            time.sleep(_BETWEEN_LISTING_FETCH_SECS)
            page_html = get_pool().fetch_html(
                listing_url,
                delay=0.5,
                wait_for_timeout_ms=6000,
            )
            parsed = parse_listing_html(page_html, seller_platform_id)
            seller = _normalise_seller(parsed)
            self._store.save_seller(seller)
            return seller
        except Exception as exc:
            log.warning("mercari: get_seller failed for %s: %s", seller_platform_id, exc)
            return None

    def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
        """Mercari sold-listing comps — stubbed for Phase 3.

        Mercari exposes sold listings via ?status=ITEM_STATUS_TRADING but the
        data is sparse. Phase 3 will implement comp extraction here; for now
        the trust scorer falls back to price_vs_market=None (partial score).
        """
        return []
# ---------------------------------------------------------------------------
# Normalisation helpers
# ---------------------------------------------------------------------------
def _normalise_listing(raw: dict, query: str) -> Listing:
    """Map a raw scraper dict onto the shared Listing model.

    `query` is accepted for signature parity with other adapters but is not
    used when building the Listing.
    """
    photo = raw.get("photo_url")
    return Listing(
        platform="mercari",
        platform_listing_id=raw["product_id"],
        title=raw["title"],
        price=raw["price"],
        currency="USD",
        condition="",  # not available from search results; get_seller() populates this
        seller_platform_id=raw["product_id"],  # see module docstring
        url=raw["url"],
        photo_urls=[photo] if photo else [],
        listing_age_days=0,
        buying_format="fixed_price",
        category_name=None,
    )
def _normalise_seller(raw: dict) -> Seller:
    """Map a raw listing-page dict onto the shared Seller model.

    Mercari exposes a 0-5 star rating; it is normalised to a 0-1 ratio
    (clamped at 1.0). Signals unavailable without the seller profile page
    stay None/empty, which keeps the trust score partial (see module docstring).
    """
    stars = raw.get("stars", 0.0)
    if stars > 0:
        feedback_ratio = min(stars / 5.0, 1.0)
    else:
        feedback_ratio = 0.0
    return Seller(
        platform="mercari",
        platform_seller_id=raw["product_id"],
        username=raw.get("username", ""),
        account_age_days=None,  # not available without seller profile page
        feedback_count=raw.get("num_sales", 0),
        feedback_ratio=feedback_ratio,
        category_history_json=json.dumps({}),
    )
def _apply_keyword_filter(listings: list[Listing], must_include: list[str], mode: str) -> list[Listing]:
if not must_include:
return listings
def _matches(listing: Listing) -> bool:
title = listing.title.lower()
if mode == "any":
return any(kw.lower() in title for kw in must_include)
# "all" (default) and "groups" both require all terms present
return all(kw.lower() in title for kw in must_include)
return [l for l in listings if _matches(l)]
def _apply_exclude_filter(listings: list[Listing], must_exclude: list[str]) -> list[Listing]:
if not must_exclude:
return listings
def _clean(listing: Listing) -> bool:
title = listing.title.lower()
return not any(term.lower() in title for term in must_exclude)
return [l for l in listings if _clean(l)]