Implements full Mercari scraping support for the trust-scoring pipeline: - `app/platforms/mercari/` — new MercariAdapter (scraper-based), scraper (parse_search_html / parse_listing_html), and __init__ - `app/platforms/__init__.py` — adds "mercari" to SUPPORTED_PLATFORMS - `api/main.py` — platform routing: _make_adapter, OR-group guard, seller lookup, BTF/Trading API guards all parameterised by platform - `web/src/views/SearchView.vue` — enables Mercari tab in platform picker BrowserPool stability fixes (browser_pool.py): - Add -ac flag to Xvfb (disables X11 auth requirement in Docker containers) - Shift display counter from :100-:199 to :200-:399 (avoids ghost kernel socket conflicts with low-numbered displays) - Add wait_for_selector / wait_for_timeout_ms params to fetch_html, _fetch_with_slot, _fetch_fresh - Add time.sleep(0.3) in _fetch_fresh after Xvfb start (was missing) Mercari scraper fix: - Remove sortBy=SORT_SCORE from build_search_url — that param is deprecated on Mercari and causes an empty 85KB response instead of search results Probe + debug scripts in scripts/: - probe_mercari.py — standalone Cloudflare bypass test - debug_fetch_fresh.py — pool simulation diagnostic Trust signal coverage: feedback_count, feedback_ratio partial score (account_age_days, category_history absent = score_is_partial=True). get_completed_sales stubbed for Phase 3. Tracks: snipe#53 (pool thread-safety fix, follow-up)
173 lines
6.2 KiB
Python
"""MercariAdapter — scraper-based Mercari platform adapter.
|
|
|
|
Trust signal coverage vs eBay:
|
|
✅ feedback_count (NumSales from listing page)
|
|
✅ feedback_ratio (ReviewStarsWrapper data-stars / 5)
|
|
❌ account_age_days (requires seller profile page — future work)
|
|
❌ category_history (not exposed in HTML — future work)
|
|
✅ price_vs_market (computed by trust scorer from comps, same as eBay)
|
|
|
|
Because account_age and category_history are always None, TrustScore.score_is_partial
|
|
will be True for all Mercari results. The aggregator handles this correctly
|
|
by scoring only from available signals.
|
|
|
|
seller_platform_id on Listing objects holds the product_id (e.g. "m86032668393")
|
|
rather than the seller username, because search results don't expose seller identity.
|
|
get_seller() resolves the product_id → seller by fetching the listing page.
|
|
The DB lookup key is (platform="mercari", platform_seller_id=product_id).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
from app.db.models import Listing, MarketComp, Seller
|
|
from app.db.store import Store
|
|
from app.platforms import PlatformAdapter, SearchFilters
|
|
from app.platforms.mercari.scraper import (
|
|
build_search_url,
|
|
parse_listing_html,
|
|
parse_search_html,
|
|
)
|
|
|
|
# Module-level logger, namespaced to this module's import path.
log = logging.getLogger(__name__)

# Cached-seller freshness window, in hours.
# NOTE(review): not referenced anywhere in this module — presumably consumed
# by the store layer's cache logic; confirm before relying on it.
_SELLER_CACHE_TTL_HOURS = 6

# Politeness delay (seconds) applied before each listing-page fetch in
# get_seller(), to avoid hammering Mercari with back-to-back requests.
_BETWEEN_LISTING_FETCH_SECS = 1.5
|
|
|
|
|
|
class MercariAdapter(PlatformAdapter):
    """Scraper-backed PlatformAdapter for Mercari (see module docstring)."""

    def __init__(self, store: Store) -> None:
        self._store = store

    def search(self, query: str, filters: SearchFilters) -> list[Listing]:
        """Run one Mercari search and return normalised, de-duplicated listings."""
        # Imported lazily so the browser pool only spins up when actually used.
        from app.platforms.ebay.browser_pool import get_pool

        url = build_search_url(query, filters.max_price, filters.min_price)
        log.info("mercari: fetching search URL: %s", url)

        html = get_pool().fetch_html(
            url,
            delay=1.0,
            wait_for_timeout_ms=8000,
        )

        # De-duplicate by product_id while preserving first-seen order.
        listings: list[Listing] = []
        seen_ids: set[str] = set()
        for item in parse_search_html(html):
            product_id = item["product_id"]
            if product_id not in seen_ids:
                seen_ids.add(product_id)
                listings.append(_normalise_listing(item, query))

        log.info("mercari: parsed %d listings for %r", len(listings), query)

        # Client-side keyword filter (mirrors eBay scraper behaviour).
        if filters.must_include:
            listings = _apply_keyword_filter(listings, filters.must_include, filters.must_include_mode)
        if filters.must_exclude:
            listings = _apply_exclude_filter(listings, filters.must_exclude)

        return listings

    def get_seller(self, seller_platform_id: str) -> Optional[Seller]:
        """Fetch seller data from the listing page identified by seller_platform_id.

        For Mercari, seller_platform_id is the product_id (e.g. "m86032668393")
        because seller usernames aren't available from search results HTML.
        Returns the cached record when present; None on any fetch/parse failure.
        """
        hit = self._store.get_seller("mercari", seller_platform_id)
        if hit:
            return hit

        from app.platforms.ebay.browser_pool import get_pool

        listing_url = f"https://www.mercari.com/us/item/{seller_platform_id}/"
        try:
            # Politeness delay between listing-page fetches.
            time.sleep(_BETWEEN_LISTING_FETCH_SECS)
            page_html = get_pool().fetch_html(
                listing_url,
                delay=0.5,
                wait_for_timeout_ms=6000,
            )
            parsed = parse_listing_html(page_html, seller_platform_id)
            seller = _normalise_seller(parsed)
            self._store.save_seller(seller)
            return seller
        except Exception as exc:
            # Best-effort: a failed seller lookup degrades to a partial score
            # downstream rather than failing the whole request.
            log.warning("mercari: get_seller failed for %s: %s", seller_platform_id, exc)
            return None

    def get_completed_sales(self, query: str, pages: int = 1) -> list[Listing]:
        """Mercari sold-listing comps — stubbed for Phase 3.

        Mercari exposes sold listings via ?status=ITEM_STATUS_TRADING but the
        data is sparse. Phase 3 will implement comp extraction here; for now
        the trust scorer falls back to price_vs_market=None (partial score).
        """
        return []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Normalisation helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _normalise_listing(raw: dict, query: str) -> Listing:
    """Convert one parsed search-result dict into a Listing row.

    *query* is accepted for signature parity with the other platform
    normalisers but is not currently used here.
    """
    photo = raw.get("photo_url")
    return Listing(
        platform="mercari",
        platform_listing_id=raw["product_id"],
        title=raw["title"],
        price=raw["price"],
        currency="USD",
        condition="",  # not available from search results; get_seller() populates this
        seller_platform_id=raw["product_id"],  # see module docstring
        url=raw["url"],
        photo_urls=[photo] if photo else [],
        listing_age_days=0,
        buying_format="fixed_price",
        category_name=None,
    )
|
|
|
|
|
|
def _normalise_seller(raw: dict) -> Seller:
    """Convert a parsed listing-page dict into a Seller row.

    The 0-5 star rating is mapped onto feedback_ratio in [0, 1]; signals that
    Mercari doesn't expose (account age, category history) stay empty, which
    leaves the downstream trust score partial.
    """
    star_rating = raw.get("stars", 0.0)
    if star_rating > 0:
        ratio = min(star_rating / 5.0, 1.0)
    else:
        ratio = 0.0

    return Seller(
        platform="mercari",
        platform_seller_id=raw["product_id"],
        username=raw.get("username", ""),
        account_age_days=None,  # not available without seller profile page
        feedback_count=raw.get("num_sales", 0),
        feedback_ratio=ratio,
        category_history_json=json.dumps({}),
    )
|
|
|
|
|
|
def _apply_keyword_filter(listings: list[Listing], must_include: list[str], mode: str) -> list[Listing]:
|
|
if not must_include:
|
|
return listings
|
|
|
|
def _matches(listing: Listing) -> bool:
|
|
title = listing.title.lower()
|
|
if mode == "any":
|
|
return any(kw.lower() in title for kw in must_include)
|
|
# "all" (default) and "groups" both require all terms present
|
|
return all(kw.lower() in title for kw in must_include)
|
|
|
|
return [l for l in listings if _matches(l)]
|
|
|
|
|
|
def _apply_exclude_filter(listings: list[Listing], must_exclude: list[str]) -> list[Listing]:
|
|
if not must_exclude:
|
|
return listings
|
|
|
|
def _clean(listing: Listing) -> bool:
|
|
title = listing.title.lower()
|
|
return not any(term.lower() in title for term in must_exclude)
|
|
|
|
return [l for l in listings if _clean(l)]
|