snipe/app/trust/photo.py
pyr0ball 58263d814a feat(snipe): FastAPI layer, Playwright+Xvfb scraper, caching, tests
- FastAPI service (port 8510) wrapping scraper + trust scorer
- Playwright+Xvfb+stealth transport to bypass eBay Kasada bot protection
- li.s-card selector migration (eBay markup change from li.s-item)
- Three-layer caching: HTML (5min), phash (permanent), market comp (6h SQLite)
- Batch DB writes (executemany + single commit) — warm requests <1s
- Unique Xvfb display counter (:200–:299) prevents lock file collisions
- Vue 3 nginx web service (port 8509) proxying /api/ to FastAPI
- Auction card de-emphasis: opacity 0.72 for listings with >1h remaining
- 35 scraper unit tests updated for new li.s-card fixture markup
- tests/ volume-mounted in compose.override.yml for live test editing
2026-03-25 20:09:30 -07:00

82 lines
2.8 KiB
Python

"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
from __future__ import annotations
from typing import Optional
import io
import requests
try:
import imagehash
from PIL import Image
_IMAGEHASH_AVAILABLE = True
except ImportError:
_IMAGEHASH_AVAILABLE = False
# Module-level phash cache: url → hash string (or None on failure).
# Avoids re-downloading the same eBay CDN image on repeated searches.
# Failed fetches/decodes are stored as None too, so a URL that failed once is
# answered from the cache instead of being retried.
# NOTE(review): nothing in this module evicts entries, so the dict grows with
# the number of distinct URLs seen — confirm this is acceptable for a
# long-lived service process.
_phash_cache: dict[str, Optional[str]] = {}
class PhotoScorer:
    """Flag listings whose primary photo also appears on another listing.

    check_duplicates: compare images within a single result set.
    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """Return one duplicate flag per listing, parallel to the input.

        Only the first (primary) URL of each listing is considered; listings
        without photos are never flagged.

        A listing is flagged True when its primary photo matches the primary
        photo of at least one other listing in the set, either by identical
        URL or by identical perceptual hash. Every member of a matching group
        is flagged, including the first one encountered.

        Falls back to the URL-equality check alone when imagehash/Pillow are
        not installed; images whose download or decode fails simply contribute
        no hash match.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # URL equality is a trivial duplicate signal that needs no fetch and
        # still works when the image itself cannot be downloaded.
        url_flags = self._url_dedup(photo_urls_per_listing)

        # Hash equality additionally catches the same image served from
        # different URLs. A None hash (fetch/decode failure) never matches.
        hash_flags = self._flag_repeats(
            [self._fetch_hash(url) for url in primary_urls]
        )

        return [by_url or by_hash for by_url, by_hash in zip(url_flags, hash_flags)]

    def _fetch_hash(self, url: str) -> Optional[str]:
        """Return the perceptual hash of the image at *url* as a string.

        Returns None for an empty URL or when the download or decode fails.
        Results, including failures, are memoised in the module-level
        ``_phash_cache`` so each URL is fetched at most once.
        """
        if not url:
            return None
        if url in _phash_cache:
            return _phash_cache[url]

        try:
            # stream=True defers the body download until .content is read, so
            # an error status is rejected before any image bytes are pulled.
            resp = requests.get(url, timeout=5, stream=True)
            try:
                resp.raise_for_status()
                payload = resp.content
            finally:
                # A streamed response keeps its connection checked out until
                # it is fully consumed or closed; always release it.
                resp.close()
            result: Optional[str] = str(imagehash.phash(Image.open(io.BytesIO(payload))))
        except Exception:
            # Deliberately best-effort: any network or decode problem just
            # means "no hash available" for this listing.
            result = None

        _phash_cache[url] = result
        return result

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """Flag listings whose primary photo URL is shared with another listing.

        Returns a list parallel to the input. Every listing in a group that
        shares the same non-empty primary URL is flagged, matching the
        behaviour of the hash-based comparison in ``check_duplicates``.
        """
        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]
        return self._flag_repeats(primary_urls)

    @staticmethod
    def _flag_repeats(keys: list[Optional[str]]) -> list[bool]:
        """Return flags marking every position whose key occurs more than once.

        Falsy keys (None or "") mean "no value" and are never flagged.
        """
        flags = [False] * len(keys)
        first_index: dict[str, int] = {}
        for index, key in enumerate(keys):
            if not key:
                continue
            if key in first_index:
                flags[index] = True
                # Also mark the earliest holder of this key: it is just as
                # much a duplicate as the later ones.
                flags[first_index[key]] = True
            else:
                first_index[key] = index
        return flags