snipe/app/trust/photo.py
pyr0ball eb05be0612
Some checks are pending
CI / API — lint + test (pull_request) Waiting to run
CI / Web — typecheck + test + build (pull_request) Waiting to run
feat: wire Forgejo Actions CI/CD workflows (#22)
- ci.yml: API lint (ruff F+I) + pytest, web vue-tsc + vitest + build
- mirror.yml: push to GitHub (CircuitForgeLLC) + Codeberg (CircuitForge) on main/tags
- release.yml: Docker build → Forgejo registry + release via API; GHCR deferred pending BSL policy (cf-agents#3)
- .cliff.toml: git-cliff changelog config for semver releases
- pyproject.toml: add [dev] extras (pytest, ruff), ruff config
- Fix 45 ruff violations across codebase (import sorting, unused vars, unused imports)
2026-04-06 00:00:28 -07:00

84 lines
2.8 KiB
Python

"""Perceptual hash deduplication within a result set (free tier, v0.1)."""
from __future__ import annotations
import io
from typing import Optional
import requests
try:
import imagehash
from PIL import Image
_IMAGEHASH_AVAILABLE = True
except ImportError:
_IMAGEHASH_AVAILABLE = False
# Module-level phash cache: url → hash string (or None on failure).
# Failures are cached too, so a bad URL is only fetched once per process.
# Avoids re-downloading the same eBay CDN image on repeated searches.
# NOTE(review): unbounded — never evicted for the process lifetime; fine
# for v0.1, revisit if the URL space grows.
_phash_cache: dict[str, Optional[str]] = {}
class PhotoScorer:
    """
    check_duplicates: compare images within a single result set.

    Cross-session dedup (PhotoHash table) is v0.2.
    Vision analysis (real/marketing/EM bag) is v0.2 paid tier.
    """

    def check_duplicates(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        Flag listings whose primary photo duplicates another listing's.

        Args:
            photo_urls_per_listing: per-listing photo URL lists; only the
                first URL of each listing (its primary photo) is compared.

        Returns:
            A list of booleans parallel to photo_urls_per_listing.
            True = this listing's primary photo is a duplicate of another
            listing in the set.

        Falls back to URL-equality check if imagehash is unavailable or
        fetch fails.
        """
        if not _IMAGEHASH_AVAILABLE:
            return self._url_dedup(photo_urls_per_listing)

        primary_urls = [urls[0] if urls else "" for urls in photo_urls_per_listing]

        # Fast path: URL equality is a trivial duplicate signal (no fetch needed).
        # Start from those results; phash matches below can only add Trues.
        results = list(self._url_dedup([[u] for u in primary_urls]))

        hashes = [self._fetch_hash(url) for url in primary_urls]

        # Exact-string phash match (v0.1). Listings whose fetch/hash failed
        # (hash is None) keep their URL-equality result.
        seen: dict[str, int] = {}
        for i, h in enumerate(hashes):
            if h is None:
                continue
            if h in seen:
                # Mark both the current listing and the first holder of
                # this hash — the first occurrence was False until now.
                results[i] = True
                results[seen[h]] = True
            else:
                seen[h] = i
        return results

    def _fetch_hash(self, url: str) -> Optional[str]:
        """
        Download *url* and return its perceptual hash string, or None on
        any failure (empty URL, network error, undecodable image).

        Results — including failures — are memoized in the module-level
        _phash_cache so repeated searches don't re-fetch the same CDN image.
        """
        if not url:
            return None
        if url in _phash_cache:
            return _phash_cache[url]
        try:
            # Use the response as a context manager so the connection is
            # always released; the original combined stream=True with
            # .content and never closed, leaking the connection when
            # raise_for_status() threw.
            with requests.get(url, timeout=5) as resp:
                resp.raise_for_status()
                img = Image.open(io.BytesIO(resp.content))
                result: Optional[str] = str(imagehash.phash(img))
        except Exception:
            # Best-effort by design: any failure degrades to "no hash",
            # which leaves the listing on the URL-equality fallback.
            result = None
        _phash_cache[url] = result
        return result

    def _url_dedup(self, photo_urls_per_listing: list[list[str]]) -> list[bool]:
        """
        URL-equality fallback: True where a listing's primary URL repeats
        an earlier listing's primary URL. Empty listings are never flagged.
        """
        seen: set[str] = set()
        results: list[bool] = []
        for urls in photo_urls_per_listing:
            primary = urls[0] if urls else ""
            results.append(bool(primary) and primary in seen)
            if primary:
                seen.add(primary)
        return results