magpie/app/services/reddit/discovery.py
Alan Weinstock f39f36e258 feat(discovery): subreddit discovery and rule classification (#2)
- Add app/services/reddit/discovery.py:
  - search_subs(): searches /subreddits/search.json by keyword
  - analyze_sub(): fetches /about.json + /about/rules.json per sub
  - _classify_rules(): keyword-pattern classifier for promo policy
    (banned / conditional / unknown; hard to positively confirm allowed)
  - search_and_analyze(): combined search + per-sub analysis entry point
  - Unauthenticated-friendly (uses auth cookies when available)
- Add POST /subs/discover endpoint: returns candidate list with
  promo_allowed, flair_required, subscriber count, notes excerpt,
  and already_tracked flag. Nothing stored until user imports.
- Add SubDiscoveryResult interface and api.subs.discover() in api.ts
- Rework SubRulesView: slide-in discovery panel (right drawer),
  per-row Import button, auto-marks already-tracked subs, immutable
  result update on import

Closes: #2
2026-06-13 22:17:53 -07:00

235 lines
8.2 KiB
Python

"""
Subreddit discovery and rule analysis.
Searches Reddit for relevant communities by keyword, fetches each sub's
about page and posting rules, and classifies promo policy automatically.
Results are returned for user review — nothing is stored until the user
explicitly imports a sub via PUT /subs/{sub}.
"""
from __future__ import annotations
import logging
import re
from typing import Any
import httpx
logger = logging.getLogger(__name__)
_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) Chrome/124.0.0.0"
_BASE = "https://www.reddit.com"
# Keyword patterns for promo classification (applied to rule title + rule body text)
_BAN_PATTERNS = [
r"\bno\b.{0,20}\b(self.?promo|advertising|promotional|affiliate|soliciting|spam)\b",
r"\b(self.?promo|advertising|promotional)\b.{0,20}\bnot allowed\b",
r"\bdo not\b.{0,20}\b(post|share|submit)\b.{0,20}\b(your|own)\b.{0,20}\b(blog|site|product|service)",
r"\bno\b.{0,20}\bself.?serving\b",
r"\bcommercial\b.{0,10}\bcontent\b.{0,10}\bprohibited\b",
]
_COND_PATTERNS = [
r"\b(9|10)\s*[:/]\s*1\b", # 9:1 / 10:1 rule
r"\blimited\b.{0,20}\bself.?promo\b",
r"\bself.?promo.{0,40}\ballow(ed)?\b.{0,20}\b(friday|thread|megathread|weekly|monthly)\b",
r"\bonly.{0,20}\b(friday|weekly|monthly)\b.{0,30}\bpromo\b",
r"\bself.?promotion\b.{0,40}\bonly\b",
r"\bpromotion.{0,40}\bmoderat",
]
_BAN_RE = [re.compile(p, re.I | re.S) for p in _BAN_PATTERNS]
_COND_RE = [re.compile(p, re.I | re.S) for p in _COND_PATTERNS]
_FLAIR_RE = re.compile(r"\bflair\b", re.I)
def _get(
    url: str,
    cookies: dict | None = None,
    timeout: int = 10,
    params: dict | None = None,
) -> httpx.Response:
    """
    Issue a GET request with this module's User-Agent, following redirects.

    url: absolute URL to fetch.
    cookies: cookies to send with the request; None sends an empty cookie dict.
    timeout: timeout value handed to httpx.
    params: optional query-string parameters; None (the default) adds none,
        which keeps existing positional/keyword callers unchanged.

    Returns the httpx.Response as-is; the status code is not checked here and
    exceptions raised by httpx.get are not caught.
    """
    return httpx.get(
        url,
        params=params,
        cookies=cookies or {},
        headers={"User-Agent": _USER_AGENT},
        timeout=timeout,
        follow_redirects=True,
    )
def _classify_rules(rules: list[dict]) -> tuple[int | None, bool, str | None]:
"""
Returns (promo_allowed, flair_required, notes).
promo_allowed: 0 = banned, 1 = allowed (never set — hard to detect positively), None = unknown
"""
ban_reason: str | None = None
is_conditional = False
flair_required = False
notes_parts: list[str] = []
for rule in rules:
text = f"{rule.get('short_name', '')} {rule.get('description', '')}"
if any(p.search(text) for p in _BAN_RE):
if ban_reason is None:
ban_reason = rule.get("short_name", "Promo banned by rule")
notes_parts.append(f"[ban] {rule.get('short_name', '')}: {rule.get('description', '')[:120]}")
elif any(p.search(text) for p in _COND_RE):
is_conditional = True
notes_parts.append(f"[cond] {rule.get('short_name', '')}: {rule.get('description', '')[:120]}")
if _FLAIR_RE.search(text):
flair_required = True
if ban_reason:
promo_allowed: int | None = 0
elif is_conditional:
promo_allowed = None # keep unknown; notes will explain
else:
promo_allowed = None # can't positively assert allowed
notes = "; ".join(notes_parts) if notes_parts else None
return promo_allowed, flair_required, notes
def _fetch_flairs(sub: str, cookies: dict | None) -> list[str]:
    """
    Fetch available link flairs (requires auth; returns [] if unavailable).

    Best-effort: a non-200 response, a request/decoding error, or a payload
    that is not a list all yield []. Failures are logged at debug level
    instead of being dropped silently. Entries without a non-empty "text"
    value are left out of the result.
    """
    try:
        r = _get(f"{_BASE}/r/{sub}/api/link_flair_v2.json", cookies=cookies)
        if r.status_code != 200:
            return []
        payload = r.json()
    except Exception:
        # Deliberately non-fatal: flairs are optional extra information.
        logger.debug("Could not fetch link flairs for r/%s", sub, exc_info=True)
        return []
    if not isinstance(payload, list):
        # Anything other than a list of flair objects is treated as "none available".
        return []
    return [f["text"] for f in payload if isinstance(f, dict) and f.get("text")]
def analyze_sub(
    sub: str,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> dict[str, Any] | None:
    """
    Build an analysis dict for one subreddit from its about page and rules.

    Requests /r/{sub}/about.json, then /r/{sub}/about/rules.json, classifies
    the rules, and fetches link flairs only when a rule mentions flair.
    A non-200 rules response is treated as "no rules".

    Returns None when the about request is not a 200, when the sub's type is
    private / restricted / employee_only, or when any exception occurs (the
    exception is logged).
    """
    hidden_types = ("private", "restricted", "employee_only")
    try:
        about_resp = _get(f"{_BASE}/r/{sub}/about.json", cookies=cookies)
        if about_resp.status_code != 200:
            return None
        info = about_resp.json().get("data", {})
        if info.get("subreddit_type") in hidden_types:
            return None

        rules_resp = _get(f"{_BASE}/r/{sub}/about/rules.json", cookies=cookies)
        if rules_resp.status_code == 200:
            rule_list = rules_resp.json().get("rules", [])
        else:
            rule_list = []
        verdict, needs_flair, remarks = _classify_rules(rule_list)

        # Flairs cost an extra request, so only ask when a rule mentions them.
        flairs = []
        if needs_flair:
            flairs = _fetch_flairs(sub, cookies)

        blurb = info.get("public_description") or info.get("description") or ""
        # Membership is tested with the lower-cased name — presumably
        # known_subs holds lower-cased names; confirm against the caller.
        tracked = False if known_subs is None else sub.lower() in known_subs
        return {
            "sub": sub,
            "title": info.get("title") or sub,
            "subscribers": info.get("subscribers") or 0,
            "description": blurb.strip()[:280],
            "promo_allowed": verdict,
            "flair_required": needs_flair,
            "available_flairs": flairs,
            "rule_warning": False,
            "notes": remarks,
            "already_tracked": tracked,
        }
    except Exception:
        logger.exception("Error analyzing r/%s", sub)
        return None
def search_subs(
    keyword: str,
    limit: int = 20,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> list[dict[str, Any]]:
    """
    Search subreddits by keyword and return lightweight candidate dicts.

    A single request is made to /subreddits/search.json (at most 50 results
    are requested). Candidates are built from the search listing alone — no
    per-sub about/rules requests happen here — so promo_allowed is None,
    flair_required is False, and available_flairs / notes are empty for every
    entry. Subs whose type is private / restricted / employee_only and
    entries without a display_name are skipped.

    Returns at most `limit` dicts sorted by subscriber count (desc). Returns
    [] when limit is not positive, when the response is not a 200, or when
    the request or JSON decoding fails (the failure is logged).
    """
    if limit <= 0:
        # Nothing can be returned, so don't spend a request on it.
        return []
    try:
        # Called directly rather than via _get because query params are needed.
        r = httpx.get(
            f"{_BASE}/subreddits/search.json",
            params={"q": keyword, "limit": min(limit, 50), "sort": "relevance"},
            cookies=cookies or {},
            headers={"User-Agent": _USER_AGENT},
            timeout=10,
            follow_redirects=True,
        )
        if r.status_code != 200:
            logger.warning("Subreddit search returned %d for %r", r.status_code, keyword)
            return []
        children = r.json().get("data", {}).get("children", [])
    except Exception:
        logger.exception("Error searching subreddits for %r", keyword)
        return []

    results: list[dict[str, Any]] = []
    for child in children:
        data = child.get("data", {})
        sub_name = data.get("display_name")
        if not sub_name:
            continue
        if data.get("subreddit_type") in ("private", "restricted", "employee_only"):
            continue
        # Light analysis from search result data (avoids N per-sub about requests)
        description = (data.get("public_description") or data.get("description") or "").strip()
        results.append({
            "sub": sub_name,
            "title": data.get("title") or sub_name,
            "subscribers": data.get("subscribers") or 0,
            "description": description[:280],
            "promo_allowed": None,  # unknown until rules are fetched
            "flair_required": False,
            "available_flairs": [],
            "rule_warning": False,
            "notes": None,
            # Lower-cased lookup — presumably known_subs holds lower-cased
            # names; confirm against the caller.
            "already_tracked": (sub_name.lower() in known_subs) if known_subs is not None else False,
        })

    # Sort by subscribers descending
    results.sort(key=lambda x: x["subscribers"], reverse=True)
    return results[:limit]
def search_and_analyze(
    keyword: str,
    limit: int = 15,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> list[dict[str, Any]]:
    """
    Search subreddits by keyword, then run the full per-sub analysis on each hit.

    Main entry point for the discovery endpoint. Candidates come from
    search_subs(); each one is re-analyzed with analyze_sub(), one after the
    other, and candidates for which analyze_sub() returns None are dropped.
    Runs sequentially — limit to 15 to keep latency reasonable.

    Returns the analysis dicts sorted by subscriber count (desc).
    """
    hits = search_subs(keyword, limit=limit, cookies=cookies, known_subs=known_subs)
    # Lazily evaluated inside sorted(), still one sub at a time and in search order.
    reports = (
        analyze_sub(hit["sub"], cookies=cookies, known_subs=known_subs)
        for hit in hits
    )
    return sorted(
        (report for report in reports if report is not None),
        key=lambda report: report["subscribers"],
        reverse=True,
    )