feat(discovery): subreddit discovery and rule classification (#2)

- Add app/services/reddit/discovery.py:
  - search_subs(): searches /subreddits/search.json by keyword
  - analyze_sub(): fetches /about.json + /about/rules.json per sub
  - _classify_rules(): keyword-pattern classifier for promo policy
    (banned / conditional / unknown; hard to positively confirm allowed)
  - search_and_analyze(): combined search + per-sub analysis entry point
  - Unauthenticated-friendly (uses auth cookies when available)
- Add POST /subs/discover endpoint: returns candidate list with
  promo_allowed, flair_required, subscriber count, notes excerpt,
  and already_tracked flag. Nothing stored until user imports.
- Add SubDiscoveryResult interface and api.subs.discover() in api.ts
- Rework SubRulesView: slide-in discovery panel (right drawer),
  per-row Import button, auto-marks already-tracked subs, immutable
  result update on import

Closes: #2
This commit is contained in:
Alan Weinstock 2026-06-13 22:17:53 -07:00
parent dfdde692b8
commit f39f36e258
4 changed files with 552 additions and 5 deletions

View file

@ -11,6 +11,11 @@ from app.db.store import Store
router = APIRouter(prefix="/subs", tags=["subs"]) router = APIRouter(prefix="/subs", tags=["subs"])
class DiscoverBody(BaseModel):
    """Request body for POST /subs/discover."""

    # Search term; forwarded as `keyword` to search_and_analyze().
    keyword: str
    # Maximum number of candidates; forwarded as `limit` to search_and_analyze().
    limit: int = 15
def _in_thread(fn): def _in_thread(fn):
store = Store(get_settings().db_path) store = Store(get_settings().db_path)
try: try:
@ -52,3 +57,31 @@ async def upsert_sub_rules(sub: str, body: SubRulesUpsert, platform: str = "redd
return await asyncio.to_thread( return await asyncio.to_thread(
_in_thread, lambda s: s.upsert_sub_rules(sub, platform, **fields) _in_thread, lambda s: s.upsert_sub_rules(sub, platform, **fields)
) )
@router.post("/discover")
async def discover_subs(body: DiscoverBody):
    """
    Search Reddit for subreddits matching a keyword and analyze their posting rules.

    Returns a list of candidates with promo classification. Nothing is stored —
    the caller decides which subs to import via PUT /subs/{sub}.
    """
    from app.services.reddit.discovery import search_and_analyze

    def _auth_cookies():
        # Best effort: any failure while building the client means "no cookies".
        try:
            from app.services.reddit.client import RedditClient

            return RedditClient().cookies
        except Exception:
            return None

    def _discover(store: Store):
        # Lower-cased names of subs already tracked, so the UI can flag them.
        tracked = {row["sub"].lower() for row in store.list_sub_rules("reddit")}
        session_cookies = _auth_cookies()
        return search_and_analyze(
            keyword=body.keyword,
            limit=body.limit,
            cookies=session_cookies,
            known_subs=tracked,
        )

    # The discovery work is blocking, so it is run via asyncio.to_thread.
    return await asyncio.to_thread(_in_thread, _discover)

View file

@ -0,0 +1,235 @@
"""
Subreddit discovery and rule analysis.
Searches Reddit for relevant communities by keyword, fetches each sub's
about page and posting rules, and classifies promo policy automatically.
Results are returned for user review — nothing is stored until the user
explicitly imports a sub via PUT /subs/{sub}.
"""
from __future__ import annotations
import logging
import re
from typing import Any
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Browser-like User-Agent header value sent with the HTTP requests in this module.
_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) Chrome/124.0.0.0"
# URL prefix for the Reddit endpoints requested in this module.
_BASE = "https://www.reddit.com"
# Keyword patterns for promo classification (applied to rule title + rule body text).
#
# NOTE: "self.?promo" is followed by "\w*" wherever a trailing "\b" comes next.
# Without it the word boundary can never match inside "self-promotion" or
# "self-promoting", so the most common wording ("No self-promotion") was missed.

# A match on any of these marks the sub as banning promotion.
_BAN_PATTERNS = [
    r"\bno\b.{0,20}\b(self.?promo\w*|advertising|promotional|affiliate|soliciting|spam)\b",
    r"\b(self.?promo\w*|advertising|promotional)\b.{0,20}\bnot allowed\b",
    r"\bdo not\b.{0,20}\b(post|share|submit)\b.{0,20}\b(your|own)\b.{0,20}\b(blog|site|product|service)",
    r"\bno\b.{0,20}\bself.?serving\b",
    r"\bcommercial\b.{0,10}\bcontent\b.{0,10}\bprohibited\b",
]

# A match on any of these marks promotion as allowed only under conditions.
_COND_PATTERNS = [
    r"\b(9|10)\s*[:/]\s*1\b",  # 9:1 / 10:1 rule
    r"\blimited\b.{0,20}\bself.?promo\w*\b",
    r"\bself.?promo.{0,40}\ballow(ed)?\b.{0,20}\b(friday|thread|megathread|weekly|monthly)\b",
    r"\bonly.{0,20}\b(friday|weekly|monthly)\b.{0,30}\bpromo\b",
    r"\bself.?promotion\b.{0,40}\bonly\b",
    r"\bpromotion.{0,40}\bmoderat",
]

# Compiled once at import time; case-insensitive, and "." also spans newlines.
_BAN_RE = [re.compile(p, re.I | re.S) for p in _BAN_PATTERNS]
_COND_RE = [re.compile(p, re.I | re.S) for p in _COND_PATTERNS]
# Any mention of the word "flair" in a rule.
_FLAIR_RE = re.compile(r"\bflair\b", re.I)
def _get(url: str, cookies: dict | None = None, timeout: int = 10) -> httpx.Response:
    """
    Issue a GET request for ``url`` and return the httpx response.

    The module's User-Agent header is always sent, redirects are followed, and
    a missing/empty ``cookies`` argument is sent as an empty cookie dict.
    """
    request_headers = {"User-Agent": _USER_AGENT}
    jar = cookies if cookies else {}
    return httpx.get(
        url,
        cookies=jar,
        headers=request_headers,
        timeout=timeout,
        follow_redirects=True,
    )
def _classify_rules(rules: list[dict]) -> tuple[int | None, bool, str | None]:
    """
    Classify a subreddit's promo policy from its list of rule dicts.

    Each rule's ``short_name`` and ``description`` are joined and matched
    against the module-level ban, conditional and flair patterns.

    Returns (promo_allowed, flair_required, notes).
    promo_allowed: 0 = banned, None = unknown or conditional. 1 (allowed) is
        never returned because permission is hard to detect positively.
    flair_required: True if any rule mentions flair.
    notes: "; "-joined "[ban]" / "[cond]" excerpts of the matching rules
        (description cut to 120 chars), or None when nothing matched.
    """
    banned = False
    flair_required = False
    notes_parts: list[str] = []

    for rule in rules or []:
        # Either field may be missing or null; normalise to "" so that matching
        # and slicing below cannot raise or pick up the literal text "None".
        name = rule.get("short_name") or ""
        description = rule.get("description") or ""
        text = f"{name} {description}"

        if any(p.search(text) for p in _BAN_RE):
            # Track the ban with a real boolean: a matching rule whose title is
            # empty must still count as a ban.
            banned = True
            notes_parts.append(f"[ban] {name}: {description[:120]}")
        elif any(p.search(text) for p in _COND_RE):
            # Conditional rules leave promo_allowed unknown; the note explains why.
            notes_parts.append(f"[cond] {name}: {description[:120]}")

        if _FLAIR_RE.search(text):
            flair_required = True

    promo_allowed = 0 if banned else None
    notes = "; ".join(notes_parts) if notes_parts else None
    return promo_allowed, flair_required, notes
def _fetch_flairs(sub: str, cookies: dict | None) -> list[str]:
    """
    Fetch available link flairs (requires auth; returns [] if unavailable).

    Best effort: a non-200 status, a request/JSON error, or a payload that is
    not a list all yield []. Errors are logged at debug level, not raised.
    """
    try:
        r = _get(f"{_BASE}/r/{sub}/api/link_flair_v2.json", cookies=cookies)
        if r.status_code != 200:
            return []
        payload = r.json()
    except Exception:
        # Flairs are optional extra data; record the failure but do not propagate it.
        logger.debug("Could not fetch link flairs for r/%s", sub, exc_info=True)
        return []
    if not isinstance(payload, list):
        # Anything other than a list of flair objects is treated as "no flairs".
        return []
    return [f["text"] for f in payload if isinstance(f, dict) and f.get("text")]
def analyze_sub(
    sub: str,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> dict[str, Any] | None:
    """
    Build an analysis dict for one subreddit from its about and rules pages.

    Returns None when the about request is not a 200, when the sub's type is
    private / restricted / employee_only, or when any exception occurs (the
    exception is logged). A non-200 rules response is treated as "no rules".
    Flairs are only fetched when the rules mention flair.
    """
    try:
        about_resp = _get(f"{_BASE}/r/{sub}/about.json", cookies=cookies)
        if about_resp.status_code != 200:
            return None

        about = about_resp.json().get("data", {})
        if about.get("subreddit_type") in ("private", "restricted", "employee_only"):
            return None

        rules_resp = _get(f"{_BASE}/r/{sub}/about/rules.json", cookies=cookies)
        if rules_resp.status_code == 200:
            rules = rules_resp.json().get("rules", [])
        else:
            rules = []

        promo_allowed, flair_required, notes = _classify_rules(rules)
        if flair_required:
            available_flairs = _fetch_flairs(sub, cookies)
        else:
            available_flairs = []

        # Prefer the short public description; fall back to the long one.
        summary = (about.get("public_description") or about.get("description") or "").strip()
        tracked = known_subs is not None and sub.lower() in known_subs

        return {
            "sub": sub,
            "title": about.get("title") or sub,
            "subscribers": about.get("subscribers") or 0,
            "description": summary[:280],
            "promo_allowed": promo_allowed,
            "flair_required": flair_required,
            "available_flairs": available_flairs,
            "rule_warning": False,
            "notes": notes,
            "already_tracked": tracked,
        }
    except Exception:
        logger.exception("Error analyzing r/%s", sub)
        return None
def search_subs(
    keyword: str,
    limit: int = 20,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> list[dict[str, Any]]:
    """
    Search subreddits by keyword and build a lightweight candidate list.

    Only the search response is used here: rules are NOT fetched, so every
    candidate is returned with ``promo_allowed=None``, ``flair_required=False``
    and no notes. Subs typed private / restricted / employee_only are skipped.

    Args:
        keyword: Search query sent as ``q``.
        limit: Maximum number of candidates; at most 50 are requested.
            A non-positive value returns [] without making a request.
        cookies: Optional cookies to send with the request.
        known_subs: Lower-cased names of already-tracked subs, used to set
            ``already_tracked``.

    Returns:
        Candidate dicts sorted by subscriber count (desc). Returns [] on a
        non-200 response or on any request/parse error (which is logged).
    """
    if limit <= 0:
        # Nothing to return, and a negative slice bound below would drop items
        # from the end instead of limiting the result.
        return []

    try:
        # Exactly one request. _get() cannot pass query parameters, so httpx is
        # called directly with the same headers/cookies/redirect settings.
        r = httpx.get(
            f"{_BASE}/subreddits/search.json",
            params={"q": keyword, "limit": min(limit, 50), "sort": "relevance"},
            cookies=cookies or {},
            headers={"User-Agent": _USER_AGENT},
            timeout=10,
            follow_redirects=True,
        )
        if r.status_code != 200:
            logger.warning("Subreddit search returned %d for %r", r.status_code, keyword)
            return []
        children = r.json().get("data", {}).get("children", [])
    except Exception:
        logger.exception("Error searching subreddits for %r", keyword)
        return []

    results: list[dict] = []
    for child in children:
        data = child.get("data", {})
        sub_name = data.get("display_name")
        if not sub_name:
            continue
        if data.get("subreddit_type") in ("private", "restricted", "employee_only"):
            continue

        # Light entry from search data only (avoids N per-sub about requests).
        description = (data.get("public_description") or data.get("description") or "").strip()
        results.append({
            "sub": sub_name,
            "title": data.get("title") or sub_name,
            "subscribers": data.get("subscribers") or 0,
            "description": description[:280],
            "promo_allowed": None,  # unknown until rules are fetched
            "flair_required": False,
            "available_flairs": [],
            "rule_warning": False,
            "notes": None,
            "already_tracked": (sub_name.lower() in known_subs) if known_subs is not None else False,
        })

    # Sort by subscribers descending
    results.sort(key=lambda x: x["subscribers"], reverse=True)
    return results[:limit]
def search_and_analyze(
    keyword: str,
    limit: int = 15,
    cookies: dict | None = None,
    known_subs: set[str] | None = None,
) -> list[dict[str, Any]]:
    """
    Main entry point for the discovery endpoint: search, then analyze each hit.

    Candidates come from search_subs(); each one is passed through
    analyze_sub() one at a time (sequentially), so keep ``limit`` small
    (default 15) to keep latency reasonable. Subs for which analyze_sub()
    returns None are dropped silently.

    Returns the remaining analysis dicts sorted by subscriber count, descending.
    """
    hits = search_subs(keyword, limit=limit, cookies=cookies, known_subs=known_subs)
    reports = (
        analyze_sub(hit["sub"], cookies=cookies, known_subs=known_subs)
        for hit in hits
    )
    # None means the sub was inaccessible — skip it.
    usable = [report for report in reports if report is not None]
    return sorted(usable, key=lambda report: report["subscribers"], reverse=True)

View file

@ -2,8 +2,11 @@
<div> <div>
<div class="page-header"> <div class="page-header">
<h1 class="page-title">Sub / Channel Rules</h1> <h1 class="page-title">Sub / Channel Rules</h1>
<div style="display: flex; gap: var(--spacing-sm);">
<button class="btn btn-secondary" @click="showDiscover = true">+ Discover</button>
<button class="btn btn-primary" @click="showAdd = true">+ Add Sub</button> <button class="btn btn-primary" @click="showAdd = true">+ Add Sub</button>
</div> </div>
</div>
<div class="card" style="padding: 0; overflow: hidden;"> <div class="card" style="padding: 0; overflow: hidden;">
<table class="table table-responsive"> <table class="table table-responsive">
@ -35,7 +38,7 @@
<span v-if="r.rule_warning" class="badge badge-warning">yes</span> <span v-if="r.rule_warning" class="badge badge-warning">yes</span>
<span v-else style="color: var(--color-text-muted);"></span> <span v-else style="color: var(--color-text-muted);"></span>
</td> </td>
<td data-label="Notes" style="color: var(--color-text-muted); max-width: 260px; white-space: normal; word-break: break-word;" :title="r.notes ?? ''"> <td data-label="Notes" class="notes-cell" :title="r.notes ?? ''">
{{ r.notes ?? '—' }} {{ r.notes ?? '—' }}
</td> </td>
<td data-label=""> <td data-label="">
@ -108,16 +111,88 @@
</div> </div>
</div> </div>
</div> </div>
<!-- Discovery panel -->
<div v-if="showDiscover" class="discover-backdrop" @click.self="closeDiscover">
<div class="discover-panel card">
<div class="discover-header">
<h2 style="font-size: 16px; margin: 0;">Discover Subreddits</h2>
<button class="btn btn-ghost btn-sm" @click="closeDiscover">✕</button>
</div>
<div class="discover-search">
<input
class="form-input"
v-model="keyword"
placeholder="e.g. open source, self-hosted, neurodivergent"
@keydown.enter="runDiscover"
/>
<button class="btn btn-primary" :disabled="!keyword.trim() || discovering" @click="runDiscover">
{{ discovering ? 'Searching…' : 'Search' }}
</button>
</div>
<div v-if="discovering" class="discover-status">
Fetching subreddit rules — this may take 10-20s
</div>
<div v-if="!discovering && discoverResults.length > 0" class="discover-results">
<div
v-for="r in discoverResults"
:key="r.sub"
:class="['discover-row', r.already_tracked ? 'already-tracked' : '']"
>
<div class="discover-row-main">
<div class="discover-row-name">
<a :href="`https://reddit.com/r/${r.sub}`" target="_blank" class="sub-link">r/{{ r.sub }}</a>
<span class="sub-size">{{ formatSubs(r.subscribers) }}</span>
<span v-if="r.already_tracked" class="badge badge-muted">tracked</span>
</div>
<div v-if="r.description" class="discover-row-desc">{{ r.description }}</div>
</div>
<div class="discover-row-meta">
<span class="promo-badge" :class="promoClass(r.promo_allowed)">
{{ promoLabel(r.promo_allowed) }}
</span>
<span v-if="r.flair_required" class="badge badge-warning" title="Flair required">flair</span>
<span v-if="r.notes" class="notes-hint" :title="r.notes"> see notes</span>
</div>
<button
class="btn btn-sm"
:class="r.already_tracked ? 'btn-ghost' : 'btn-secondary'"
:disabled="r.already_tracked"
@click="importSub(r)"
>
{{ r.already_tracked ? 'Already added' : 'Import' }}
</button>
</div>
</div>
<div v-if="!discovering && discoverResults.length === 0 && didSearch" class="discover-status">
No results found for <strong>{{ lastKeyword }}</strong>.
</div>
</div>
</div>
</div> </div>
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { onMounted, reactive, ref } from 'vue' import { onMounted, reactive, ref } from 'vue'
import { api, type SubRules } from '@/services/api' import { useToast } from '@/composables/useToast'
import { api, type SubRules, type SubDiscoveryResult, type SubRulesUpsert } from '@/services/api'
const rules = ref<SubRules[]>([]) const rules = ref<SubRules[]>([])
const showAdd = ref(false) const showAdd = ref(false)
const editing = ref<SubRules | null>(null) const editing = ref<SubRules | null>(null)
const toast = useToast()
// Discovery state
const showDiscover = ref(false)
const keyword = ref('')
const lastKeyword = ref('')
const discovering = ref(false)
const didSearch = ref(false)
const discoverResults = ref<SubDiscoveryResult[]>([])
const form = reactive({ const form = reactive({
sub: '', sub: '',
@ -173,9 +248,197 @@ async function save() {
} }
closeModal() closeModal()
} }
// --- Discovery ---
/** Hide the discovery drawer and reset its search input, results and "searched" flag. */
function closeDiscover() {
  keyword.value = ''
  didSearch.value = false
  discoverResults.value = []
  showDiscover.value = false
}
/**
 * Run a discovery search for the current keyword.
 *
 * Does nothing when the keyword is blank or a search is already in flight.
 * On failure a toast is shown and the result list stays empty.
 */
async function runDiscover() {
  const term = keyword.value.trim()
  if (!term || discovering.value) return
  discovering.value = true
  didSearch.value = false
  lastKeyword.value = term
  // Clear the previous result set up front: otherwise a failed search would
  // re-display the old keyword's rows once `discovering` flips back to false.
  discoverResults.value = []
  try {
    discoverResults.value = await api.subs.discover(term)
    didSearch.value = true
  } catch {
    toast.error('Discovery failed — check API logs')
  } finally {
    discovering.value = false
  }
}
/**
 * Save a discovered sub as a tracked sub (platform 'reddit').
 *
 * On success the saved row replaces the row with the same `sub` in `rules`
 * (or is appended), and the discovery result is flagged as already tracked.
 * On failure only an error toast is shown.
 */
async function importSub(r: SubDiscoveryResult) {
  // Discovery reports 0 / 1 / null; the upsert payload takes false / true / null.
  const promoAllowed = r.promo_allowed === null ? null : r.promo_allowed === 1
  const payload: SubRulesUpsert = {
    flair_required: r.flair_required,
    flair_to_use: r.available_flairs[0] ?? null,
    promo_allowed: promoAllowed,
    rule_warning: r.rule_warning,
    notes: r.notes,
  }
  try {
    const saved = await api.subs.upsertRules(r.sub, payload, 'reddit')
    const pos = rules.value.findIndex(row => row.sub === r.sub)
    rules.value = pos === -1
      ? [...rules.value, saved]
      : [...rules.value.slice(0, pos), saved, ...rules.value.slice(pos + 1)]
    // Mark as tracked in results list (immutable update)
    discoverResults.value = discoverResults.value.map(item =>
      item.sub === r.sub ? { ...item, already_tracked: true } : item
    )
    toast.success(`r/${r.sub} added to tracked subs`)
  } catch {
    toast.error(`Failed to import r/${r.sub}`)
  }
}
/**
 * Format a subscriber count compactly: "1.2M", "34k", or the plain number.
 *
 * The millions branch starts at 999,500 because that is where the "k" branch
 * would round up to "1000k"; those values are shown as "1.0M" instead.
 */
function formatSubs(n: number): string {
  if (n >= 999_500) return `${(n / 1_000_000).toFixed(1)}M`
  if (n >= 1_000) return `${Math.round(n / 1_000)}k`
  return String(n)
}
/** Text label for a promo_allowed value: 0 → 'banned', 1 → 'allowed', anything else → 'unknown'. */
function promoLabel(v: number | null): string {
  switch (v) {
    case 0:
      return 'banned'
    case 1:
      return 'allowed'
    default:
      return 'unknown'
  }
}
/** Badge CSS classes for a promo_allowed value: danger for 0, success for 1, muted otherwise. */
function promoClass(v: number | null): string {
  switch (v) {
    case 0:
      return 'badge badge-danger'
    case 1:
      return 'badge badge-success'
    default:
      return 'badge badge-muted'
  }
}
</script> </script>
<style scoped> <style scoped>
.modal-backdrop { position: fixed; inset: 0; background: rgba(0,0,0,0.6); display: flex; align-items: center; justify-content: center; z-index: 100; } .notes-cell {
.modal { padding: var(--spacing-lg); } color: var(--color-text-muted);
max-width: 260px;
white-space: normal;
word-break: break-word;
}
/* Discovery panel */
.discover-backdrop {
position: fixed;
inset: 0;
background: rgba(0, 0, 0, 0.5);
z-index: 100;
display: flex;
justify-content: flex-end;
}
.discover-panel {
width: 520px;
max-width: 100vw;
height: 100vh;
overflow-y: auto;
display: flex;
flex-direction: column;
gap: var(--spacing-md);
padding: var(--spacing-lg);
border-radius: 0;
}
.discover-header {
display: flex;
align-items: center;
justify-content: space-between;
}
.discover-search {
display: flex;
gap: var(--spacing-sm);
}
.discover-search .form-input {
flex: 1;
}
.discover-status {
color: var(--color-text-muted);
font-size: 13px;
padding: var(--spacing-sm) 0;
}
.discover-results {
display: flex;
flex-direction: column;
gap: var(--spacing-sm);
}
.discover-row {
display: flex;
flex-direction: column;
gap: 6px;
padding: var(--spacing-sm) var(--spacing-md);
border: 1px solid var(--color-border);
border-radius: var(--radius-md);
background: var(--color-surface);
}
.discover-row.already-tracked {
opacity: 0.6;
}
.discover-row-main {
display: flex;
flex-direction: column;
gap: 2px;
}
.discover-row-name {
display: flex;
align-items: center;
gap: var(--spacing-sm);
font-weight: 500;
}
.sub-link {
color: var(--color-primary);
text-decoration: none;
font-size: 14px;
}
.sub-size {
font-size: 12px;
color: var(--color-text-muted);
}
.discover-row-desc {
font-size: 12px;
color: var(--color-text-muted);
line-height: 1.4;
}
.discover-row-meta {
display: flex;
align-items: center;
gap: var(--spacing-xs);
flex-wrap: wrap;
}
.promo-badge {
font-size: 11px;
}
.notes-hint {
font-size: 11px;
color: var(--color-warning, #f59e0b);
cursor: help;
}
@media (max-width: 600px) {
.discover-panel {
width: 100vw;
}
}
</style> </style>

View file

@ -98,6 +98,19 @@ export interface SubRules {
updated_at: string updated_at: string
} }
/** One candidate subreddit as returned by POST /subs/discover. */
export interface SubDiscoveryResult {
// Subreddit name (without the "r/" prefix)
sub: string
// Display title; the backend falls back to the sub name when it is empty
title: string
// Subscriber count; 0 when the backend received none
subscribers: number
// Public description, cut to at most 280 characters by the backend
description: string
promo_allowed: number | null // 0=banned, 1=allowed, null=unknown
// True when any of the sub's rules mentions flair
flair_required: boolean
// Link flair texts; only fetched when flair_required is true, otherwise empty
available_flairs: string[]
// The discovery backend always sends false for this field
rule_warning: boolean
// Excerpts of the rules that matched the ban / conditional patterns, or null
notes: string | null
// True when the sub name (lower-cased) is already in the tracked list
already_tracked: boolean
}
export interface SubRulesUpsert { export interface SubRulesUpsert {
flair_required?: boolean flair_required?: boolean
flair_to_use?: string | null flair_to_use?: string | null
@ -272,6 +285,9 @@ export const api = {
upsertRules: (sub: string, data: SubRulesUpsert, platform = 'reddit') => upsertRules: (sub: string, data: SubRulesUpsert, platform = 'reddit') =>
http.put<SubRules>(`/subs/${sub}`, data, { params: { platform } }).then(r => r.data), http.put<SubRules>(`/subs/${sub}`, data, { params: { platform } }).then(r => r.data),
// POST /subs/discover with { keyword, limit }; resolves to the response body (candidate list).
discover: (keyword: string, limit = 15) =>
http.post<SubDiscoveryResult[]>('/subs/discover', { keyword, limit }).then(r => r.data),
}, },
posts: { posts: {