magpie/app/services/lemmy/client.py
pyr0ball a6ea0b9c58 feat(#7,#10): signal crawler -- Reddit + Lemmy community monitoring
Implements the full signal detection pipeline:

Backend:
- app/services/lemmy/client.py: async Lemmy API v3 client, community@instance
  addressing, integer cursor dedup, normalised post dicts
- app/services/scraper.py: platform-agnostic scraper; Reddit (.json API,
  fullname cursor) + Lemmy (integer ID cursor); keyword/regex/all match modes,
  min_score gate, NormalizedPost shape, upsert dedup via UNIQUE post_id
- app/api/endpoints/signals.py: CRUD for signal_rules + signals queue;
  POST /signals/scrape manual trigger; scrape-state viewer
- migrations 010-012: signal_rules, signals, signal_scrape_state tables
- scheduler: interval job every 30 min (scraper_enabled=True in config)
- Fixed migration collision: 007_signal_rules.sql → 010, 008 → 011, 009 → 012

Frontend:
- SignalsView.vue: signal feed with status filter (new/saved/dismissed),
  keyword chips, score/comment counts, save/dismiss actions, rules editor panel
- api.ts: SignalRule, Signal types + signalRules/signals API methods
- Nav: Signals as default landing route (replaces /campaigns default)

Closes #7 (signal extraction), closes #10 (Lemmy JSON crawler)
2026-04-22 11:00:14 -07:00

122 lines
3.7 KiB
Python

"""
Lemmy API v3 client — read-only, no auth required for public communities.
Community addressing convention: <community>@<instance>
e.g. "selfhosted@lemmy.world", "technology@reddthat.com"
API reference: https://{instance}/api/v3/
Key endpoints used:
GET /api/v3/post/list?community_name=<name>&sort=New&limit=<n>&page=<p>
GET /api/v3/community?name=<name> (validation / metadata)
"""
from __future__ import annotations
import logging
from typing import Any
import httpx
logger = logging.getLogger(__name__)
# Instances known to be reliably federated and publicly accessible
KNOWN_INSTANCES = frozenset([
"lemmy.world",
"lemmy.ml",
"beehaw.org",
"reddthat.com",
"sh.itjust.works",
"feddit.de",
"aussie.zone",
"lemmy.ca",
])
_DEFAULT_TIMEOUT = 15.0
_DEFAULT_USER_AGENT = "Magpie/0.1 signal-monitor (by CircuitForge)"
def parse_community_target(sub: str) -> tuple[str, str]:
"""
Parse a 'community@instance' string into (community_name, instance_host).
Raises ValueError if the format is invalid.
"""
if "@" not in sub:
raise ValueError(
f"Lemmy community must be in 'community@instance' format, got: {sub!r}"
)
community, instance = sub.rsplit("@", 1)
if not community or not instance:
raise ValueError(f"Invalid Lemmy community target: {sub!r}")
return community.strip(), instance.strip().lower()
async def fetch_new_posts(
community: str,
instance: str,
last_seen_id: int | None,
limit: int = 50,
user_agent: str = _DEFAULT_USER_AGENT,
) -> tuple[list[dict[str, Any]], int | None]:
"""
Fetch new posts from a Lemmy community.
Fetches the first page sorted by New, then filters to posts with
id > last_seen_id (client-side cursor — Lemmy has no 'before' param).
Returns:
posts — filtered list of post dicts, newest first
new_max_id — highest post ID seen this run (use as next cursor), or None
"""
url = f"https://{instance}/api/v3/post/list"
params = {
"community_name": community,
"sort": "New",
"limit": min(limit, 50), # Lemmy max is 50 per page
"type_": "Local", # stay within the instance's local community
}
async with httpx.AsyncClient(
headers={"User-Agent": user_agent},
follow_redirects=True,
timeout=_DEFAULT_TIMEOUT,
) as client:
resp = await client.get(url, params=params)
resp.raise_for_status()
data = resp.json()
post_views = data.get("posts", [])
# Normalize to flat dicts and apply cursor filter
posts = []
new_max_id: int | None = None
for pv in post_views:
post = pv.get("post", {})
counts = pv.get("counts", {})
creator = pv.get("creator", {})
post_id: int = post.get("id", 0)
# Track max ID regardless of cursor (always advance)
if new_max_id is None or post_id > new_max_id:
new_max_id = post_id
# Apply cursor: skip posts we've already seen
if last_seen_id is not None and post_id <= last_seen_id:
continue
posts.append({
"id": post_id,
"ap_id": post.get("ap_id", f"https://{instance}/post/{post_id}"),
"title": post.get("name", ""),
"body": post.get("body", "") or "",
"url": post.get("ap_id", ""), # ap_id is the canonical URL
"author": creator.get("name", ""),
"score": counts.get("score", 0),
"comment_count": counts.get("comments", 0),
"published": post.get("published", ""),
"community": community,
"instance": instance,
})
return posts, new_max_id