From 24a16ee4b0a0e57b86f6faf290cd88ab269f0f40 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 19 Mar 2026 20:28:23 -0700 Subject: [PATCH] docs: add digest scrape queue design spec --- .../specs/2026-03-19-digest-queue-design.md | 419 ++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-19-digest-queue-design.md diff --git a/docs/superpowers/specs/2026-03-19-digest-queue-design.md b/docs/superpowers/specs/2026-03-19-digest-queue-design.md new file mode 100644 index 0000000..3e11d3c --- /dev/null +++ b/docs/superpowers/specs/2026-03-19-digest-queue-design.md @@ -0,0 +1,419 @@ +# Digest Scrape Queue — Design Spec + +**Date:** 2026-03-19 +**Status:** Approved — ready for implementation planning + +--- + +## Goal + +When a user clicks the 📰 Digest chip on a signal banner, the email is added to a persistent digest queue accessible via a dedicated nav tab. The user browses queued digest emails, selects extracted job links to process, and queues them through the existing discovery pipeline as `status='pending'` jobs in `staging.db`. 
+ +--- + +## Decisions Made + +| Decision | Choice | +|---|---| +| Digest tab placement | Separate top-level nav tab "📰 Digest", immediately after Interviews (before Interview Prep) | +| Storage | New `digest_queue` table in `staging.db`; unique on `job_contact_id` | +| Table creation | In `scripts/db.py` `init_db()` — canonical schema location, not `dev-api.py` | +| Link extraction | On-demand, backend regex against the raw (un-stripped) email body so `href` targets are captured — no background task needed | +| Extraction UX | Show ranked link list; job-likely pre-checked, others unchecked; user ticks and submits | +| After queueing | Entry stays in digest list for reference; `[✕]` removes explicitly | +| Failure handling | Digest chip dismisses signal optimistically regardless of `POST /api/digest-queue` success | +| Duplicate protection | `UNIQUE(job_contact_id)` in table; `POST /api/digest-queue` returns `{ created: false }` on duplicate (no 409) | +| Mobile nav | Digest tab does NOT appear in mobile bottom tab bar (all 5 slots occupied; deferred) | +| URL validation | Non-http/https schemes and blank URLs skipped silently in `queue-jobs`; validation deferred to pipeline | + +--- + +## Data Model + +### New table: `digest_queue` + +Added to `scripts/db.py` `init_db()`: + +```sql +CREATE TABLE IF NOT EXISTS digest_queue ( + id INTEGER PRIMARY KEY, + job_contact_id INTEGER NOT NULL REFERENCES job_contacts(id), + created_at TEXT DEFAULT (datetime('now')), + UNIQUE(job_contact_id) +) +``` + +`init_db()` is called at app startup and by `dev-api.py` startup — adding the `CREATE TABLE IF NOT EXISTS` there is safe and idempotent. + +--- + +## Backend + +### New endpoints in `dev-api.py` + +#### `GET /api/digest-queue` + +Returns all queued entries joined with `job_contacts`. 
`body` is HTML-stripped via `_strip_html()` before returning (display only — extraction uses a separate raw read, see `extract-links`): + +```sql +SELECT dq.id, dq.job_contact_id, dq.created_at, + jc.subject, jc.from_addr, jc.received_at, jc.body +FROM digest_queue dq +JOIN job_contacts jc ON jc.id = dq.job_contact_id +ORDER BY dq.created_at DESC +``` + +Response: array of `{ id, job_contact_id, created_at, subject, from_addr, received_at, body }`. + +--- + +#### `POST /api/digest-queue` + +Body: `{ job_contact_id: int }` + +- Verify `job_contact_id` exists in `job_contacts` → 404 if not found +- `INSERT OR IGNORE INTO digest_queue (job_contact_id) VALUES (?)` +- Returns `{ ok: true, created: true }` on insert, `{ ok: true, created: false }` if already present +- Never returns 409 — the `created` field is the duplicate signal + +--- + +#### `POST /api/digest-queue/{id}/extract-links` + +Extracts and ranks URLs from the entry's email body. No request body. + +**Important:** this endpoint reads the **raw** `body` from `job_contacts` directly and runs `URL_RE` against it **before** any HTML stripping. `_strip_html()` calls `BeautifulSoup.get_text()`, which extracts visible text only — it does not preserve `href` attribute values. A URL that appears only as an `href` target (e.g., `<a href="https://example.com/jobs/123">Click here</a>`) would be lost after stripping. Running the regex on raw HTML captures those URLs correctly because `URL_RE`'s character exclusion class (`[^\s<>"')\]]`) stops at `"`, so it cleanly extracts href values without matching surrounding markup. 
+ +```python +# Fetch raw body from DB — do NOT strip before extraction +row = db.execute( + "SELECT jc.body FROM digest_queue dq JOIN job_contacts jc ON jc.id = dq.job_contact_id WHERE dq.id = ?", + (digest_id,) +).fetchone() +if not row: + raise HTTPException(404, "Digest entry not found") +return {"links": extract_links(row["body"] or "")} +``` + +**Extraction algorithm:** + +```python +import re +from urllib.parse import urlparse + +JOB_DOMAINS = { + 'greenhouse.io', 'lever.co', 'workday.com', 'linkedin.com', + 'ashbyhq.com', 'smartrecruiters.com', 'icims.com', 'taleo.net', + 'jobvite.com', 'breezy.hr', 'recruitee.com', 'bamboohr.com', + 'myworkdayjobs.com', 'careers.', 'jobs.', +} + +FILTER_PATTERNS = re.compile( + r'(unsubscribe|mailto:|/track/|pixel\.|\.gif|\.png|\.jpg' + r'|/open\?|/click\?|list-unsubscribe)', + re.I +) + +URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.I) + +def _score_url(url: str) -> int: + parsed = urlparse(url) + hostname = parsed.hostname or '' + path = parsed.path.lower() + if FILTER_PATTERNS.search(url): + return -1 # exclude + for domain in JOB_DOMAINS: + if domain in hostname or domain in path: + return 2 # job-likely + return 1 # other + +def extract_links(body: str) -> list[dict]: + if not body: + return [] + seen = set() + results = [] + for m in URL_RE.finditer(body): + url = m.group(0).rstrip('.,;)') + if url in seen: + continue + seen.add(url) + score = _score_url(url) + if score < 0: + continue + # Title hint: last line of text immediately before the URL (up to 60 chars) + start = max(0, m.start() - 60) + hint = body[start:m.start()].strip().split('\n')[-1].strip() + results.append({'url': url, 'score': score, 'hint': hint}) + results.sort(key=lambda x: -x['score']) + return results +``` + +Response: `{ links: [{ url, score, hint }] }` — `score=2` means job-likely (pre-check in UI), `score=1` means other (unchecked). 
+ +--- + +#### `POST /api/digest-queue/{id}/queue-jobs` + +Body: `{ urls: [string] }` + +- 404 if digest entry not found +- 400 if `urls` is empty +- Non-http/https URLs and blank strings are skipped silently (counted as `skipped`) + +Calls `insert_job` from `scripts/db.py`. The actual signature is `insert_job(db_path, job)` where `job` is a dict. The `status` field is **not** passed — the schema default of `'pending'` handles it: + +```python +from scripts.db import insert_job +from datetime import datetime + +queued = 0 +skipped = 0 +for url in body.urls: + if not url or not url.startswith(('http://', 'https://')): + skipped += 1 + continue + result = insert_job(DB_PATH, { + 'url': url, + 'title': '', + 'company': '', + 'source': 'digest', + 'date_found': datetime.utcnow().isoformat(), + }) + if result: + queued += 1 + else: + skipped += 1 # duplicate URL — insert_job returns None on UNIQUE conflict +return {'ok': True, 'queued': queued, 'skipped': skipped} +``` + +--- + +#### `DELETE /api/digest-queue/{id}` + +Removes entry from `digest_queue`. Does not affect `job_contacts`. +Returns `{ ok: true }`. 404 if not found. + +--- + +## Frontend Changes + +### Chip handler update (`InterviewCard.vue` + `InterviewsView.vue`) + +When `newLabel === 'digest'`, the handler fires a **third call** after the existing reclassify + dismiss calls. Note: `sig.id` is `job_contacts.id` — this is the correct value for `job_contact_id` (the `StageSignal.id` field maps directly to the `job_contacts` primary key): + +```typescript +// After existing reclassify + dismiss calls: +if (newLabel === 'digest') { + fetch('/api/digest-queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ job_contact_id: sig.id }), // sig.id === job_contacts.id + }).catch(() => {}) // best-effort; signal already dismissed optimistically +} +``` + +Signal is removed from local array optimistically before this call (same as current dismiss behavior). 
+ +--- + +### New store: `web/src/stores/digest.ts` + +```typescript +import { defineStore } from 'pinia' +import { ref } from 'vue' + +export interface DigestEntry { + id: number + job_contact_id: number + created_at: string + subject: string + from_addr: string | null + received_at: string + body: string | null +} + +export interface DigestLink { + url: string + score: number // 2 = job-likely, 1 = other + hint: string +} + +export const useDigestStore = defineStore('digest', () => { + const entries = ref<DigestEntry[]>([]) + + async function fetchAll() { + const res = await fetch('/api/digest-queue') + entries.value = await res.json() + } + + async function remove(id: number) { + entries.value = entries.value.filter(e => e.id !== id) + await fetch(`/api/digest-queue/${id}`, { method: 'DELETE' }) + } + + return { entries, fetchAll, remove } +}) +``` + +--- + +### New page: `web/src/views/DigestView.vue` + +**Layout — collapsed entry (default):** + +``` +┌─────────────────────────────────────────────┐ +│ ▸ TechCrunch Jobs Weekly │ +│ From: digest@techcrunch.com · Mar 19 │ +│ [Extract] [✕] │ +└─────────────────────────────────────────────┘ +``` + +**Layout — expanded entry (after Extract):** + +``` +┌─────────────────────────────────────────────┐ +│ ▾ LinkedIn Job Digest │ +│ From: jobs@linkedin.com · Mar 18 │ +│ [Re-extract] [✕] │ +│ ┌──────────────────────────────────────┐ │ +│ │ ☑ Senior Engineer — Acme Corp │ │ ← score=2, pre-checked +│ │ greenhouse.io/acme/jobs/456 │ │ +│ │ ☑ Staff Designer — Globex │ │ +│ │ lever.co/globex/staff-designer 
│ │ +│ │ ─── Other links ────────────────── │ │ +│ │ ☐ acme.com/blog/engineering │ │ ← score=1, unchecked +│ │ ☐ linkedin.com/company/acme │ │ +│ └──────────────────────────────────────┘ │ +│ [Queue 2 selected →] │ +└─────────────────────────────────────────────┘ +``` + +**After queueing:** + +Inline confirmation replaces the link list: +``` +✅ 2 jobs queued for review, 1 skipped (already in pipeline) +``` + +Entry remains in the list. `[✕]` removes it. + +**Empty state:** +``` +🦅 No digest emails queued. + When you mark an email as 📰 Digest, it appears here. +``` + +**Component state (per entry, keyed by `DigestEntry.id`):** + +```typescript +const expandedIds = ref<Record<number, boolean>>({}) +const linkResults = ref<Record<number, DigestLink[]>>({}) +const selectedUrls = ref<Record<number, Set<string>>>({}) +const queueResult = ref<Record<number, { queued: number; skipped: number }>>({}) +const extracting = ref<Record<number, boolean>>({}) +const queuing = ref<Record<number, boolean>>({}) +``` + +`selectedUrls` uses `Set<string>`. Toggling a URL uses the spread-copy pattern to trigger Vue 3 reactivity — same pattern as `expandedSignalIds` in `InterviewCard.vue`: + +```typescript +function toggleUrl(entryId: number, url: string) { + const prev = selectedUrls.value[entryId] ?? new Set<string>() + const next = new Set(prev) + next.has(url) ? 
next.delete(url) : next.add(url) + selectedUrls.value = { ...selectedUrls.value, [entryId]: next } +} +``` + +--- + +### Router + Nav + +Add to `web/src/router/index.ts`: +```typescript +{ path: '/digest', component: () => import('../views/DigestView.vue') } +``` + +**AppNav.vue changes:** + +Add `NewspaperIcon` to the existing Heroicons import (the other nav icons are already imported from `@heroicons/vue/24/outline`), then insert the Digest entry into `navLinks` immediately after `Interviews`: + +```typescript +import { NewspaperIcon } from '@heroicons/vue/24/outline' + +const navLinks = [ + { to: '/', icon: HomeIcon, label: 'Home' }, + { to: '/review', icon: ClipboardDocumentListIcon, label: 'Job Review' }, + { to: '/apply', icon: PencilSquareIcon, label: 'Apply' }, + { to: '/interviews', icon: CalendarDaysIcon, label: 'Interviews' }, + { to: '/digest', icon: NewspaperIcon, label: 'Digest' }, // NEW + { to: '/prep', icon: LightBulbIcon, label: 'Interview Prep' }, + { to: '/survey', icon: MagnifyingGlassIcon, label: 'Survey' }, +] +``` + +`navLinks` remains a static array. The badge count is rendered as a separate reactive expression in the template alongside the Digest link — keep `navLinks` free of badge state and add the digest store separately: + +```typescript +// In AppNav.vue