Compare commits
11 commits
2514173c6f
...
a6d787fed2
| Author | SHA1 | Date | |
|---|---|---|---|
| a6d787fed2 | |||
| be2690af7b | |||
| a30d86ddf5 | |||
| 6dfa4a0949 | |||
| 93fb452941 | |||
| e0063e237b | |||
| bf23987c11 | |||
| 87b7892e43 | |||
| e0f69a9db6 | |||
| ddf07c52ab | |||
| 64db154a87 |
11 changed files with 2112 additions and 71 deletions
7
PRIVACY.md
Normal file
7
PRIVACY.md
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
# Privacy Policy
|
||||||
|
|
||||||
|
CircuitForge LLC's privacy policy applies to this product and is published at:
|
||||||
|
|
||||||
|
**<https://circuitforge.tech/privacy>**
|
||||||
|
|
||||||
|
Last reviewed: March 2026.
|
||||||
41
app/Home.py
41
app/Home.py
|
|
@ -25,17 +25,45 @@ from scripts.task_runner import submit_task
|
||||||
|
|
||||||
init_db(DEFAULT_DB)
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
def _email_configured() -> bool:
|
||||||
|
_e = Path(__file__).parent.parent / "config" / "email.yaml"
|
||||||
|
if not _e.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_e.read_text()) or {}
|
||||||
|
return bool(_cfg.get("username") or _cfg.get("user") or _cfg.get("imap_host"))
|
||||||
|
|
||||||
|
def _notion_configured() -> bool:
|
||||||
|
_n = Path(__file__).parent.parent / "config" / "notion.yaml"
|
||||||
|
if not _n.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_n.read_text()) or {}
|
||||||
|
return bool(_cfg.get("token"))
|
||||||
|
|
||||||
|
def _keywords_configured() -> bool:
|
||||||
|
_k = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||||||
|
if not _k.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_k.read_text()) or {}
|
||||||
|
return bool(_cfg.get("keywords") or _cfg.get("required") or _cfg.get("preferred"))
|
||||||
|
|
||||||
_SETUP_BANNERS = [
|
_SETUP_BANNERS = [
|
||||||
{"key": "connect_cloud", "text": "Connect a cloud service for resume/cover letter storage",
|
{"key": "connect_cloud", "text": "Connect a cloud service for resume/cover letter storage",
|
||||||
"link_label": "Settings → Integrations"},
|
"link_label": "Settings → Integrations",
|
||||||
|
"done": _notion_configured},
|
||||||
{"key": "setup_email", "text": "Set up email sync to catch recruiter outreach",
|
{"key": "setup_email", "text": "Set up email sync to catch recruiter outreach",
|
||||||
"link_label": "Settings → Email"},
|
"link_label": "Settings → Email",
|
||||||
|
"done": _email_configured},
|
||||||
{"key": "setup_email_labels", "text": "Set up email label filters for auto-classification",
|
{"key": "setup_email_labels", "text": "Set up email label filters for auto-classification",
|
||||||
"link_label": "Settings → Email (label guide)"},
|
"link_label": "Settings → Email (label guide)",
|
||||||
|
"done": _email_configured},
|
||||||
{"key": "tune_mission", "text": "Tune your mission preferences for better cover letters",
|
{"key": "tune_mission", "text": "Tune your mission preferences for better cover letters",
|
||||||
"link_label": "Settings → My Profile"},
|
"link_label": "Settings → My Profile"},
|
||||||
{"key": "configure_keywords", "text": "Configure keywords and blocklist for smarter search",
|
{"key": "configure_keywords", "text": "Configure keywords and blocklist for smarter search",
|
||||||
"link_label": "Settings → Search"},
|
"link_label": "Settings → Search",
|
||||||
|
"done": _keywords_configured},
|
||||||
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
||||||
"link_label": "Settings → Fine-Tune"},
|
"link_label": "Settings → Fine-Tune"},
|
||||||
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
||||||
|
|
@ -513,7 +541,10 @@ with st.expander("⚠️ Danger Zone", expanded=False):
|
||||||
# ── Setup banners ─────────────────────────────────────────────────────────────
|
# ── Setup banners ─────────────────────────────────────────────────────────────
|
||||||
if _profile and _profile.wizard_complete:
|
if _profile and _profile.wizard_complete:
|
||||||
_dismissed = set(_profile.dismissed_banners)
|
_dismissed = set(_profile.dismissed_banners)
|
||||||
_pending_banners = [b for b in _SETUP_BANNERS if b["key"] not in _dismissed]
|
_pending_banners = [
|
||||||
|
b for b in _SETUP_BANNERS
|
||||||
|
if b["key"] not in _dismissed and not b.get("done", lambda: False)()
|
||||||
|
]
|
||||||
if _pending_banners:
|
if _pending_banners:
|
||||||
st.divider()
|
st.divider()
|
||||||
st.markdown("#### Finish setting up Peregrine")
|
st.markdown("#### Finish setting up Peregrine")
|
||||||
|
|
|
||||||
|
|
@ -36,47 +36,18 @@ def save_yaml(path: Path, data: dict) -> None:
|
||||||
path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
||||||
|
|
||||||
|
|
||||||
def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
|
from scripts.suggest_helpers import (
|
||||||
"""Call LLM to suggest additional job titles and exclude keywords."""
|
suggest_search_terms as _suggest_search_terms_impl,
|
||||||
import json
|
suggest_resume_keywords as _suggest_resume_keywords,
|
||||||
import re
|
)
|
||||||
from scripts.llm_router import LLMRouter
|
|
||||||
|
|
||||||
resume_context = ""
|
def _suggest_search_terms(current_titles, resume_path, blocklist=None, user_profile=None):
|
||||||
if resume_path.exists():
|
return _suggest_search_terms_impl(
|
||||||
resume = load_yaml(resume_path)
|
current_titles,
|
||||||
lines = []
|
resume_path,
|
||||||
for exp in (resume.get("experience_details") or [])[:3]:
|
blocklist or {},
|
||||||
pos = exp.get("position", "")
|
user_profile or {},
|
||||||
co = exp.get("company", "")
|
)
|
||||||
skills = ", ".join((exp.get("skills_acquired") or [])[:5])
|
|
||||||
lines.append(f"- {pos} at {co}: {skills}")
|
|
||||||
resume_context = "\n".join(lines)
|
|
||||||
|
|
||||||
titles_str = "\n".join(f"- {t}" for t in current_titles)
|
|
||||||
prompt = f"""You are helping a job seeker optimize their search criteria.
|
|
||||||
|
|
||||||
Their background (from resume):
|
|
||||||
{resume_context or "Customer success and technical account management leader"}
|
|
||||||
|
|
||||||
Current job titles being searched:
|
|
||||||
{titles_str}
|
|
||||||
|
|
||||||
Suggest:
|
|
||||||
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
|
|
||||||
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)
|
|
||||||
|
|
||||||
Return ONLY valid JSON in this exact format:
|
|
||||||
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""
|
|
||||||
|
|
||||||
result = LLMRouter().complete(prompt).strip()
|
|
||||||
m = re.search(r"\{.*\}", result, re.DOTALL)
|
|
||||||
if m:
|
|
||||||
try:
|
|
||||||
return json.loads(m.group())
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return {"suggested_titles": [], "suggested_excludes": []}
|
|
||||||
|
|
||||||
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
||||||
|
|
||||||
|
|
@ -324,6 +295,18 @@ with tab_search:
|
||||||
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
||||||
st.session_state["_sp_hash"] = _sp_hash
|
st.session_state["_sp_hash"] = _sp_hash
|
||||||
|
|
||||||
|
# Apply any pending programmatic updates BEFORE widgets are instantiated.
|
||||||
|
# Streamlit forbids writing to a widget's key after it renders on the same pass;
|
||||||
|
# button handlers write to *_pending keys instead, consumed here on the next pass.
|
||||||
|
for _pend, _wkey in [("_sp_titles_pending", "_sp_titles_multi"),
|
||||||
|
("_sp_locs_pending", "_sp_locations_multi"),
|
||||||
|
("_sp_new_title_pending", "_sp_new_title"),
|
||||||
|
("_sp_paste_titles_pending", "_sp_paste_titles"),
|
||||||
|
("_sp_new_loc_pending", "_sp_new_loc"),
|
||||||
|
("_sp_paste_locs_pending", "_sp_paste_locs")]:
|
||||||
|
if _pend in st.session_state:
|
||||||
|
st.session_state[_wkey] = st.session_state.pop(_pend)
|
||||||
|
|
||||||
# ── Titles ────────────────────────────────────────────────────────────────
|
# ── Titles ────────────────────────────────────────────────────────────────
|
||||||
_title_row, _suggest_btn_col = st.columns([4, 1])
|
_title_row, _suggest_btn_col = st.columns([4, 1])
|
||||||
with _title_row:
|
with _title_row:
|
||||||
|
|
@ -331,7 +314,7 @@ with tab_search:
|
||||||
with _suggest_btn_col:
|
with _suggest_btn_col:
|
||||||
st.write("")
|
st.write("")
|
||||||
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
||||||
help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")
|
help="Ask the LLM to suggest additional titles and smarter exclude keywords — using your blocklist, mission values, and career background.")
|
||||||
|
|
||||||
st.multiselect(
|
st.multiselect(
|
||||||
"Job titles",
|
"Job titles",
|
||||||
|
|
@ -355,8 +338,8 @@ with tab_search:
|
||||||
st.session_state["_sp_title_options"] = _opts
|
st.session_state["_sp_title_options"] = _opts
|
||||||
if _t not in _sel:
|
if _t not in _sel:
|
||||||
_sel.append(_t)
|
_sel.append(_t)
|
||||||
st.session_state["_sp_titles_multi"] = _sel
|
st.session_state["_sp_titles_pending"] = _sel
|
||||||
st.session_state["_sp_new_title"] = ""
|
st.session_state["_sp_new_title_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
with st.expander("📋 Paste a list of titles"):
|
with st.expander("📋 Paste a list of titles"):
|
||||||
st.text_area("One title per line", key="_sp_paste_titles", height=80, label_visibility="collapsed",
|
st.text_area("One title per line", key="_sp_paste_titles", height=80, label_visibility="collapsed",
|
||||||
|
|
@ -371,23 +354,34 @@ with tab_search:
|
||||||
if _t not in _sel:
|
if _t not in _sel:
|
||||||
_sel.append(_t)
|
_sel.append(_t)
|
||||||
st.session_state["_sp_title_options"] = _opts
|
st.session_state["_sp_title_options"] = _opts
|
||||||
st.session_state["_sp_titles_multi"] = _sel
|
st.session_state["_sp_titles_pending"] = _sel
|
||||||
st.session_state["_sp_paste_titles"] = ""
|
st.session_state["_sp_paste_titles_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
# ── LLM suggestions panel ────────────────────────────────────────────────
|
# ── LLM suggestions panel ────────────────────────────────────────────────
|
||||||
if _run_suggest:
|
if _run_suggest:
|
||||||
_current_titles = list(st.session_state.get("_sp_titles_multi", []))
|
_current_titles = list(st.session_state.get("_sp_titles_multi", []))
|
||||||
|
_blocklist = load_yaml(BLOCKLIST_CFG)
|
||||||
|
_user_profile = load_yaml(USER_CFG)
|
||||||
with st.spinner("Asking LLM for suggestions…"):
|
with st.spinner("Asking LLM for suggestions…"):
|
||||||
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH)
|
try:
|
||||||
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH, _blocklist, _user_profile)
|
||||||
_opts = list(st.session_state.get("_sp_title_options", []))
|
except RuntimeError as _e:
|
||||||
for _t in suggestions.get("suggested_titles", []):
|
st.warning(
|
||||||
if _t not in _opts:
|
f"No LLM backend available: {_e}. "
|
||||||
_opts.append(_t)
|
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||||
st.session_state["_sp_title_options"] = _opts
|
icon="⚠️",
|
||||||
st.session_state["_sp_suggestions"] = suggestions
|
)
|
||||||
st.rerun()
|
suggestions = None
|
||||||
|
if suggestions is not None:
|
||||||
|
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
||||||
|
_opts = list(st.session_state.get("_sp_title_options", []))
|
||||||
|
for _t in suggestions.get("suggested_titles", []):
|
||||||
|
if _t not in _opts:
|
||||||
|
_opts.append(_t)
|
||||||
|
st.session_state["_sp_title_options"] = _opts
|
||||||
|
st.session_state["_sp_suggestions"] = suggestions
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
if st.session_state.get("_sp_suggestions"):
|
if st.session_state.get("_sp_suggestions"):
|
||||||
sugg = st.session_state["_sp_suggestions"]
|
sugg = st.session_state["_sp_suggestions"]
|
||||||
|
|
@ -436,8 +430,8 @@ with tab_search:
|
||||||
st.session_state["_sp_loc_options"] = _opts
|
st.session_state["_sp_loc_options"] = _opts
|
||||||
if _l not in _sel:
|
if _l not in _sel:
|
||||||
_sel.append(_l)
|
_sel.append(_l)
|
||||||
st.session_state["_sp_locations_multi"] = _sel
|
st.session_state["_sp_locs_pending"] = _sel
|
||||||
st.session_state["_sp_new_loc"] = ""
|
st.session_state["_sp_new_loc_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
with st.expander("📋 Paste a list of locations"):
|
with st.expander("📋 Paste a list of locations"):
|
||||||
st.text_area("One location per line", key="_sp_paste_locs", height=80, label_visibility="collapsed",
|
st.text_area("One location per line", key="_sp_paste_locs", height=80, label_visibility="collapsed",
|
||||||
|
|
@ -452,8 +446,8 @@ with tab_search:
|
||||||
if _l not in _sel:
|
if _l not in _sel:
|
||||||
_sel.append(_l)
|
_sel.append(_l)
|
||||||
st.session_state["_sp_loc_options"] = _opts
|
st.session_state["_sp_loc_options"] = _opts
|
||||||
st.session_state["_sp_locations_multi"] = _sel
|
st.session_state["_sp_locs_pending"] = _sel
|
||||||
st.session_state["_sp_paste_locs"] = ""
|
st.session_state["_sp_paste_locs_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
st.subheader("Exclude Keywords")
|
st.subheader("Exclude Keywords")
|
||||||
|
|
@ -747,11 +741,33 @@ with tab_resume:
|
||||||
st.balloons()
|
st.balloons()
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.subheader("🏷️ Skills & Keywords")
|
_kw_header_col, _kw_btn_col = st.columns([5, 1])
|
||||||
st.caption(
|
with _kw_header_col:
|
||||||
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
st.subheader("🏷️ Skills & Keywords")
|
||||||
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
st.caption(
|
||||||
)
|
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
||||||
|
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
||||||
|
)
|
||||||
|
with _kw_btn_col:
|
||||||
|
st.write("")
|
||||||
|
st.write("")
|
||||||
|
_run_kw_suggest = st.button(
|
||||||
|
"✨ Suggest", key="kw_suggest_btn",
|
||||||
|
help="Ask the LLM to suggest skills, domains, and keywords based on your resume.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if _run_kw_suggest:
|
||||||
|
_kw_current = load_yaml(KEYWORDS_CFG) if KEYWORDS_CFG.exists() else {}
|
||||||
|
with st.spinner("Asking LLM for keyword suggestions…"):
|
||||||
|
try:
|
||||||
|
_kw_sugg = _suggest_resume_keywords(RESUME_PATH, _kw_current)
|
||||||
|
st.session_state["_kw_suggestions"] = _kw_sugg
|
||||||
|
except RuntimeError as _e:
|
||||||
|
st.warning(
|
||||||
|
f"No LLM backend available: {_e}. "
|
||||||
|
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||||
|
icon="⚠️",
|
||||||
|
)
|
||||||
|
|
||||||
from scripts.skills_utils import load_suggestions as _load_sugg, filter_tag as _filter_tag
|
from scripts.skills_utils import load_suggestions as _load_sugg, filter_tag as _filter_tag
|
||||||
|
|
||||||
|
|
@ -815,6 +831,33 @@ with tab_resume:
|
||||||
save_yaml(KEYWORDS_CFG, kw_data)
|
save_yaml(KEYWORDS_CFG, kw_data)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
|
# ── LLM keyword suggestion chips ──────────────────────────────────────
|
||||||
|
_kw_sugg_data = st.session_state.get("_kw_suggestions")
|
||||||
|
if _kw_sugg_data:
|
||||||
|
_KW_ICONS = {"skills": "🛠️", "domains": "🏢", "keywords": "🔑"}
|
||||||
|
_any_shown = False
|
||||||
|
for _cat, _icon in _KW_ICONS.items():
|
||||||
|
_cat_sugg = [t for t in _kw_sugg_data.get(_cat, [])
|
||||||
|
if t not in kw_data.get(_cat, [])]
|
||||||
|
if not _cat_sugg:
|
||||||
|
continue
|
||||||
|
_any_shown = True
|
||||||
|
st.caption(f"**{_icon} {_cat.capitalize()} suggestions** — click to add:")
|
||||||
|
_chip_cols = st.columns(min(len(_cat_sugg), 4))
|
||||||
|
for _i, _tag in enumerate(_cat_sugg):
|
||||||
|
with _chip_cols[_i % 4]:
|
||||||
|
if st.button(f"+ {_tag}", key=f"kw_sugg_{_cat}_{_i}"):
|
||||||
|
_new_list = list(kw_data.get(_cat, [])) + [_tag]
|
||||||
|
kw_data[_cat] = _new_list
|
||||||
|
save_yaml(KEYWORDS_CFG, kw_data)
|
||||||
|
_kw_sugg_data[_cat] = [t for t in _kw_sugg_data[_cat] if t != _tag]
|
||||||
|
st.session_state["_kw_suggestions"] = _kw_sugg_data
|
||||||
|
st.rerun()
|
||||||
|
if _any_shown:
|
||||||
|
if st.button("✕ Clear suggestions", key="kw_clear_sugg"):
|
||||||
|
st.session_state.pop("_kw_suggestions", None)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
# ── System tab ────────────────────────────────────────────────────────────────
|
# ── System tab ────────────────────────────────────────────────────────────────
|
||||||
with tab_system:
|
with tab_system:
|
||||||
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
||||||
|
|
@ -1015,8 +1058,10 @@ with tab_system:
|
||||||
with st.expander("🔌 Services", expanded=True):
|
with st.expander("🔌 Services", expanded=True):
|
||||||
import subprocess as _sp
|
import subprocess as _sp
|
||||||
import shutil as _shutil
|
import shutil as _shutil
|
||||||
|
import os as _os
|
||||||
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
||||||
COMPOSE_DIR = str(Path(__file__).parent.parent.parent)
|
COMPOSE_DIR = str(Path(__file__).parent.parent.parent)
|
||||||
|
_compose_env = {**_os.environ, "COMPOSE_PROJECT_NAME": "peregrine"}
|
||||||
_docker_available = bool(_shutil.which("docker"))
|
_docker_available = bool(_shutil.which("docker"))
|
||||||
_sys_profile_name = _profile.inference_profile if _profile else "remote"
|
_sys_profile_name = _profile.inference_profile if _profile else "remote"
|
||||||
SYS_SERVICES = [
|
SYS_SERVICES = [
|
||||||
|
|
@ -1108,7 +1153,7 @@ with tab_system:
|
||||||
elif up:
|
elif up:
|
||||||
if st.button("⏹ Stop", key=f"sys_svc_stop_{svc['port']}", use_container_width=True):
|
if st.button("⏹ Stop", key=f"sys_svc_stop_{svc['port']}", use_container_width=True):
|
||||||
with st.spinner(f"Stopping {svc['name']}…"):
|
with st.spinner(f"Stopping {svc['name']}…"):
|
||||||
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"])
|
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||||
st.success("Stopped.") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
st.success("Stopped.") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
else:
|
else:
|
||||||
|
|
@ -1119,7 +1164,7 @@ with tab_system:
|
||||||
_start_cmd.append(_sel)
|
_start_cmd.append(_sel)
|
||||||
if st.button("▶ Start", key=f"sys_svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
if st.button("▶ Start", key=f"sys_svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
||||||
with st.spinner(f"Starting {svc['name']}…"):
|
with st.spinner(f"Starting {svc['name']}…"):
|
||||||
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"])
|
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||||
st.success("Started!") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
st.success("Started!") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ services:
|
||||||
- ./config:/app/config
|
- ./config:/app/config
|
||||||
- ./data:/app/data
|
- ./data:/app/data
|
||||||
- ${DOCS_DIR:-~/Documents/JobSearch}:/docs
|
- ${DOCS_DIR:-~/Documents/JobSearch}:/docs
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
- /usr/bin/docker:/usr/bin/docker:ro
|
||||||
environment:
|
environment:
|
||||||
- STAGING_DB=/app/data/staging.db
|
- STAGING_DB=/app/data/staging.db
|
||||||
- DOCS_DIR=/docs
|
- DOCS_DIR=/docs
|
||||||
|
|
|
||||||
242
docs/plans/2026-03-05-digest-parsers-design.md
Normal file
242
docs/plans/2026-03-05-digest-parsers-design.md
Normal file
|
|
@ -0,0 +1,242 @@
|
||||||
|
# Digest Email Parsers — Design
|
||||||
|
|
||||||
|
**Date:** 2026-03-05
|
||||||
|
**Products:** Peregrine (primary), Avocet (bucket)
|
||||||
|
**Status:** Design approved, ready for implementation planning
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
Peregrine's `imap_sync.py` can extract leads from digest emails, but only for LinkedIn — the
|
||||||
|
parser is hardcoded inline with no extension point. Adzuna and The Ladders digest emails are
|
||||||
|
unhandled. Additionally, any digest email from an unknown sender is silently dropped with no
|
||||||
|
way to collect samples for building new parsers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solution Overview
|
||||||
|
|
||||||
|
Two complementary changes:
|
||||||
|
|
||||||
|
1. **`peregrine/scripts/digest_parsers.py`** — a standalone parser module with a sender registry
|
||||||
|
and dispatcher. `imap_sync.py` calls a single function; the registry handles dispatch.
|
||||||
|
LinkedIn parser moves here; Adzuna and Ladders parsers are built against real IMAP samples.
|
||||||
|
|
||||||
|
2. **Avocet digest bucket** — when a user labels an email as `digest` in the Avocet label UI,
|
||||||
|
the email is appended to `data/digest_samples.jsonl`. This file is the corpus for building
|
||||||
|
and testing new parsers for senders not yet in the registry.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Production path (Peregrine)
|
||||||
|
|
||||||
|
```
|
||||||
|
imap_sync._scan_unmatched_leads()
|
||||||
|
│
|
||||||
|
├─ parse_digest(from_addr, body)
|
||||||
|
│ │
|
||||||
|
│ ├─ None → unknown sender → fall through to LLM extraction (unchanged)
|
||||||
|
│ ├─ [] → known sender, nothing found → skip
|
||||||
|
│ └─ [...] → jobs found → insert_job() + submit_task("scrape_url")
|
||||||
|
│
|
||||||
|
└─ continue (digest email consumed; does not reach LLM path)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sample collection path (Avocet)
|
||||||
|
|
||||||
|
```
|
||||||
|
Avocet label UI
|
||||||
|
│
|
||||||
|
└─ label == "digest"
|
||||||
|
│
|
||||||
|
└─ append to data/digest_samples.jsonl
|
||||||
|
│
|
||||||
|
└─ used as reference for building new parsers
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Module: `peregrine/scripts/digest_parsers.py`
|
||||||
|
|
||||||
|
### Parser interface
|
||||||
|
|
||||||
|
Each parser function:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_<source>(body: str) -> list[dict]
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns zero or more job dicts:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"title": str, # job title
|
||||||
|
"company": str, # company name
|
||||||
|
"location": str, # location string (may be empty)
|
||||||
|
"url": str, # canonical URL, tracking params stripped
|
||||||
|
"source": str, # "linkedin" | "adzuna" | "theladders"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dispatcher
|
||||||
|
|
||||||
|
```python
|
||||||
|
DIGEST_PARSERS: dict[str, tuple[str, Callable[[str], list[dict]]]] = {
|
||||||
|
"jobalerts@linkedin.com": ("linkedin", parse_linkedin),
|
||||||
|
"noreply@adzuna.com": ("adzuna", parse_adzuna),
|
||||||
|
"noreply@theladders.com": ("theladders", parse_theladders),
|
||||||
|
}
|
||||||
|
|
||||||
|
def parse_digest(from_addr: str, body: str) -> list[dict] | None:
|
||||||
|
"""
|
||||||
|
Dispatch to the appropriate parser based on sender address.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None — no parser matched (not a known digest sender)
|
||||||
|
[] — parser matched, no extractable jobs found
|
||||||
|
[dict, ...] — one dict per job card extracted
|
||||||
|
"""
|
||||||
|
addr = from_addr.lower()
|
||||||
|
for sender, (source, parse_fn) in DIGEST_PARSERS.items():
|
||||||
|
if sender in addr:
|
||||||
|
return parse_fn(body)
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender matching is a substring check, tolerant of display-name wrappers
|
||||||
|
(`"LinkedIn <jobalerts@linkedin.com>"` matches correctly).
|
||||||
|
|
||||||
|
### Parsers
|
||||||
|
|
||||||
|
**`parse_linkedin`** — moved verbatim from `imap_sync.parse_linkedin_alert()`, renamed.
|
||||||
|
No behavior change.
|
||||||
|
|
||||||
|
**`parse_adzuna`** — built against real Adzuna digest email bodies pulled from the
|
||||||
|
configured IMAP account during implementation. Expected format: job blocks separated
|
||||||
|
by consistent delimiters with title, company, location, and a trackable URL per block.
|
||||||
|
|
||||||
|
**`parse_theladders`** — same approach. The Ladders already has a web scraper in
|
||||||
|
`scripts/custom_boards/theladders.py`; URL canonicalization patterns from there apply here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Changes to `imap_sync.py`
|
||||||
|
|
||||||
|
Replace the LinkedIn-specific block in `_scan_unmatched_leads()` (~lines 561–585):
|
||||||
|
|
||||||
|
**Before:**
|
||||||
|
```python
|
||||||
|
if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
|
||||||
|
cards = parse_linkedin_alert(parsed["body"])
|
||||||
|
for card in cards:
|
||||||
|
# ... LinkedIn-specific insert ...
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**After:**
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_digest # top of file
|
||||||
|
|
||||||
|
cards = parse_digest(parsed["from_addr"], parsed["body"])
|
||||||
|
if cards is not None:
|
||||||
|
for card in cards:
|
||||||
|
if card["url"] in existing_urls:
|
||||||
|
continue
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": card["title"],
|
||||||
|
"company": card["company"],
|
||||||
|
"url": card["url"],
|
||||||
|
"source": card["source"],
|
||||||
|
"location": card["location"],
|
||||||
|
"is_remote": 0,
|
||||||
|
"salary": "",
|
||||||
|
"description": "",
|
||||||
|
"date_found": datetime.now().isoformat()[:10],
|
||||||
|
})
|
||||||
|
if job_id:
|
||||||
|
submit_task(db_path, "scrape_url", job_id)
|
||||||
|
existing_urls.add(card["url"])
|
||||||
|
new_leads += 1
|
||||||
|
print(f"[imap] digest ({card['source']}) → {card['company']} — {card['title']}")
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
`parse_digest` returning `None` falls through to the existing LLM extraction path — all
|
||||||
|
non-digest recruitment emails are completely unaffected.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Avocet: Digest Bucket
|
||||||
|
|
||||||
|
### File
|
||||||
|
|
||||||
|
`avocet/data/digest_samples.jsonl` — gitignored. An `.example` entry is committed.
|
||||||
|
|
||||||
|
Schema matches the existing label queue (JSONL on-disk schema):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"subject": "...", "body": "...", "from_addr": "...", "date": "...", "account": "..."}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
|
||||||
|
In `app/label_tool.py` and `app/api.py`: when a `digest` label is applied, append the
|
||||||
|
email to `digest_samples.jsonl` alongside the normal write to `email_score.jsonl`.
|
||||||
|
|
||||||
|
No Peregrine dependency — if the file path doesn't exist the `data/` directory is created
|
||||||
|
automatically. Avocet remains fully standalone.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
When a new digest sender appears in the wild:
|
||||||
|
1. Label representative emails as `digest` in Avocet → samples land in `digest_samples.jsonl`
|
||||||
|
2. Inspect samples, write `parse_<source>(body)` in `digest_parsers.py`
|
||||||
|
3. Add the sender string to `DIGEST_PARSERS`
|
||||||
|
4. Add fixture test in `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
- Fixture bodies sourced from real IMAP samples (anonymized company names / URLs acceptable)
|
||||||
|
- Each parser: valid body → expected cards returned
|
||||||
|
- Each parser: empty / malformed body → `[]`, no exception
|
||||||
|
- Dispatcher: known sender → correct parser invoked
|
||||||
|
- Dispatcher: unknown sender → `None`
|
||||||
|
- URL canonicalization: tracking params stripped, canonical form asserted
|
||||||
|
- Dedup within digest: same URL appearing twice in one email → one card
|
||||||
|
|
||||||
|
### `avocet/tests/test_digest_bucket.py`
|
||||||
|
|
||||||
|
- `digest` label → row appended to `digest_samples.jsonl`
|
||||||
|
- Any other label → `digest_samples.jsonl` not touched
|
||||||
|
- First write creates `data/` directory if absent
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files Changed / Created
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `peregrine/scripts/digest_parsers.py` | **New** — parser module |
|
||||||
|
| `peregrine/scripts/imap_sync.py` | Replace inline LinkedIn block with `parse_digest()` call |
|
||||||
|
| `peregrine/tests/test_digest_parsers.py` | **New** — parser unit tests |
|
||||||
|
| `avocet/app/label_tool.py` | Append to `digest_samples.jsonl` on `digest` label |
|
||||||
|
| `avocet/app/api.py` | Same — digest bucket write in label endpoint |
|
||||||
|
| `avocet/tests/test_digest_bucket.py` | **New** — bucket write tests |
|
||||||
|
| `avocet/data/digest_samples.jsonl.example` | **New** — committed sample for reference |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Out of Scope
|
||||||
|
|
||||||
|
- Avocet → Peregrine direct import trigger (deferred; bucket is sufficient for now)
|
||||||
|
- `background_tasks` integration for digest re-processing (not needed with bucket approach)
|
||||||
|
- HTML digest parsing (all three senders send plain-text alerts; revisit if needed)
|
||||||
docs/plans/2026-03-05-digest-parsers-plan.md — new file, 897 lines (@@ -0,0 +1,897 @@)
|
||||||
|
# Digest Email Parsers Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Extract job listings from LinkedIn, Adzuna, and The Ladders digest emails into Peregrine leads, with an Avocet bucket that collects digest samples for future parser development.
|
||||||
|
|
||||||
|
**Architecture:** New `peregrine/scripts/digest_parsers.py` exposes a `parse_digest(from_addr, body)` dispatcher backed by a sender registry. `imap_sync.py` replaces its inline LinkedIn block with one dispatcher call. Avocet's two label paths (`label_tool.py` + `api.py`) append digest-labeled emails to `data/digest_samples.jsonl`. Adzuna and Ladders parsers are built from real IMAP samples fetched in Task 2.
|
||||||
|
|
||||||
|
**Tech Stack:** Python stdlib only — `re`, `json`, `pathlib`. No new dependencies.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Create `digest_parsers.py` with dispatcher + LinkedIn parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `peregrine/scripts/digest_parsers.py`
|
||||||
|
- Create: `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
`parse_linkedin_alert()` currently lives inline in `imap_sync.py`. We move it here (renamed
|
||||||
|
`parse_linkedin`) and wrap it in a dispatcher. All other parsers plug into the same registry.
|
||||||
|
|
||||||
|
Run all tests with:
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Create `peregrine/tests/test_digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for digest email parser registry."""
|
||||||
|
import pytest
|
||||||
|
from scripts.digest_parsers import parse_digest, parse_linkedin
|
||||||
|
|
||||||
|
# ── LinkedIn fixture ──────────────────────────────────────────────────────────
|
||||||
|
# Mirrors the plain-text format LinkedIn Job Alert emails actually send.
|
||||||
|
# Each job block is separated by a line of 10+ dashes.
|
||||||
|
LINKEDIN_BODY = """\
|
||||||
|
Software Engineer
|
||||||
|
Acme Corp
|
||||||
|
San Francisco, CA
|
||||||
|
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/1111111111/?refId=abc&trackingId=xyz
|
||||||
|
|
||||||
|
--------------------------------------------------
|
||||||
|
Senior Developer
|
||||||
|
Widget Inc
|
||||||
|
Remote
|
||||||
|
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/2222222222/?refId=def
|
||||||
|
"""
|
||||||
|
|
||||||
|
LINKEDIN_BODY_EMPTY = "No jobs matched your alert this week."
|
||||||
|
|
||||||
|
LINKEDIN_BODY_NO_URL = """\
|
||||||
|
Software Engineer
|
||||||
|
Acme Corp
|
||||||
|
San Francisco, CA
|
||||||
|
|
||||||
|
--------------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_linkedin_sender():
|
||||||
|
cards = parse_digest("LinkedIn <jobalerts@linkedin.com>", LINKEDIN_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_unknown_sender_returns_none():
|
||||||
|
result = parse_digest("noreply@randomboard.com", LINKEDIN_BODY)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_case_insensitive_sender():
|
||||||
|
cards = parse_digest("JOBALERTS@LINKEDIN.COM", LINKEDIN_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_returns_correct_fields():
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY)
|
||||||
|
assert cards[0]["title"] == "Software Engineer"
|
||||||
|
assert cards[0]["company"] == "Acme Corp"
|
||||||
|
assert cards[0]["location"] == "San Francisco, CA"
|
||||||
|
assert cards[0]["source"] == "linkedin"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_url_canonicalized():
|
||||||
|
"""Tracking params stripped; canonical jobs/view/<id>/ form."""
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY)
|
||||||
|
assert cards[0]["url"] == "https://www.linkedin.com/jobs/view/1111111111/"
|
||||||
|
assert "refId" not in cards[0]["url"]
|
||||||
|
assert "trackingId" not in cards[0]["url"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_empty_body_returns_empty_list():
|
||||||
|
assert parse_linkedin(LINKEDIN_BODY_EMPTY) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_block_without_url_skipped():
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY_NO_URL)
|
||||||
|
assert cards == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name 'parse_digest'`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 3: Write `digest_parsers.py`**
|
||||||
|
|
||||||
|
Create `peregrine/scripts/digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Digest email parser registry for Peregrine.
|
||||||
|
|
||||||
|
Each parser extracts job listings from a known digest sender's plain-text body.
|
||||||
|
New parsers are added by decorating with @_register(sender_substring, source_name).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from scripts.digest_parsers import parse_digest
|
||||||
|
|
||||||
|
cards = parse_digest(from_addr, body)
|
||||||
|
# None → unknown sender (fall through to LLM path)
|
||||||
|
# [] → known sender, nothing extractable
|
||||||
|
# [...] → list of {title, company, location, url, source} dicts
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
# ── Registry ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Maps sender substring (lowercased) → (source_name, parse_fn)
|
||||||
|
DIGEST_PARSERS: dict[str, tuple[str, Callable[[str], list[dict]]]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _register(sender: str, source: str):
|
||||||
|
"""Decorator to register a parser for a given sender substring."""
|
||||||
|
def decorator(fn: Callable[[str], list[dict]]):
|
||||||
|
DIGEST_PARSERS[sender.lower()] = (source, fn)
|
||||||
|
return fn
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def parse_digest(from_addr: str, body: str) -> list[dict] | None:
|
||||||
|
"""Dispatch to the appropriate parser based on sender address.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None — no parser matched (caller should use LLM fallback)
|
||||||
|
[] — known sender, no extractable jobs
|
||||||
|
[dict, ...] — one dict per job card with keys:
|
||||||
|
title, company, location, url, source
|
||||||
|
"""
|
||||||
|
addr = from_addr.lower()
|
||||||
|
for sender, (source, parse_fn) in DIGEST_PARSERS.items():
|
||||||
|
if sender in addr:
|
||||||
|
return parse_fn(body)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Shared helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_LINKEDIN_SKIP_PHRASES = {
|
||||||
|
"promoted", "easily apply", "apply now", "job alert",
|
||||||
|
"unsubscribe", "linkedin corporation",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── LinkedIn Job Alert ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("jobalerts@linkedin.com", "linkedin")
|
||||||
|
def parse_linkedin(body: str) -> list[dict]:
|
||||||
|
"""Parse LinkedIn Job Alert digest email body.
|
||||||
|
|
||||||
|
Blocks are separated by lines of 10+ dashes. Each block contains:
|
||||||
|
Line 0: job title
|
||||||
|
Line 1: company
|
||||||
|
Line 2: location (optional)
|
||||||
|
'View job: <url>' → canonicalized to /jobs/view/<id>/
|
||||||
|
"""
|
||||||
|
jobs = []
|
||||||
|
blocks = re.split(r"\n\s*-{10,}\s*\n", body)
|
||||||
|
for block in blocks:
|
||||||
|
lines = [ln.strip() for ln in block.strip().splitlines() if ln.strip()]
|
||||||
|
|
||||||
|
url = None
|
||||||
|
for line in lines:
|
||||||
|
m = re.search(r"View job:\s*(https?://\S+)", line, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
raw_url = m.group(1)
|
||||||
|
job_id_m = re.search(r"/jobs/view/(\d+)", raw_url)
|
||||||
|
if job_id_m:
|
||||||
|
url = f"https://www.linkedin.com/jobs/view/{job_id_m.group(1)}/"
|
||||||
|
break
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
content = [
|
||||||
|
ln for ln in lines
|
||||||
|
if not any(p in ln.lower() for p in _LINKEDIN_SKIP_PHRASES)
|
||||||
|
and not ln.lower().startswith("view job:")
|
||||||
|
and not ln.startswith("http")
|
||||||
|
]
|
||||||
|
if len(content) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
jobs.append({
|
||||||
|
"title": content[0],
|
||||||
|
"company": content[1],
|
||||||
|
"location": content[2] if len(content) > 2 else "",
|
||||||
|
"url": url,
|
||||||
|
"source": "linkedin",
|
||||||
|
})
|
||||||
|
return jobs
|
||||||
|
|
||||||
|
|
||||||
|
# ── Adzuna Job Alert ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("noreply@adzuna.com", "adzuna")
|
||||||
|
def parse_adzuna(body: str) -> list[dict]:
|
||||||
|
"""Parse Adzuna job alert digest email body.
|
||||||
|
|
||||||
|
TODO: implement after reviewing samples in avocet/data/digest_samples.jsonl
|
||||||
|
See Task 3 in docs/plans/2026-03-05-digest-parsers-plan.md
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
# ── The Ladders Job Alert ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("noreply@theladders.com", "theladders")
|
||||||
|
def parse_theladders(body: str) -> list[dict]:
|
||||||
|
"""Parse The Ladders job alert digest email body.
|
||||||
|
|
||||||
|
TODO: implement after reviewing samples in avocet/data/digest_samples.jsonl
|
||||||
|
See Task 4 in docs/plans/2026-03-05-digest-parsers-plan.md
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run tests to verify they pass**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all 7 tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: digest parser registry + LinkedIn parser (moved from imap_sync)"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Fetch digest samples from IMAP
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `avocet/scripts/fetch_digest_samples.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
We need real Adzuna and Ladders email bodies to write parsers against. This one-off script
|
||||||
|
searches the configured IMAP account by sender domain and writes results to
|
||||||
|
`data/digest_samples.jsonl`. Run it once; the output file feeds Tasks 3 and 4.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Create the fetch script**
|
||||||
|
|
||||||
|
Create `avocet/scripts/fetch_digest_samples.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Fetch digest email samples from IMAP into data/digest_samples.jsonl.
|
||||||
|
|
||||||
|
Searches for emails from known digest sender domains, deduplicates against
|
||||||
|
any existing samples, and appends new ones.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/fetch_digest_samples.py
|
||||||
|
|
||||||
|
Reads config/label_tool.yaml for IMAP credentials (first account used).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import imaplib
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
ROOT = Path(__file__).parent.parent
|
||||||
|
CONFIG = ROOT / "config" / "label_tool.yaml"
|
||||||
|
OUTPUT = ROOT / "data" / "digest_samples.jsonl"
|
||||||
|
|
||||||
|
# Sender domains to search — add new ones here as needed
|
||||||
|
DIGEST_SENDERS = [
|
||||||
|
"adzuna.com",
|
||||||
|
"theladders.com",
|
||||||
|
"jobalerts@linkedin.com",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Import shared helpers from avocet
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
from app.imap_fetch import _decode_str, _extract_body, entry_key # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _load_existing_keys() -> set[str]:
|
||||||
|
if not OUTPUT.exists():
|
||||||
|
return set()
|
||||||
|
keys = set()
|
||||||
|
for line in OUTPUT.read_text().splitlines():
|
||||||
|
try:
|
||||||
|
keys.add(entry_key(json.loads(line)))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return keys
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
cfg = yaml.safe_load(CONFIG.read_text())
|
||||||
|
accounts = cfg.get("accounts", [])
|
||||||
|
if not accounts:
|
||||||
|
print("No accounts configured in config/label_tool.yaml")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
acc = accounts[0]
|
||||||
|
host = acc.get("host", "imap.gmail.com")
|
||||||
|
port = int(acc.get("port", 993))
|
||||||
|
use_ssl = acc.get("use_ssl", True)
|
||||||
|
username = acc["username"]
|
||||||
|
password = acc["password"]
|
||||||
|
folder = acc.get("folder", "INBOX")
|
||||||
|
days_back = int(acc.get("days_back", 90))
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import email as _email_lib
|
||||||
|
|
||||||
|
since = (datetime.now() - timedelta(days=days_back)).strftime("%d-%b-%Y")
|
||||||
|
|
||||||
|
conn = (imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4)(host, port)
|
||||||
|
conn.login(username, password)
|
||||||
|
conn.select(folder, readonly=True)
|
||||||
|
|
||||||
|
known_keys = _load_existing_keys()
|
||||||
|
found: list[dict] = []
|
||||||
|
seen_uids: dict[bytes, None] = {}
|
||||||
|
|
||||||
|
for sender in DIGEST_SENDERS:
|
||||||
|
try:
|
||||||
|
_, data = conn.search(None, f'(FROM "{sender}" SINCE "{since}")')
|
||||||
|
for uid in (data[0] or b"").split():
|
||||||
|
seen_uids[uid] = None
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" search error for {sender!r}: {exc}")
|
||||||
|
|
||||||
|
print(f"Found {len(seen_uids)} candidate UIDs across {len(DIGEST_SENDERS)} senders")
|
||||||
|
|
||||||
|
for uid in seen_uids:
|
||||||
|
try:
|
||||||
|
_, raw_data = conn.fetch(uid, "(RFC822)")
|
||||||
|
if not raw_data or not raw_data[0]:
|
||||||
|
continue
|
||||||
|
msg = _email_lib.message_from_bytes(raw_data[0][1])
|
||||||
|
entry = {
|
||||||
|
"subject": _decode_str(msg.get("Subject", "")),
|
||||||
|
"body": _extract_body(msg)[:2000], # larger cap for parser dev
|
||||||
|
"from_addr": _decode_str(msg.get("From", "")),
|
||||||
|
"date": _decode_str(msg.get("Date", "")),
|
||||||
|
"account": acc.get("name", username),
|
||||||
|
}
|
||||||
|
k = entry_key(entry)
|
||||||
|
if k not in known_keys:
|
||||||
|
known_keys.add(k)
|
||||||
|
found.append(entry)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" fetch error uid {uid}: {exc}")
|
||||||
|
|
||||||
|
conn.logout()
|
||||||
|
|
||||||
|
if not found:
|
||||||
|
print("No new digest samples found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
OUTPUT.parent.mkdir(exist_ok=True)
|
||||||
|
with OUTPUT.open("a", encoding="utf-8") as f:
|
||||||
|
for entry in found:
|
||||||
|
f.write(json.dumps(entry) + "\n")
|
||||||
|
|
||||||
|
print(f"Wrote {len(found)} new samples to {OUTPUT}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run the fetch script**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
conda run -n job-seeker python scripts/fetch_digest_samples.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected output: `Wrote N new samples to data/digest_samples.jsonl`
|
||||||
|
|
||||||
|
**Step 3: Inspect the samples**
|
||||||
|
|
||||||
|
```
|
||||||
|
# View first few entries — look at from_addr and body for Adzuna and Ladders format
|
||||||
|
conda run -n job-seeker python -c "
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
for line in Path('data/digest_samples.jsonl').read_text().splitlines()[:10]:
|
||||||
|
e = json.loads(line)
|
||||||
|
print('FROM:', e['from_addr'])
|
||||||
|
print('SUBJECT:', e['subject'])
|
||||||
|
print('BODY[:500]:', e['body'][:500])
|
||||||
|
print('---')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Note down:
|
||||||
|
- The exact sender addresses for Adzuna and Ladders (update `DIGEST_PARSERS` in `digest_parsers.py` if different from `noreply@adzuna.com` / `noreply@theladders.com`)
|
||||||
|
- The structure of each job block in the body (separator lines, field order, URL format)
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
git add scripts/fetch_digest_samples.py
|
||||||
|
git commit -m "feat: fetch_digest_samples script for building new parsers"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: Build and test Adzuna parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/digest_parsers.py` — implement `parse_adzuna`
|
||||||
|
- Modify: `peregrine/tests/test_digest_parsers.py` — add Adzuna fixtures + tests
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
After running Task 2, you have real Adzuna email bodies in `avocet/data/digest_samples.jsonl`.
|
||||||
|
Inspect them (see Task 2 Step 3), identify the structure, then write the test fixture from
|
||||||
|
a real sample before implementing the parser.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write a failing Adzuna test**
|
||||||
|
|
||||||
|
Inspect a real Adzuna sample from `data/digest_samples.jsonl` and identify:
|
||||||
|
- How job blocks are separated (blank lines? dashes? headers?)
|
||||||
|
- Field order (title first? company first?)
|
||||||
|
- Where the job URL appears and what format it uses
|
||||||
|
- Any noise lines to filter (unsubscribe, promo text, etc.)
|
||||||
|
|
||||||
|
Add to `peregrine/tests/test_digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_adzuna
|
||||||
|
|
||||||
|
# Replace ADZUNA_BODY with a real excerpt from avocet/data/digest_samples.jsonl
|
||||||
|
# Copy 2-3 job blocks verbatim; replace real company names with "Test Co" etc. if desired
|
||||||
|
ADZUNA_BODY = """
|
||||||
|
<paste real Adzuna body excerpt here — 2-3 job blocks>
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_dispatcher_adzuna_sender():
|
||||||
|
# Update sender string if real sender differs from noreply@adzuna.com
|
||||||
|
cards = parse_digest("noreply@adzuna.com", ADZUNA_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) >= 1
|
||||||
|
|
||||||
|
def test_parse_adzuna_fields():
|
||||||
|
cards = parse_adzuna(ADZUNA_BODY)
|
||||||
|
assert cards[0]["title"] # non-empty
|
||||||
|
assert cards[0]["company"] # non-empty
|
||||||
|
assert cards[0]["url"].startswith("http")
|
||||||
|
assert cards[0]["source"] == "adzuna"
|
||||||
|
|
||||||
|
def test_parse_adzuna_url_no_tracking():
|
||||||
|
"""Adzuna URLs often contain tracking params — strip them."""
|
||||||
|
cards = parse_adzuna(ADZUNA_BODY)
|
||||||
|
# Adjust assertion to match actual URL format once you've seen real samples
|
||||||
|
for card in cards:
|
||||||
|
assert "utm_" not in card["url"]
|
||||||
|
|
||||||
|
def test_parse_adzuna_empty_body():
|
||||||
|
assert parse_adzuna("No jobs this week.") == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py::test_parse_adzuna_fields -v
|
||||||
|
```
|
||||||
|
Expected: FAIL (stub returns `[]`)
|
||||||
|
|
||||||
|
**Step 3: Implement `parse_adzuna` in `digest_parsers.py`**
|
||||||
|
|
||||||
|
Replace the stub body of `parse_adzuna` based on the actual email structure you observed.
|
||||||
|
Pattern to follow (adapt field positions to match Adzuna's actual format):
|
||||||
|
|
||||||
|
```python
|
||||||
|
@_register("noreply@adzuna.com", "adzuna") # update sender if needed
|
||||||
|
def parse_adzuna(body: str) -> list[dict]:
|
||||||
|
jobs = []
|
||||||
|
# Split on whatever delimiter Adzuna uses between blocks
|
||||||
|
# e.g.: blocks = re.split(r"\n\s*\n{2,}", body) # double blank line
|
||||||
|
# For each block, extract title, company, location, url
|
||||||
|
# Strip tracking params from URL: re.sub(r"\?.*", "", url) or parse with urllib
|
||||||
|
return jobs
|
||||||
|
```
|
||||||
|
|
||||||
|
If Adzuna sender differs from `noreply@adzuna.com`, update the `@_register` decorator
|
||||||
|
**and** the `DIGEST_PARSERS` key in the registry (they're set by the decorator — just change
|
||||||
|
the decorator argument).
|
||||||
|
|
||||||
|
**Step 4: Run all digest tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/peregrine
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: Adzuna digest email parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Build and test The Ladders parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/digest_parsers.py` — implement `parse_theladders`
|
||||||
|
- Modify: `peregrine/tests/test_digest_parsers.py` — add Ladders fixtures + tests
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
Same approach as Task 3. The Ladders already has a web scraper in
|
||||||
|
`scripts/custom_boards/theladders.py` — check it for URL patterns that may apply here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write failing Ladders tests**
|
||||||
|
|
||||||
|
Inspect a real Ladders sample from `avocet/data/digest_samples.jsonl`. Add to test file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_theladders
|
||||||
|
|
||||||
|
# Replace with real Ladders body excerpt
|
||||||
|
LADDERS_BODY = """
|
||||||
|
<paste real Ladders body excerpt here — 2-3 job blocks>
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_dispatcher_ladders_sender():
|
||||||
|
cards = parse_digest("noreply@theladders.com", LADDERS_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) >= 1
|
||||||
|
|
||||||
|
def test_parse_theladders_fields():
|
||||||
|
cards = parse_theladders(LADDERS_BODY)
|
||||||
|
assert cards[0]["title"]
|
||||||
|
assert cards[0]["company"]
|
||||||
|
assert cards[0]["url"].startswith("http")
|
||||||
|
assert cards[0]["source"] == "theladders"
|
||||||
|
|
||||||
|
def test_parse_theladders_empty_body():
|
||||||
|
assert parse_theladders("No new jobs.") == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py::test_parse_theladders_fields -v
|
||||||
|
```
|
||||||
|
Expected: FAIL
|
||||||
|
|
||||||
|
**Step 3: Implement `parse_theladders`**
|
||||||
|
|
||||||
|
Replace the stub. The Ladders URLs often use redirect wrappers — canonicalize to the
|
||||||
|
`theladders.com/job/<id>` form if possible, otherwise just strip tracking params.
|
||||||
|
|
||||||
|
**Step 4: Run all digest tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: The Ladders digest email parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: Update `imap_sync.py` to use the dispatcher
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/imap_sync.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
The LinkedIn-specific block in `_scan_unmatched_leads()` (search for
|
||||||
|
`_LINKEDIN_ALERT_SENDER`) gets replaced with a generic `parse_digest()` call.
|
||||||
|
The existing behavior is preserved — only the dispatch mechanism changes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Add the import**
|
||||||
|
|
||||||
|
At the top of `imap_sync.py`, alongside other local imports, add:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_digest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Find the LinkedIn-specific block**
|
||||||
|
|
||||||
|
Search for `_LINKEDIN_ALERT_SENDER` in `imap_sync.py`. The block looks like:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
|
||||||
|
cards = parse_linkedin_alert(parsed["body"])
|
||||||
|
for card in cards:
|
||||||
|
...
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Replace with the generic dispatcher**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ── Digest email — dispatch to parser registry ────────────────────────
|
||||||
|
cards = parse_digest(parsed["from_addr"], parsed["body"])
|
||||||
|
if cards is not None:
|
||||||
|
for card in cards:
|
||||||
|
if card["url"] in existing_urls:
|
||||||
|
continue
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": card["title"],
|
||||||
|
"company": card["company"],
|
||||||
|
"url": card["url"],
|
||||||
|
"source": card["source"],
|
||||||
|
"location": card["location"],
|
||||||
|
"is_remote": 0,
|
||||||
|
"salary": "",
|
||||||
|
"description": "",
|
||||||
|
"date_found": datetime.now().isoformat()[:10],
|
||||||
|
})
|
||||||
|
if job_id:
|
||||||
|
submit_task(db_path, "scrape_url", job_id)
|
||||||
|
existing_urls.add(card["url"])
|
||||||
|
new_leads += 1
|
||||||
|
print(f"[imap] digest ({card['source']}) → {card['company']} — {card['title']}")
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Remove the now-unused `parse_linkedin_alert` import/definition**
|
||||||
|
|
||||||
|
`parse_linkedin_alert` was defined in `imap_sync.py`. It's now `parse_linkedin` in
|
||||||
|
`digest_parsers.py`. Delete the old function from `imap_sync.py`. Also remove
|
||||||
|
`_LINKEDIN_ALERT_SENDER` constant if it's no longer referenced.
|
||||||
|
|
||||||
|
**Step 5: Run the full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||||
|
```
|
||||||
|
Expected: all existing tests still pass; no regressions
|
||||||
|
|
||||||
|
**Step 6: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/imap_sync.py
|
||||||
|
git commit -m "refactor: imap_sync uses digest_parsers dispatcher; remove inline LinkedIn parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 6: Avocet digest bucket
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `avocet/app/label_tool.py`
|
||||||
|
- Modify: `avocet/app/api.py`
|
||||||
|
- Create: `avocet/tests/test_digest_bucket.py`
|
||||||
|
- Create: `avocet/data/digest_samples.jsonl.example`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
When either label path (`_do_label` in the Streamlit UI or `POST /api/label` in the FastAPI
|
||||||
|
app) assigns the `digest` label, the full email record is appended to
|
||||||
|
`data/digest_samples.jsonl`. This is the sample corpus for building future parsers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write failing tests**
|
||||||
|
|
||||||
|
Create `avocet/tests/test_digest_bucket.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for digest sample bucket write behavior."""
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _read_bucket(tmp_path: Path) -> list[dict]:
|
||||||
|
bucket = tmp_path / "data" / "digest_samples.jsonl"
|
||||||
|
if not bucket.exists():
|
||||||
|
return []
|
||||||
|
return [json.loads(line) for line in bucket.read_text().splitlines() if line.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_ENTRY = {
|
||||||
|
"subject": "10 new jobs for you",
|
||||||
|
"body": "Software Engineer\nAcme Corp\nRemote\nView job: https://example.com/123",
|
||||||
|
"from_addr": "noreply@adzuna.com",
|
||||||
|
"date": "Mon, 03 Mar 2026 09:00:00 +0000",
|
||||||
|
"account": "test@example.com",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── api.py bucket tests ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_api_digest_label_writes_to_bucket(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["from_addr"] == "noreply@adzuna.com"
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_non_digest_label_does_not_write(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
# _append_digest_sample should only be called for digest; confirm it writes when called
|
||||||
|
# Confirm that callers gate on label == "digest" — tested via integration below
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 1 # called directly, always writes
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_digest_creates_data_dir(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "nonexistent" / "data"
|
||||||
|
assert not data_dir.exists()
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
assert data_dir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_digest_appends_multiple(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
_append_digest_sample({**SAMPLE_ENTRY, "subject": "5 more jobs"}, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 2
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_bucket.py -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name '_append_digest_sample'`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 3: Add `_append_digest_sample` to `api.py`**
|
||||||
|
|
||||||
|
In `avocet/app/api.py`, add this helper (near the top, after the imports and `_DATA_DIR`
|
||||||
|
constant):
|
||||||
|
|
||||||
|
```python
|
||||||
|
_DIGEST_SAMPLES_FILE = _DATA_DIR / "digest_samples.jsonl"
|
||||||
|
|
||||||
|
|
||||||
|
def _append_digest_sample(entry: dict, data_dir: Path | None = None) -> None:
|
||||||
|
"""Append a digest-labeled email to the sample corpus."""
|
||||||
|
target_dir = data_dir if data_dir is not None else _DATA_DIR
|
||||||
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
bucket = target_dir / "digest_samples.jsonl"
|
||||||
|
record = {
|
||||||
|
"subject": entry.get("subject", ""),
|
||||||
|
"body": entry.get("body", ""),
|
||||||
|
"from_addr": entry.get("from_addr", entry.get("from", "")),
|
||||||
|
"date": entry.get("date", ""),
|
||||||
|
"account": entry.get("account", entry.get("source", "")),
|
||||||
|
}
|
||||||
|
with bucket.open("a", encoding="utf-8") as f:
|
||||||
|
f.write(json.dumps(record) + "\n")
|
||||||
|
```
|
||||||
|
|
||||||
|
Then in `post_label()` (around line 127, after `_append_jsonl(_score_file(), record)`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if req.label == "digest":
|
||||||
|
_append_digest_sample(match)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Add the same write to `label_tool.py`**
|
||||||
|
|
||||||
|
In `avocet/app/label_tool.py`, add a module-level constant after `_SCORE_FILE`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_DIGEST_SAMPLES_FILE = _ROOT / "data" / "digest_samples.jsonl"
|
||||||
|
```
|
||||||
|
|
||||||
|
In `_do_label()` (around line 728, after `_append_jsonl(_SCORE_FILE, row)`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if label == "digest":
|
||||||
|
_append_jsonl(
|
||||||
|
_DIGEST_SAMPLES_FILE,
|
||||||
|
{
|
||||||
|
"subject": entry.get("subject", ""),
|
||||||
|
"body": (entry.get("body", ""))[:2000],
|
||||||
|
"from_addr": entry.get("from_addr", ""),
|
||||||
|
"date": entry.get("date", ""),
|
||||||
|
"account": entry.get("account", ""),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
(`_append_jsonl` already exists in label_tool.py at line ~396 — reuse it.)
|
||||||
|
|
||||||
|
**Step 5: Create the example file**
|
||||||
|
|
||||||
|
Create `avocet/data/digest_samples.jsonl.example`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"subject": "10 new Software Engineer jobs for you", "body": "Software Engineer\nAcme Corp\nSan Francisco, CA\n\nView job: https://www.linkedin.com/jobs/view/1234567890/\n", "from_addr": "LinkedIn <jobalerts@linkedin.com>", "date": "Mon, 03 Mar 2026 09:00:00 +0000", "account": "example@gmail.com"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 6: Update `.gitignore` in avocet**
|
||||||
|
|
||||||
|
Verify `data/digest_samples.jsonl` is gitignored. Open `avocet/.gitignore` — it should
|
||||||
|
already have `data/*.jsonl`. If not, add:
|
||||||
|
|
||||||
|
```
|
||||||
|
data/digest_samples.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 7: Run all avocet tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 8: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
git add app/api.py app/label_tool.py tests/test_digest_bucket.py data/digest_samples.jsonl.example
|
||||||
|
git commit -m "feat: digest sample bucket — write digest-labeled emails to digest_samples.jsonl"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
| Task | Repo | Commit message |
|
||||||
|
|------|------|----------------|
|
||||||
|
| 1 | peregrine | `feat: digest parser registry + LinkedIn parser (moved from imap_sync)` |
|
||||||
|
| 2 | avocet | `feat: fetch_digest_samples script for building new parsers` |
|
||||||
|
| 3 | peregrine | `feat: Adzuna digest email parser` |
|
||||||
|
| 4 | peregrine | `feat: The Ladders digest email parser` |
|
||||||
|
| 5 | peregrine | `refactor: imap_sync uses digest_parsers dispatcher; remove inline LinkedIn parser` |
|
||||||
|
| 6 | avocet | `feat: digest sample bucket — write digest-labeled emails to digest_samples.jsonl` |
|
||||||
|
|
||||||
|
Tasks 1, 2, and 6 are independent and can be done in any order.
|
||||||
|
Tasks 3 and 4 depend on Task 2 (samples needed before implementing parsers).
|
||||||
|
Task 5 depends on Tasks 1, 3, and 4 (all parsers should be ready before switching imap_sync).
|
||||||
277
scripts/backup.py
Normal file
277
scripts/backup.py
Normal file
|
|
@ -0,0 +1,277 @@
|
||||||
|
"""Config backup / restore / teleport for Peregrine.
|
||||||
|
|
||||||
|
Creates a portable zip of all gitignored configs + optionally the staging DB.
|
||||||
|
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
|
||||||
|
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
|
||||||
|
|
||||||
|
Usage (CLI):
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
|
||||||
|
conda run -n job-seeker python scripts/backup.py --restore backup.zip
|
||||||
|
conda run -n job-seeker python scripts/backup.py --list backup.zip
|
||||||
|
|
||||||
|
Usage (programmatic — called from Settings UI):
|
||||||
|
from scripts.backup import create_backup, restore_backup, list_backup_contents
|
||||||
|
zip_bytes = create_backup(base_dir, include_db=True)
|
||||||
|
info = list_backup_contents(zip_bytes)
|
||||||
|
result = restore_backup(zip_bytes, base_dir, include_db=True)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Files included in every backup (relative to repo root)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Gitignored config files that hold secrets / personal data
|
||||||
|
_SECRET_CONFIGS: list[str] = [
    "config/notion.yaml",             # Notion API token
    "config/tokens.yaml",
    "config/email.yaml",              # mail account credentials
    "config/adzuna.yaml",
    "config/craigslist.yaml",
    "config/user.yaml",
    "config/plain_text_resume.yaml",  # personal resume content
    "config/license.json",
    "config/user.yaml.working",       # presumably an in-progress copy of user.yaml — confirm
]

# Gitignored integration configs (glob pattern — each matching file is added)
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"

# Non-secret committed configs worth preserving for portability
# (also present in the legacy /devl/job-seeker instance)
_EXTRA_CONFIGS: list[str] = [
    "config/llm.yaml",
    "config/search_profiles.yaml",
    "config/resume_keywords.yaml",  # personal keyword list — present in both instances
    "config/skills_suggestions.yaml",
    "config/blocklist.yaml",
    "config/server.yaml",  # deployment config (base URL path, port) — Peregrine only
]

# Candidate DB paths relative to base_dir (first one that exists wins):
# data/staging.db is the Peregrine layout, root-level staging.db the legacy one.
_DB_CANDIDATES: list[str] = ["data/staging.db", "staging.db"]

# Archive member name of the JSON manifest written by create_backup().
_MANIFEST_NAME = "backup-manifest.json"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Source detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _detect_source_label(base_dir: Path) -> str:
|
||||||
|
"""Return a human-readable label for the instance being backed up.
|
||||||
|
|
||||||
|
Uses the directory name — stable as long as the repo root isn't renamed,
|
||||||
|
which is the normal case for both the Docker install (peregrine/) and the
|
||||||
|
legacy Conda install (job-seeker/).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_dir: The root directory being backed up.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A short identifier string, e.g. "peregrine" or "job-seeker".
|
||||||
|
"""
|
||||||
|
return base_dir.name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
) -> bytes:
    """Build a backup zip in memory and return its raw bytes.

    Collects the gitignored secret configs, any integration configs, the
    non-secret extra configs, and (optionally) the staging DB, then appends
    a JSON manifest describing what was captured.

    Args:
        base_dir: Repo root (parent of config/ and staging.db).
        include_db: If True, include staging.db in the archive.
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected if None.

    Returns:
        The zip archive as bytes.
    """
    archive = io.BytesIO()
    captured: list[str] = []

    with zipfile.ZipFile(archive, "w", compression=zipfile.ZIP_DEFLATED) as zf:

        def _add(path: Path, arcname: str) -> None:
            # Record each archived member so the manifest can list it.
            zf.write(path, arcname)
            captured.append(arcname)

        # Gitignored secret configs — missing files are silently skipped.
        for rel in _SECRET_CONFIGS:
            candidate = base_dir / rel
            if candidate.exists():
                _add(candidate, rel)

        # Integration configs (glob) — sorted for a deterministic archive order.
        for match in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            _add(match, str(match.relative_to(base_dir)))

        # Extra non-secret configs.
        for rel in _EXTRA_CONFIGS:
            candidate = base_dir / rel
            if candidate.exists():
                _add(candidate, rel)

        # Staging DB — first existing candidate path wins.
        if include_db:
            for rel in _DB_CANDIDATES:
                candidate = base_dir / rel
                if candidate.exists():
                    _add(candidate, rel)
                    break

        # Manifest goes in last so it can describe every captured member.
        manifest = {
            "created_at": datetime.now().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": captured,
            "includes_db": include_db and any(f.endswith(".db") for f in captured),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))

    return archive.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Describe a backup zip without extracting it.

    Args:
        zip_bytes: Raw bytes of a backup zip.

    Returns:
        {"manifest": dict, "files": [names], "sizes": {name: bytes},
         "total_bytes": int} — the manifest entry itself is excluded from
        "files"; an archive with no manifest yields an empty manifest dict.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        entries = zf.infolist()
        sizes = {entry.filename: entry.file_size for entry in entries}
        files = [entry.filename for entry in entries if entry.filename != _MANIFEST_NAME]
        try:
            manifest: dict = json.loads(zf.read(_MANIFEST_NAME))
        except KeyError:
            # Archive predates (or omits) the manifest — report it as empty.
            manifest = {}
    return {
        "manifest": manifest,
        "files": files,
        "sizes": sizes,
        "total_bytes": sum(sizes[name] for name in files if name in sizes),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.

    Returns:
        {"restored": [...], "skipped": [...]} — "skipped" also lists any
        archive entries whose names would escape base_dir (see below).
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            if name == _MANIFEST_NAME:
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip guard: a hostile archive can carry names with ".."
            # components or absolute paths that would write outside base_dir.
            # Refuse any entry whose resolved destination escapes the root.
            try:
                dest.resolve().relative_to(root)
            except ValueError:
                skipped.append(name)
                continue
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(zf.read(name))
            restored.append(name)

    return {"restored": restored, "skipped": skipped}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: create, restore, or list a Peregrine backup zip.

    Exactly one of --create / --restore / --list is required; --no-db,
    --no-overwrite and --base-dir modify the chosen action. Exits with
    status 1 when a named input zip does not exist.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()

    # Default base is this repo's root (scripts/ sits one level below it).
    base_dir = Path(args.base_dir).resolve() if args.base_dir else Path(__file__).parent.parent

    if args.create:
        out = Path(args.create)
        data = create_backup(base_dir, include_db=not args.no_db)
        out.write_bytes(data)
        # Re-read the archive we just wrote so the summary reflects reality.
        info = list_backup_contents(data)
        m = info["manifest"]
        print(f"Backup created: {out} ({len(data):,} bytes)")
        print(f"  Source: {m.get('source', '?')} ({base_dir})")
        print(f"  {len(info['files'])} files archived:")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"    {name} ({size:,} bytes)")

    elif args.restore:
        in_path = Path(args.restore)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        result = restore_backup(data, base_dir,
                                include_db=not args.no_db,
                                overwrite=not args.no_overwrite)
        print(f"Restored {len(result['restored'])} files:")
        for name in result["restored"]:
            print(f"  ✓ {name}")
        if result["skipped"]:
            print(f"Skipped {len(result['skipped'])} files:")
            for name in result["skipped"]:
                print(f"  - {name}")

    elif args.list:
        in_path = Path(args.list)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        info = list_backup_contents(data)
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source:  {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB:  {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"  {name} ({size:,} bytes)")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -3,12 +3,13 @@ SQLite staging layer for job listings.
|
||||||
Jobs flow: pending → approved/rejected → applied → synced
|
Jobs flow: pending → approved/rejected → applied → synced
|
||||||
applied → phone_screen → interviewing → offer → hired (or rejected)
|
applied → phone_screen → interviewing → offer → hired (or rejected)
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"
|
DEFAULT_DB = Path(os.environ.get("STAGING_DB", Path(__file__).parent.parent / "staging.db"))
|
||||||
|
|
||||||
CREATE_JOBS = """
|
CREATE_JOBS = """
|
||||||
CREATE TABLE IF NOT EXISTS jobs (
|
CREATE TABLE IF NOT EXISTS jobs (
|
||||||
|
|
|
||||||
160
scripts/suggest_helpers.py
Normal file
160
scripts/suggest_helpers.py
Normal file
|
|
@ -0,0 +1,160 @@
|
||||||
|
"""
|
||||||
|
LLM-powered suggestion helpers for Settings UI.
|
||||||
|
Two functions, each makes one LLMRouter call:
|
||||||
|
- suggest_search_terms: enhanced title + three-angle exclude suggestions
|
||||||
|
- suggest_resume_keywords: skills/domains/keywords gap analysis
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
|
||||||
|
|
||||||
|
def _load_resume_context(resume_path: Path) -> str:
    """Summarise the 3 most recent positions from plain_text_resume.yaml.

    Returns one "- <position> at <company>: <up to 5 skills>" line per role,
    or "" when the resume file is missing.
    """
    import yaml

    if not resume_path.exists():
        return ""
    parsed = yaml.safe_load(resume_path.read_text()) or {}
    summary: list[str] = []
    for job in (parsed.get("experience_details") or [])[:3]:
        top_skills = ", ".join((job.get("skills_acquired") or [])[:5])
        summary.append(f"- {job.get('position', '')} at {job.get('company', '')}: {top_skills}")
    return "\n".join(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json(text: str) -> dict[str, Any]:
|
||||||
|
"""Extract the first JSON object from LLM output. Returns {} on failure."""
|
||||||
|
m = re.search(r"\{.*\}", text, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
return json.loads(m.group())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def suggest_search_terms(
    current_titles: list[str],
    resume_path: Path,
    blocklist: dict[str, Any],
    user_profile: dict[str, Any],
) -> dict:
    """
    Suggest additional job titles and exclude keywords via one LLM call.

    Three-angle exclude analysis:
      A: Blocklist alias expansion (blocked companies/industries → keyword variants)
      B: Values misalignment (mission preferences → industries/culture to avoid)
      C: Role-type filter (career summary → role types that don't fit)

    Args:
        current_titles: Titles already in the search config.
        resume_path: Path to plain_text_resume.yaml (may not exist).
        blocklist: Dict with "companies"/"industries" lists (may be empty).
        user_profile: Dict read for "nda_companies", "career_summary",
            and "mission_preferences".

    Returns: {"suggested_titles": [...], "suggested_excludes": [...]}
        Both lists are empty when the LLM output is not parseable JSON.
    """
    resume_context = _load_resume_context(resume_path)
    titles_str = "\n".join(f"- {t}" for t in current_titles) or "(none yet)"

    # Fall back to readable placeholders so the prompt never shows blank fields.
    bl_companies = ", ".join(blocklist.get("companies", [])) or "none"
    bl_industries = ", ".join(blocklist.get("industries", [])) or "none"
    nda = ", ".join(user_profile.get("nda_companies", [])) or "none"
    career_summary = user_profile.get("career_summary", "") or "Not provided"
    mission_raw = user_profile.get("mission_preferences", {}) or {}
    # Three exclude angles are intentionally collapsed into one flat suggested_excludes list
    mission_str = "\n".join(
        f" - {k}: {v}" for k, v in mission_raw.items() if v and isinstance(v, str) and v.strip()
    ) or " (none specified)"

    prompt = f"""You are helping a job seeker optimise their search configuration.

--- RESUME BACKGROUND ---
{resume_context or "Not provided"}

--- CAREER SUMMARY ---
{career_summary}

--- CURRENT TITLES BEING SEARCHED ---
{titles_str}

--- BLOCKED ENTITIES ---
Companies blocked: {bl_companies}
Industries blocked: {bl_industries}
NDA / confidential employers: {nda}

--- MISSION & VALUES ---
{mission_str}

Provide all four of the following:

1. TITLE SUGGESTIONS
5-8 additional job titles they may be missing: alternative names, adjacent roles, or senior variants of their current titles.

2. EXCLUDE KEYWORDS — BLOCKLIST ALIASES
The user has blocked the companies/industries above. Suggest keyword variants that would also catch their aliases, subsidiaries, or related brands.
Example: blocking "Meta" → also exclude "facebook", "instagram", "metaverse", "oculus".

3. EXCLUDE KEYWORDS — VALUES MISALIGNMENT
Based on the user's mission and values above, suggest industry or culture keywords to exclude.
Examples: "tobacco", "gambling", "fossil fuel", "defense contractor", "MLM", "commission-only", "pyramid".

4. EXCLUDE KEYWORDS — ROLE TYPE FILTER
Based on the user's career background, suggest role-type terms that don't match their trajectory.
Examples for a CS/TAM leader: "cold calling", "door to door", "quota-driven", "SDR", "sales development rep".

Return ONLY valid JSON in exactly this format (no extra text):
{{"suggested_titles": ["Title 1", "Title 2"],
"suggested_excludes": ["keyword 1", "keyword 2", "keyword 3"]}}"""

    raw = LLMRouter().complete(prompt).strip()
    # _parse_json returns {} on malformed output, so .get defaults apply.
    parsed = _parse_json(raw)
    return {
        "suggested_titles": parsed.get("suggested_titles", []),
        "suggested_excludes": parsed.get("suggested_excludes", []),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def suggest_resume_keywords(
    resume_path: Path,
    current_kw: dict[str, list[str]],
) -> dict:
    """
    Suggest skills, domains, and keywords not already in resume_keywords.yaml.

    Makes one LLM call; the already-selected tags are embedded in the prompt
    so the model is told not to repeat them.

    Args:
        resume_path: Path to plain_text_resume.yaml (may not exist).
        current_kw: Existing selections, read for "skills", "domains",
            and "keywords" lists.

    Returns: {"skills": [...], "domains": [...], "keywords": [...]}
        All lists are empty when the LLM output is not parseable JSON.
    """
    resume_context = _load_resume_context(resume_path)

    # Placeholder "none" keeps the prompt readable when a category is empty.
    already_skills = ", ".join(current_kw.get("skills", [])) or "none"
    already_domains = ", ".join(current_kw.get("domains", [])) or "none"
    already_keywords = ", ".join(current_kw.get("keywords", [])) or "none"

    prompt = f"""You are helping a job seeker build a keyword profile used to score job description matches.

--- RESUME BACKGROUND ---
{resume_context or "Not provided"}

--- ALREADY SELECTED (do not repeat these) ---
Skills: {already_skills}
Domains: {already_domains}
Keywords: {already_keywords}

Suggest additional tags in each of the three categories below. Only suggest tags NOT already in the lists above.

SKILLS — specific technical or soft skills (e.g. "Salesforce", "Executive Communication", "SQL", "Stakeholder Management")
DOMAINS — industry verticals, company types, or functional areas (e.g. "B2B SaaS", "EdTech", "Non-profit", "Series A-C")
KEYWORDS — specific terms, methodologies, metrics, or JD phrases (e.g. "NPS", "churn prevention", "QBR", "cross-functional")

Return ONLY valid JSON in exactly this format (no extra text):
{{"skills": ["Skill A", "Skill B"],
"domains": ["Domain A"],
"keywords": ["Keyword A", "Keyword B"]}}"""

    raw = LLMRouter().complete(prompt).strip()
    # _parse_json returns {} on malformed output, so .get defaults apply.
    parsed = _parse_json(raw)
    return {
        "skills": parsed.get("skills", []),
        "domains": parsed.get("domains", []),
        "keywords": parsed.get("keywords", []),
    }
|
||||||
231
tests/test_backup.py
Normal file
231
tests/test_backup.py
Normal file
|
|
@ -0,0 +1,231 @@
|
||||||
|
"""Tests for scripts/backup.py — create, list, restore, and multi-instance support."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scripts.backup import (
|
||||||
|
_detect_source_label,
|
||||||
|
create_backup,
|
||||||
|
list_backup_contents,
|
||||||
|
restore_backup,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_instance(tmp_path: Path, name: str, *, root_db: bool = False) -> Path:
|
||||||
|
"""Build a minimal fake instance directory for testing."""
|
||||||
|
base = tmp_path / name
|
||||||
|
base.mkdir()
|
||||||
|
|
||||||
|
# Secret configs
|
||||||
|
(base / "config").mkdir()
|
||||||
|
(base / "config" / "notion.yaml").write_text("token: secret")
|
||||||
|
(base / "config" / "email.yaml").write_text("user: test@example.com")
|
||||||
|
|
||||||
|
# Extra config
|
||||||
|
(base / "config" / "llm.yaml").write_text("backend: ollama")
|
||||||
|
(base / "config" / "resume_keywords.yaml").write_text("keywords: [python]")
|
||||||
|
(base / "config" / "server.yaml").write_text("port: 8502")
|
||||||
|
|
||||||
|
# DB — either at data/staging.db (Peregrine) or staging.db root (legacy)
|
||||||
|
if root_db:
|
||||||
|
(base / "staging.db").write_bytes(b"SQLite legacy")
|
||||||
|
else:
|
||||||
|
(base / "data").mkdir()
|
||||||
|
(base / "data" / "staging.db").write_bytes(b"SQLite peregrine")
|
||||||
|
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# create_backup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCreateBackup:
    """create_backup: archive validity, config selection, DB flag, manifest."""

    def test_returns_valid_zip(self, tmp_path):
        import io

        instance = _make_instance(tmp_path, "peregrine")
        blob = create_backup(instance)
        assert zipfile.is_zipfile(io.BytesIO(blob))

    def test_includes_secret_configs(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/notion.yaml" in contents["files"]
        assert "config/email.yaml" in contents["files"]

    def test_includes_extra_configs(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/llm.yaml" in contents["files"]
        assert "config/resume_keywords.yaml" in contents["files"]
        assert "config/server.yaml" in contents["files"]

    def test_includes_db_by_default(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["includes_db"] is True
        assert any(name.endswith(".db") for name in contents["files"])

    def test_excludes_db_when_flag_false(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance, include_db=False))
        assert contents["manifest"]["includes_db"] is False
        assert not any(name.endswith(".db") for name in contents["files"])

    def test_silently_skips_missing_files(self, tmp_path):
        # The fixture never writes tokens.yaml — its absence must not raise.
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/tokens.yaml" not in contents["files"]

    def test_manifest_contains_source_label(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["source"] == "peregrine"

    def test_source_label_override(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance, source_label="custom-label"))
        assert contents["manifest"]["source"] == "custom-label"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Legacy instance (staging.db at repo root)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestLegacyInstance:
    """Legacy /devl/job-seeker layout: staging.db at repo root, fewer configs."""

    def test_picks_up_root_db(self, tmp_path):
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        contents = list_backup_contents(create_backup(instance))
        assert "staging.db" in contents["files"]
        assert "data/staging.db" not in contents["files"]

    def test_source_label_is_job_seeker(self, tmp_path):
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["source"] == "job-seeker"

    def test_missing_peregrine_only_configs_skipped(self, tmp_path):
        """Legacy doesn't have server.yaml, user.yaml, etc. — should not error."""
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        # Remove server.yaml to simulate the legacy layout (it won't exist there).
        (instance / "config" / "server.yaml").unlink()
        contents = list_backup_contents(create_backup(instance))
        assert "config/server.yaml" not in contents["files"]
        assert "config/notion.yaml" in contents["files"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# list_backup_contents
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestListBackupContents:
    """list_backup_contents: return shape, size accounting, manifest exclusion."""

    def test_returns_manifest_and_files(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        for key in ("manifest", "files", "sizes", "total_bytes"):
            assert key in contents

    def test_total_bytes_is_sum_of_file_sizes(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        expected = sum(contents["sizes"][f] for f in contents["files"] if f in contents["sizes"])
        assert contents["total_bytes"] == expected

    def test_manifest_not_in_files_list(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        assert "backup-manifest.json" not in contents["files"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# restore_backup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRestoreBackup:
    """restore_backup(): full restore, the include_db flag, and overwrite rules."""

    def test_restores_all_files(self, tmp_path):
        """A plain restore writes at least one file, including config files."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target)
        assert outcome["restored"]
        assert (target / "config" / "notion.yaml").exists()

    def test_skips_db_when_flag_false(self, tmp_path):
        """include_db=False routes every .db file to 'skipped', none to 'restored'."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target, include_db=False)
        restored_dbs = [name for name in outcome["restored"] if name.endswith(".db")]
        skipped_dbs = [name for name in outcome["skipped"] if name.endswith(".db")]
        assert not restored_dbs
        assert skipped_dbs

    def test_no_overwrite_skips_existing(self, tmp_path):
        """overwrite=False leaves pre-existing files untouched and reports them."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        pre_existing = target / "config" / "notion.yaml"
        pre_existing.write_text("original content")
        outcome = restore_backup(create_backup(source), target, overwrite=False)
        assert "config/notion.yaml" in outcome["skipped"]
        assert pre_existing.read_text() == "original content"

    def test_overwrite_replaces_existing(self, tmp_path):
        """overwrite=True replaces a stale file with the backed-up content."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        (target / "config" / "notion.yaml").write_text("stale content")
        restore_backup(create_backup(source), target, overwrite=True)
        assert (target / "config" / "notion.yaml").read_text() == "token: secret"

    def test_roundtrip_preserves_content(self, tmp_path):
        """Backup then restore reproduces file content byte-for-byte."""
        source = _make_instance(tmp_path, "peregrine")
        original_text = (source / "config" / "notion.yaml").read_text()
        target = tmp_path / "restored"
        target.mkdir()
        restore_backup(create_backup(source), target)
        assert (target / "config" / "notion.yaml").read_text() == original_text
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _detect_source_label
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDetectSourceLabel:
    """_detect_source_label() reports the instance directory's own name."""

    def test_returns_directory_name(self, tmp_path):
        instance_dir = tmp_path / "peregrine"
        instance_dir.mkdir()
        assert _detect_source_label(instance_dir) == "peregrine"

    def test_legacy_label(self, tmp_path):
        # A legacy "job-seeker" directory is labeled by its name as well.
        legacy_dir = tmp_path / "job-seeker"
        legacy_dir.mkdir()
        assert _detect_source_label(legacy_dir) == "job-seeker"
148
tests/test_suggest_helpers.py
Normal file
148
tests/test_suggest_helpers.py
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
"""Tests for scripts/suggest_helpers.py."""
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
# Resume fixture shipped in the repo's config/ directory; passed as the
# resume_path argument throughout these tests.
RESUME_PATH = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
# ── _parse_json ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_parse_json_extracts_valid_object():
    """_parse_json pulls the embedded JSON object out of surrounding chatter."""
    from scripts.suggest_helpers import _parse_json

    noisy_reply = 'Here is the result: {"a": [1, 2], "b": "hello"} done.'
    parsed = _parse_json(noisy_reply)
    assert parsed == {"a": [1, 2], "b": "hello"}
||||||
|
|
||||||
|
def test_parse_json_returns_empty_on_invalid():
    """Unparseable input degrades to an empty dict instead of raising."""
    from scripts.suggest_helpers import _parse_json

    for bad_input in ("no json here", '{"broken": '):
        assert _parse_json(bad_input) == {}
||||||
|
|
||||||
|
# ── suggest_search_terms ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Shared fixtures passed to suggest_search_terms() in the tests below.
# BLOCKLIST companies ("Meta", "Amazon") are asserted to appear in the prompt.
BLOCKLIST = {
    "companies": ["Meta", "Amazon"],
    "industries": ["gambling"],
    "locations": [],
}
# USER_PROFILE: career_summary and the non-empty mission preference are
# asserted to appear in the prompt sent to the LLM.
USER_PROFILE = {
    "career_summary": "Customer success leader with 10 years in B2B SaaS.",
    "mission_preferences": {
        "animal_welfare": "I volunteer at my local shelter.",
        "education": "",  # intentionally empty entry
    },
    "nda_companies": ["Acme Corp"],
}
|
|
||||||
|
|
||||||
|
def _mock_llm(response_dict: dict):
    """Build a patcher so that, while active, LLMRouter().complete() returns
    *response_dict* serialized as a JSON string."""
    canned_reply = json.dumps(response_dict)
    fake_router = MagicMock()
    fake_router.complete.return_value = canned_reply
    return patch("scripts.suggest_helpers.LLMRouter", return_value=fake_router)
||||||
|
|
||||||
|
def test_suggest_search_terms_returns_titles_and_excludes():
    """The LLM's suggested titles/excludes pass through to the caller."""
    from scripts.suggest_helpers import suggest_search_terms

    canned = {
        "suggested_titles": ["VP Customer Success"],
        "suggested_excludes": ["cold calling"],
    }
    with _mock_llm(canned):
        out = suggest_search_terms(
            ["Customer Success Manager"], RESUME_PATH, BLOCKLIST, USER_PROFILE
        )
    assert out["suggested_titles"] == ["VP Customer Success"]
    assert out["suggested_excludes"] == ["cold calling"]
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_blocklist_companies():
    """Blocked company names must be present in the prompt sent to the LLM."""
    from scripts.suggest_helpers import suggest_search_terms

    empty_reply = {"suggested_titles": [], "suggested_excludes": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    # First positional arg of complete() is the prompt string.
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    for company in ("Meta", "Amazon"):
        assert company in sent_prompt
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_mission():
    """A filled-in mission preference should appear in the LLM prompt.

    Fix: the original checked the underscored form ("animal_welfare")
    case-sensitively while checking the spaced form case-insensitively.
    Both alternatives are now matched against the lowercased prompt, so a
    prompt containing e.g. "Animal_Welfare" no longer fails spuriously.
    """
    from scripts.suggest_helpers import suggest_search_terms

    with _mock_llm({"suggested_titles": [], "suggested_excludes": []}) as mock_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    # First positional arg of complete() is the prompt string.
    prompt_lower = mock_cls.return_value.complete.call_args[0][0].lower()
    assert "animal_welfare" in prompt_lower or "animal welfare" in prompt_lower
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_career_summary():
    """The user's career summary text appears in the prompt sent to the LLM."""
    from scripts.suggest_helpers import suggest_search_terms

    empty_reply = {"suggested_titles": [], "suggested_excludes": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    assert "Customer success leader" in sent_prompt
||||||
|
|
||||||
|
def test_suggest_search_terms_returns_empty_on_bad_json():
    """A refusal / non-JSON reply maps to empty suggestion lists."""
    from scripts.suggest_helpers import suggest_search_terms

    refusing_router = MagicMock()
    refusing_router.complete.return_value = "sorry, I cannot help with that"
    with patch("scripts.suggest_helpers.LLMRouter", return_value=refusing_router):
        out = suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    assert out == {"suggested_titles": [], "suggested_excludes": []}
||||||
|
|
||||||
|
def test_suggest_search_terms_raises_on_llm_exhausted():
    """Backend-exhaustion errors from the router propagate to the caller."""
    from scripts.suggest_helpers import suggest_search_terms

    failing_router = MagicMock()
    failing_router.complete.side_effect = RuntimeError("All LLM backends exhausted")
    with patch("scripts.suggest_helpers.LLMRouter", return_value=failing_router), \
            pytest.raises(RuntimeError, match="All LLM backends exhausted"):
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
||||||
|
|
||||||
|
# ── suggest_resume_keywords ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Keywords the user has already selected; the tests below assert these are
# included in the prompt so the LLM can avoid re-suggesting them.
CURRENT_KW = {
    "skills": ["Customer Success", "SQL"],
    "domains": ["B2B SaaS"],
    "keywords": ["NPS"],
}
||||||
|
|
||||||
|
def test_suggest_resume_keywords_returns_all_three_categories():
    """The result always carries the skills/domains/keywords buckets."""
    from scripts.suggest_helpers import suggest_resume_keywords

    canned = {
        "skills": ["Project Management"],
        "domains": ["EdTech"],
        "keywords": ["churn prevention"],
    }
    with _mock_llm(canned):
        out = suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    for bucket in ("skills", "domains", "keywords"):
        assert bucket in out
||||||
|
|
||||||
|
def test_suggest_resume_keywords_excludes_already_selected():
    """Already-selected tags are surfaced in the prompt so the LLM skips them."""
    from scripts.suggest_helpers import suggest_resume_keywords

    empty_reply = {"skills": [], "domains": [], "keywords": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    assert "Customer Success" in sent_prompt
    assert "NPS" in sent_prompt
||||||
|
|
||||||
|
def test_suggest_resume_keywords_returns_empty_on_bad_json():
    """A refusal / non-JSON reply maps to empty keyword categories."""
    from scripts.suggest_helpers import suggest_resume_keywords

    refusing_router = MagicMock()
    refusing_router.complete.return_value = "I cannot assist."
    with patch("scripts.suggest_helpers.LLMRouter", return_value=refusing_router):
        out = suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    assert out == {"skills": [], "domains": [], "keywords": []}
||||||
|
|
||||||
|
def test_suggest_resume_keywords_raises_on_llm_exhausted():
    """Backend-exhaustion errors from the router propagate to the caller."""
    from scripts.suggest_helpers import suggest_resume_keywords

    failing_router = MagicMock()
    failing_router.complete.side_effect = RuntimeError("All LLM backends exhausted")
    with patch("scripts.suggest_helpers.LLMRouter", return_value=failing_router), \
            pytest.raises(RuntimeError, match="All LLM backends exhausted"):
        suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
|
||||||
Loading…
Reference in a new issue