Compare commits
11 commits
2514173c6f
...
a6d787fed2
| Author | SHA1 | Date | |
|---|---|---|---|
| a6d787fed2 | |||
| be2690af7b | |||
| a30d86ddf5 | |||
| 6dfa4a0949 | |||
| 93fb452941 | |||
| e0063e237b | |||
| bf23987c11 | |||
| 87b7892e43 | |||
| e0f69a9db6 | |||
| ddf07c52ab | |||
| 64db154a87 |
11 changed files with 2112 additions and 71 deletions
7
PRIVACY.md
Normal file
7
PRIVACY.md
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
# Privacy Policy
|
||||||
|
|
||||||
|
CircuitForge LLC's privacy policy applies to this product and is published at:
|
||||||
|
|
||||||
|
**<https://circuitforge.tech/privacy>**
|
||||||
|
|
||||||
|
Last reviewed: March 2026.
|
||||||
41
app/Home.py
41
app/Home.py
|
|
@ -25,17 +25,45 @@ from scripts.task_runner import submit_task
|
||||||
|
|
||||||
init_db(DEFAULT_DB)
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
def _email_configured() -> bool:
|
||||||
|
_e = Path(__file__).parent.parent / "config" / "email.yaml"
|
||||||
|
if not _e.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_e.read_text()) or {}
|
||||||
|
return bool(_cfg.get("username") or _cfg.get("user") or _cfg.get("imap_host"))
|
||||||
|
|
||||||
|
def _notion_configured() -> bool:
|
||||||
|
_n = Path(__file__).parent.parent / "config" / "notion.yaml"
|
||||||
|
if not _n.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_n.read_text()) or {}
|
||||||
|
return bool(_cfg.get("token"))
|
||||||
|
|
||||||
|
def _keywords_configured() -> bool:
|
||||||
|
_k = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||||||
|
if not _k.exists():
|
||||||
|
return False
|
||||||
|
import yaml as _yaml
|
||||||
|
_cfg = _yaml.safe_load(_k.read_text()) or {}
|
||||||
|
return bool(_cfg.get("keywords") or _cfg.get("required") or _cfg.get("preferred"))
|
||||||
|
|
||||||
_SETUP_BANNERS = [
|
_SETUP_BANNERS = [
|
||||||
{"key": "connect_cloud", "text": "Connect a cloud service for resume/cover letter storage",
|
{"key": "connect_cloud", "text": "Connect a cloud service for resume/cover letter storage",
|
||||||
"link_label": "Settings → Integrations"},
|
"link_label": "Settings → Integrations",
|
||||||
|
"done": _notion_configured},
|
||||||
{"key": "setup_email", "text": "Set up email sync to catch recruiter outreach",
|
{"key": "setup_email", "text": "Set up email sync to catch recruiter outreach",
|
||||||
"link_label": "Settings → Email"},
|
"link_label": "Settings → Email",
|
||||||
|
"done": _email_configured},
|
||||||
{"key": "setup_email_labels", "text": "Set up email label filters for auto-classification",
|
{"key": "setup_email_labels", "text": "Set up email label filters for auto-classification",
|
||||||
"link_label": "Settings → Email (label guide)"},
|
"link_label": "Settings → Email (label guide)",
|
||||||
|
"done": _email_configured},
|
||||||
{"key": "tune_mission", "text": "Tune your mission preferences for better cover letters",
|
{"key": "tune_mission", "text": "Tune your mission preferences for better cover letters",
|
||||||
"link_label": "Settings → My Profile"},
|
"link_label": "Settings → My Profile"},
|
||||||
{"key": "configure_keywords", "text": "Configure keywords and blocklist for smarter search",
|
{"key": "configure_keywords", "text": "Configure keywords and blocklist for smarter search",
|
||||||
"link_label": "Settings → Search"},
|
"link_label": "Settings → Search",
|
||||||
|
"done": _keywords_configured},
|
||||||
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
||||||
"link_label": "Settings → Fine-Tune"},
|
"link_label": "Settings → Fine-Tune"},
|
||||||
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
||||||
|
|
@ -513,7 +541,10 @@ with st.expander("⚠️ Danger Zone", expanded=False):
|
||||||
# ── Setup banners ─────────────────────────────────────────────────────────────
|
# ── Setup banners ─────────────────────────────────────────────────────────────
|
||||||
if _profile and _profile.wizard_complete:
|
if _profile and _profile.wizard_complete:
|
||||||
_dismissed = set(_profile.dismissed_banners)
|
_dismissed = set(_profile.dismissed_banners)
|
||||||
_pending_banners = [b for b in _SETUP_BANNERS if b["key"] not in _dismissed]
|
_pending_banners = [
|
||||||
|
b for b in _SETUP_BANNERS
|
||||||
|
if b["key"] not in _dismissed and not b.get("done", lambda: False)()
|
||||||
|
]
|
||||||
if _pending_banners:
|
if _pending_banners:
|
||||||
st.divider()
|
st.divider()
|
||||||
st.markdown("#### Finish setting up Peregrine")
|
st.markdown("#### Finish setting up Peregrine")
|
||||||
|
|
|
||||||
|
|
@ -36,47 +36,18 @@ def save_yaml(path: Path, data: dict) -> None:
|
||||||
path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
||||||
|
|
||||||
|
|
||||||
def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
|
from scripts.suggest_helpers import (
|
||||||
"""Call LLM to suggest additional job titles and exclude keywords."""
|
suggest_search_terms as _suggest_search_terms_impl,
|
||||||
import json
|
suggest_resume_keywords as _suggest_resume_keywords,
|
||||||
import re
|
)
|
||||||
from scripts.llm_router import LLMRouter
|
|
||||||
|
|
||||||
resume_context = ""
|
def _suggest_search_terms(current_titles, resume_path, blocklist=None, user_profile=None):
|
||||||
if resume_path.exists():
|
return _suggest_search_terms_impl(
|
||||||
resume = load_yaml(resume_path)
|
current_titles,
|
||||||
lines = []
|
resume_path,
|
||||||
for exp in (resume.get("experience_details") or [])[:3]:
|
blocklist or {},
|
||||||
pos = exp.get("position", "")
|
user_profile or {},
|
||||||
co = exp.get("company", "")
|
)
|
||||||
skills = ", ".join((exp.get("skills_acquired") or [])[:5])
|
|
||||||
lines.append(f"- {pos} at {co}: {skills}")
|
|
||||||
resume_context = "\n".join(lines)
|
|
||||||
|
|
||||||
titles_str = "\n".join(f"- {t}" for t in current_titles)
|
|
||||||
prompt = f"""You are helping a job seeker optimize their search criteria.
|
|
||||||
|
|
||||||
Their background (from resume):
|
|
||||||
{resume_context or "Customer success and technical account management leader"}
|
|
||||||
|
|
||||||
Current job titles being searched:
|
|
||||||
{titles_str}
|
|
||||||
|
|
||||||
Suggest:
|
|
||||||
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
|
|
||||||
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)
|
|
||||||
|
|
||||||
Return ONLY valid JSON in this exact format:
|
|
||||||
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""
|
|
||||||
|
|
||||||
result = LLMRouter().complete(prompt).strip()
|
|
||||||
m = re.search(r"\{.*\}", result, re.DOTALL)
|
|
||||||
if m:
|
|
||||||
try:
|
|
||||||
return json.loads(m.group())
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return {"suggested_titles": [], "suggested_excludes": []}
|
|
||||||
|
|
||||||
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
||||||
|
|
||||||
|
|
@ -324,6 +295,18 @@ with tab_search:
|
||||||
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
||||||
st.session_state["_sp_hash"] = _sp_hash
|
st.session_state["_sp_hash"] = _sp_hash
|
||||||
|
|
||||||
|
# Apply any pending programmatic updates BEFORE widgets are instantiated.
|
||||||
|
# Streamlit forbids writing to a widget's key after it renders on the same pass;
|
||||||
|
# button handlers write to *_pending keys instead, consumed here on the next pass.
|
||||||
|
for _pend, _wkey in [("_sp_titles_pending", "_sp_titles_multi"),
|
||||||
|
("_sp_locs_pending", "_sp_locations_multi"),
|
||||||
|
("_sp_new_title_pending", "_sp_new_title"),
|
||||||
|
("_sp_paste_titles_pending", "_sp_paste_titles"),
|
||||||
|
("_sp_new_loc_pending", "_sp_new_loc"),
|
||||||
|
("_sp_paste_locs_pending", "_sp_paste_locs")]:
|
||||||
|
if _pend in st.session_state:
|
||||||
|
st.session_state[_wkey] = st.session_state.pop(_pend)
|
||||||
|
|
||||||
# ── Titles ────────────────────────────────────────────────────────────────
|
# ── Titles ────────────────────────────────────────────────────────────────
|
||||||
_title_row, _suggest_btn_col = st.columns([4, 1])
|
_title_row, _suggest_btn_col = st.columns([4, 1])
|
||||||
with _title_row:
|
with _title_row:
|
||||||
|
|
@ -331,7 +314,7 @@ with tab_search:
|
||||||
with _suggest_btn_col:
|
with _suggest_btn_col:
|
||||||
st.write("")
|
st.write("")
|
||||||
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
||||||
help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")
|
help="Ask the LLM to suggest additional titles and smarter exclude keywords — using your blocklist, mission values, and career background.")
|
||||||
|
|
||||||
st.multiselect(
|
st.multiselect(
|
||||||
"Job titles",
|
"Job titles",
|
||||||
|
|
@ -355,8 +338,8 @@ with tab_search:
|
||||||
st.session_state["_sp_title_options"] = _opts
|
st.session_state["_sp_title_options"] = _opts
|
||||||
if _t not in _sel:
|
if _t not in _sel:
|
||||||
_sel.append(_t)
|
_sel.append(_t)
|
||||||
st.session_state["_sp_titles_multi"] = _sel
|
st.session_state["_sp_titles_pending"] = _sel
|
||||||
st.session_state["_sp_new_title"] = ""
|
st.session_state["_sp_new_title_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
with st.expander("📋 Paste a list of titles"):
|
with st.expander("📋 Paste a list of titles"):
|
||||||
st.text_area("One title per line", key="_sp_paste_titles", height=80, label_visibility="collapsed",
|
st.text_area("One title per line", key="_sp_paste_titles", height=80, label_visibility="collapsed",
|
||||||
|
|
@ -371,23 +354,34 @@ with tab_search:
|
||||||
if _t not in _sel:
|
if _t not in _sel:
|
||||||
_sel.append(_t)
|
_sel.append(_t)
|
||||||
st.session_state["_sp_title_options"] = _opts
|
st.session_state["_sp_title_options"] = _opts
|
||||||
st.session_state["_sp_titles_multi"] = _sel
|
st.session_state["_sp_titles_pending"] = _sel
|
||||||
st.session_state["_sp_paste_titles"] = ""
|
st.session_state["_sp_paste_titles_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
# ── LLM suggestions panel ────────────────────────────────────────────────
|
# ── LLM suggestions panel ────────────────────────────────────────────────
|
||||||
if _run_suggest:
|
if _run_suggest:
|
||||||
_current_titles = list(st.session_state.get("_sp_titles_multi", []))
|
_current_titles = list(st.session_state.get("_sp_titles_multi", []))
|
||||||
|
_blocklist = load_yaml(BLOCKLIST_CFG)
|
||||||
|
_user_profile = load_yaml(USER_CFG)
|
||||||
with st.spinner("Asking LLM for suggestions…"):
|
with st.spinner("Asking LLM for suggestions…"):
|
||||||
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH)
|
try:
|
||||||
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH, _blocklist, _user_profile)
|
||||||
_opts = list(st.session_state.get("_sp_title_options", []))
|
except RuntimeError as _e:
|
||||||
for _t in suggestions.get("suggested_titles", []):
|
st.warning(
|
||||||
if _t not in _opts:
|
f"No LLM backend available: {_e}. "
|
||||||
_opts.append(_t)
|
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||||
st.session_state["_sp_title_options"] = _opts
|
icon="⚠️",
|
||||||
st.session_state["_sp_suggestions"] = suggestions
|
)
|
||||||
st.rerun()
|
suggestions = None
|
||||||
|
if suggestions is not None:
|
||||||
|
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
||||||
|
_opts = list(st.session_state.get("_sp_title_options", []))
|
||||||
|
for _t in suggestions.get("suggested_titles", []):
|
||||||
|
if _t not in _opts:
|
||||||
|
_opts.append(_t)
|
||||||
|
st.session_state["_sp_title_options"] = _opts
|
||||||
|
st.session_state["_sp_suggestions"] = suggestions
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
if st.session_state.get("_sp_suggestions"):
|
if st.session_state.get("_sp_suggestions"):
|
||||||
sugg = st.session_state["_sp_suggestions"]
|
sugg = st.session_state["_sp_suggestions"]
|
||||||
|
|
@ -436,8 +430,8 @@ with tab_search:
|
||||||
st.session_state["_sp_loc_options"] = _opts
|
st.session_state["_sp_loc_options"] = _opts
|
||||||
if _l not in _sel:
|
if _l not in _sel:
|
||||||
_sel.append(_l)
|
_sel.append(_l)
|
||||||
st.session_state["_sp_locations_multi"] = _sel
|
st.session_state["_sp_locs_pending"] = _sel
|
||||||
st.session_state["_sp_new_loc"] = ""
|
st.session_state["_sp_new_loc_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
with st.expander("📋 Paste a list of locations"):
|
with st.expander("📋 Paste a list of locations"):
|
||||||
st.text_area("One location per line", key="_sp_paste_locs", height=80, label_visibility="collapsed",
|
st.text_area("One location per line", key="_sp_paste_locs", height=80, label_visibility="collapsed",
|
||||||
|
|
@ -452,8 +446,8 @@ with tab_search:
|
||||||
if _l not in _sel:
|
if _l not in _sel:
|
||||||
_sel.append(_l)
|
_sel.append(_l)
|
||||||
st.session_state["_sp_loc_options"] = _opts
|
st.session_state["_sp_loc_options"] = _opts
|
||||||
st.session_state["_sp_locations_multi"] = _sel
|
st.session_state["_sp_locs_pending"] = _sel
|
||||||
st.session_state["_sp_paste_locs"] = ""
|
st.session_state["_sp_paste_locs_pending"] = ""
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
st.subheader("Exclude Keywords")
|
st.subheader("Exclude Keywords")
|
||||||
|
|
@ -747,11 +741,33 @@ with tab_resume:
|
||||||
st.balloons()
|
st.balloons()
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.subheader("🏷️ Skills & Keywords")
|
_kw_header_col, _kw_btn_col = st.columns([5, 1])
|
||||||
st.caption(
|
with _kw_header_col:
|
||||||
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
st.subheader("🏷️ Skills & Keywords")
|
||||||
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
st.caption(
|
||||||
)
|
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
||||||
|
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
||||||
|
)
|
||||||
|
with _kw_btn_col:
|
||||||
|
st.write("")
|
||||||
|
st.write("")
|
||||||
|
_run_kw_suggest = st.button(
|
||||||
|
"✨ Suggest", key="kw_suggest_btn",
|
||||||
|
help="Ask the LLM to suggest skills, domains, and keywords based on your resume.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if _run_kw_suggest:
|
||||||
|
_kw_current = load_yaml(KEYWORDS_CFG) if KEYWORDS_CFG.exists() else {}
|
||||||
|
with st.spinner("Asking LLM for keyword suggestions…"):
|
||||||
|
try:
|
||||||
|
_kw_sugg = _suggest_resume_keywords(RESUME_PATH, _kw_current)
|
||||||
|
st.session_state["_kw_suggestions"] = _kw_sugg
|
||||||
|
except RuntimeError as _e:
|
||||||
|
st.warning(
|
||||||
|
f"No LLM backend available: {_e}. "
|
||||||
|
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||||
|
icon="⚠️",
|
||||||
|
)
|
||||||
|
|
||||||
from scripts.skills_utils import load_suggestions as _load_sugg, filter_tag as _filter_tag
|
from scripts.skills_utils import load_suggestions as _load_sugg, filter_tag as _filter_tag
|
||||||
|
|
||||||
|
|
@ -815,6 +831,33 @@ with tab_resume:
|
||||||
save_yaml(KEYWORDS_CFG, kw_data)
|
save_yaml(KEYWORDS_CFG, kw_data)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
|
# ── LLM keyword suggestion chips ──────────────────────────────────────
|
||||||
|
_kw_sugg_data = st.session_state.get("_kw_suggestions")
|
||||||
|
if _kw_sugg_data:
|
||||||
|
_KW_ICONS = {"skills": "🛠️", "domains": "🏢", "keywords": "🔑"}
|
||||||
|
_any_shown = False
|
||||||
|
for _cat, _icon in _KW_ICONS.items():
|
||||||
|
_cat_sugg = [t for t in _kw_sugg_data.get(_cat, [])
|
||||||
|
if t not in kw_data.get(_cat, [])]
|
||||||
|
if not _cat_sugg:
|
||||||
|
continue
|
||||||
|
_any_shown = True
|
||||||
|
st.caption(f"**{_icon} {_cat.capitalize()} suggestions** — click to add:")
|
||||||
|
_chip_cols = st.columns(min(len(_cat_sugg), 4))
|
||||||
|
for _i, _tag in enumerate(_cat_sugg):
|
||||||
|
with _chip_cols[_i % 4]:
|
||||||
|
if st.button(f"+ {_tag}", key=f"kw_sugg_{_cat}_{_i}"):
|
||||||
|
_new_list = list(kw_data.get(_cat, [])) + [_tag]
|
||||||
|
kw_data[_cat] = _new_list
|
||||||
|
save_yaml(KEYWORDS_CFG, kw_data)
|
||||||
|
_kw_sugg_data[_cat] = [t for t in _kw_sugg_data[_cat] if t != _tag]
|
||||||
|
st.session_state["_kw_suggestions"] = _kw_sugg_data
|
||||||
|
st.rerun()
|
||||||
|
if _any_shown:
|
||||||
|
if st.button("✕ Clear suggestions", key="kw_clear_sugg"):
|
||||||
|
st.session_state.pop("_kw_suggestions", None)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
# ── System tab ────────────────────────────────────────────────────────────────
|
# ── System tab ────────────────────────────────────────────────────────────────
|
||||||
with tab_system:
|
with tab_system:
|
||||||
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
||||||
|
|
@ -1015,8 +1058,10 @@ with tab_system:
|
||||||
with st.expander("🔌 Services", expanded=True):
|
with st.expander("🔌 Services", expanded=True):
|
||||||
import subprocess as _sp
|
import subprocess as _sp
|
||||||
import shutil as _shutil
|
import shutil as _shutil
|
||||||
|
import os as _os
|
||||||
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
||||||
COMPOSE_DIR = str(Path(__file__).parent.parent.parent)
|
COMPOSE_DIR = str(Path(__file__).parent.parent.parent)
|
||||||
|
_compose_env = {**_os.environ, "COMPOSE_PROJECT_NAME": "peregrine"}
|
||||||
_docker_available = bool(_shutil.which("docker"))
|
_docker_available = bool(_shutil.which("docker"))
|
||||||
_sys_profile_name = _profile.inference_profile if _profile else "remote"
|
_sys_profile_name = _profile.inference_profile if _profile else "remote"
|
||||||
SYS_SERVICES = [
|
SYS_SERVICES = [
|
||||||
|
|
@ -1108,7 +1153,7 @@ with tab_system:
|
||||||
elif up:
|
elif up:
|
||||||
if st.button("⏹ Stop", key=f"sys_svc_stop_{svc['port']}", use_container_width=True):
|
if st.button("⏹ Stop", key=f"sys_svc_stop_{svc['port']}", use_container_width=True):
|
||||||
with st.spinner(f"Stopping {svc['name']}…"):
|
with st.spinner(f"Stopping {svc['name']}…"):
|
||||||
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"])
|
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||||
st.success("Stopped.") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
st.success("Stopped.") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
else:
|
else:
|
||||||
|
|
@ -1119,7 +1164,7 @@ with tab_system:
|
||||||
_start_cmd.append(_sel)
|
_start_cmd.append(_sel)
|
||||||
if st.button("▶ Start", key=f"sys_svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
if st.button("▶ Start", key=f"sys_svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
||||||
with st.spinner(f"Starting {svc['name']}…"):
|
with st.spinner(f"Starting {svc['name']}…"):
|
||||||
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"])
|
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||||
st.success("Started!") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
st.success("Started!") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ services:
|
||||||
- ./config:/app/config
|
- ./config:/app/config
|
||||||
- ./data:/app/data
|
- ./data:/app/data
|
||||||
- ${DOCS_DIR:-~/Documents/JobSearch}:/docs
|
- ${DOCS_DIR:-~/Documents/JobSearch}:/docs
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
- /usr/bin/docker:/usr/bin/docker:ro
|
||||||
environment:
|
environment:
|
||||||
- STAGING_DB=/app/data/staging.db
|
- STAGING_DB=/app/data/staging.db
|
||||||
- DOCS_DIR=/docs
|
- DOCS_DIR=/docs
|
||||||
|
|
|
||||||
242
docs/plans/2026-03-05-digest-parsers-design.md
Normal file
242
docs/plans/2026-03-05-digest-parsers-design.md
Normal file
|
|
@ -0,0 +1,242 @@
|
||||||
|
# Digest Email Parsers — Design
|
||||||
|
|
||||||
|
**Date:** 2026-03-05
|
||||||
|
**Products:** Peregrine (primary), Avocet (bucket)
|
||||||
|
**Status:** Design approved, ready for implementation planning
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
Peregrine's `imap_sync.py` can extract leads from digest emails, but only for LinkedIn — the
|
||||||
|
parser is hardcoded inline with no extension point. Adzuna and The Ladders digest emails are
|
||||||
|
unhandled. Additionally, any digest email from an unknown sender is silently dropped with no
|
||||||
|
way to collect samples for building new parsers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solution Overview
|
||||||
|
|
||||||
|
Two complementary changes:
|
||||||
|
|
||||||
|
1. **`peregrine/scripts/digest_parsers.py`** — a standalone parser module with a sender registry
|
||||||
|
and dispatcher. `imap_sync.py` calls a single function; the registry handles dispatch.
|
||||||
|
LinkedIn parser moves here; Adzuna and Ladders parsers are built against real IMAP samples.
|
||||||
|
|
||||||
|
2. **Avocet digest bucket** — when a user labels an email as `digest` in the Avocet label UI,
|
||||||
|
the email is appended to `data/digest_samples.jsonl`. This file is the corpus for building
|
||||||
|
and testing new parsers for senders not yet in the registry.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Production path (Peregrine)
|
||||||
|
|
||||||
|
```
|
||||||
|
imap_sync._scan_unmatched_leads()
|
||||||
|
│
|
||||||
|
├─ parse_digest(from_addr, body)
|
||||||
|
│ │
|
||||||
|
│ ├─ None → unknown sender → fall through to LLM extraction (unchanged)
|
||||||
|
│ ├─ [] → known sender, nothing found → skip
|
||||||
|
│ └─ [...] → jobs found → insert_job() + submit_task("scrape_url")
|
||||||
|
│
|
||||||
|
└─ continue (digest email consumed; does not reach LLM path)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sample collection path (Avocet)
|
||||||
|
|
||||||
|
```
|
||||||
|
Avocet label UI
|
||||||
|
│
|
||||||
|
└─ label == "digest"
|
||||||
|
│
|
||||||
|
└─ append to data/digest_samples.jsonl
|
||||||
|
│
|
||||||
|
└─ used as reference for building new parsers
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Module: `peregrine/scripts/digest_parsers.py`
|
||||||
|
|
||||||
|
### Parser interface
|
||||||
|
|
||||||
|
Each parser function:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_<source>(body: str) -> list[dict]
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns zero or more job dicts:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"title": str, # job title
|
||||||
|
"company": str, # company name
|
||||||
|
"location": str, # location string (may be empty)
|
||||||
|
"url": str, # canonical URL, tracking params stripped
|
||||||
|
"source": str, # "linkedin" | "adzuna" | "theladders"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dispatcher
|
||||||
|
|
||||||
|
```python
|
||||||
|
DIGEST_PARSERS: dict[str, tuple[str, Callable[[str], list[dict]]]] = {
|
||||||
|
"jobalerts@linkedin.com": ("linkedin", parse_linkedin),
|
||||||
|
"noreply@adzuna.com": ("adzuna", parse_adzuna),
|
||||||
|
"noreply@theladders.com": ("theladders", parse_theladders),
|
||||||
|
}
|
||||||
|
|
||||||
|
def parse_digest(from_addr: str, body: str) -> list[dict] | None:
|
||||||
|
"""
|
||||||
|
Dispatch to the appropriate parser based on sender address.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None — no parser matched (not a known digest sender)
|
||||||
|
[] — parser matched, no extractable jobs found
|
||||||
|
[dict, ...] — one dict per job card extracted
|
||||||
|
"""
|
||||||
|
addr = from_addr.lower()
|
||||||
|
for sender, (source, parse_fn) in DIGEST_PARSERS.items():
|
||||||
|
if sender in addr:
|
||||||
|
return parse_fn(body)
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender matching is a substring check, tolerant of display-name wrappers
|
||||||
|
(`"LinkedIn <jobalerts@linkedin.com>"` matches correctly).
|
||||||
|
|
||||||
|
### Parsers
|
||||||
|
|
||||||
|
**`parse_linkedin`** — moved verbatim from `imap_sync.parse_linkedin_alert()`, renamed.
|
||||||
|
No behavior change.
|
||||||
|
|
||||||
|
**`parse_adzuna`** — built against real Adzuna digest email bodies pulled from the
|
||||||
|
configured IMAP account during implementation. Expected format: job blocks separated
|
||||||
|
by consistent delimiters with title, company, location, and a trackable URL per block.
|
||||||
|
|
||||||
|
**`parse_theladders`** — same approach. The Ladders already has a web scraper in
|
||||||
|
`scripts/custom_boards/theladders.py`; URL canonicalization patterns from there apply here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Changes to `imap_sync.py`
|
||||||
|
|
||||||
|
Replace the LinkedIn-specific block in `_scan_unmatched_leads()` (~lines 561–585):
|
||||||
|
|
||||||
|
**Before:**
|
||||||
|
```python
|
||||||
|
if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
|
||||||
|
cards = parse_linkedin_alert(parsed["body"])
|
||||||
|
for card in cards:
|
||||||
|
# ... LinkedIn-specific insert ...
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**After:**
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_digest # top of file
|
||||||
|
|
||||||
|
cards = parse_digest(parsed["from_addr"], parsed["body"])
|
||||||
|
if cards is not None:
|
||||||
|
for card in cards:
|
||||||
|
if card["url"] in existing_urls:
|
||||||
|
continue
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": card["title"],
|
||||||
|
"company": card["company"],
|
||||||
|
"url": card["url"],
|
||||||
|
"source": card["source"],
|
||||||
|
"location": card["location"],
|
||||||
|
"is_remote": 0,
|
||||||
|
"salary": "",
|
||||||
|
"description": "",
|
||||||
|
"date_found": datetime.now().isoformat()[:10],
|
||||||
|
})
|
||||||
|
if job_id:
|
||||||
|
submit_task(db_path, "scrape_url", job_id)
|
||||||
|
existing_urls.add(card["url"])
|
||||||
|
new_leads += 1
|
||||||
|
print(f"[imap] digest ({card['source']}) → {card['company']} — {card['title']}")
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
`parse_digest` returning `None` falls through to the existing LLM extraction path — all
|
||||||
|
non-digest recruitment emails are completely unaffected.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Avocet: Digest Bucket
|
||||||
|
|
||||||
|
### File
|
||||||
|
|
||||||
|
`avocet/data/digest_samples.jsonl` — gitignored. An `.example` entry is committed.
|
||||||
|
|
||||||
|
Schema matches the existing label queue (JSONL on-disk schema):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"subject": "...", "body": "...", "from_addr": "...", "date": "...", "account": "..."}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
|
||||||
|
In `app/label_tool.py` and `app/api.py`: when a `digest` label is applied, append the
|
||||||
|
email to `digest_samples.jsonl` alongside the normal write to `email_score.jsonl`.
|
||||||
|
|
||||||
|
No Peregrine dependency — if the file path doesn't exist the `data/` directory is created
|
||||||
|
automatically. Avocet remains fully standalone.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
When a new digest sender appears in the wild:
|
||||||
|
1. Label representative emails as `digest` in Avocet → samples land in `digest_samples.jsonl`
|
||||||
|
2. Inspect samples, write `parse_<source>(body)` in `digest_parsers.py`
|
||||||
|
3. Add the sender string to `DIGEST_PARSERS`
|
||||||
|
4. Add fixture test in `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
- Fixture bodies sourced from real IMAP samples (anonymized company names / URLs acceptable)
|
||||||
|
- Each parser: valid body → expected cards returned
|
||||||
|
- Each parser: empty / malformed body → `[]`, no exception
|
||||||
|
- Dispatcher: known sender → correct parser invoked
|
||||||
|
- Dispatcher: unknown sender → `None`
|
||||||
|
- URL canonicalization: tracking params stripped, canonical form asserted
|
||||||
|
- Dedup within digest: same URL appearing twice in one email → one card
|
||||||
|
|
||||||
|
### `avocet/tests/test_digest_bucket.py`
|
||||||
|
|
||||||
|
- `digest` label → row appended to `digest_samples.jsonl`
|
||||||
|
- Any other label → `digest_samples.jsonl` not touched
|
||||||
|
- First write creates `data/` directory if absent
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files Changed / Created
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `peregrine/scripts/digest_parsers.py` | **New** — parser module |
|
||||||
|
| `peregrine/scripts/imap_sync.py` | Replace inline LinkedIn block with `parse_digest()` call |
|
||||||
|
| `peregrine/tests/test_digest_parsers.py` | **New** — parser unit tests |
|
||||||
|
| `avocet/app/label_tool.py` | Append to `digest_samples.jsonl` on `digest` label |
|
||||||
|
| `avocet/app/api.py` | Same — digest bucket write in label endpoint |
|
||||||
|
| `avocet/tests/test_digest_bucket.py` | **New** — bucket write tests |
|
||||||
|
| `avocet/data/digest_samples.jsonl.example` | **New** — committed sample for reference |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Out of Scope
|
||||||
|
|
||||||
|
- Avocet → Peregrine direct import trigger (deferred; bucket is sufficient for now)
|
||||||
|
- `background_tasks` integration for digest re-processing (not needed with bucket approach)
|
||||||
|
- HTML digest parsing (all three senders send plain-text alerts; revisit if needed)
|
||||||
docs/plans/2026-03-05-digest-parsers-plan.md — new file, 897 lines (@@ -0,0 +1,897 @@)
|
||||||
|
# Digest Email Parsers Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Extract job listings from LinkedIn, Adzuna, and The Ladders digest emails into Peregrine leads, with an Avocet bucket that collects digest samples for future parser development.
|
||||||
|
|
||||||
|
**Architecture:** New `peregrine/scripts/digest_parsers.py` exposes a `parse_digest(from_addr, body)` dispatcher backed by a sender registry. `imap_sync.py` replaces its inline LinkedIn block with one dispatcher call. Avocet's two label paths (`label_tool.py` + `api.py`) append digest-labeled emails to `data/digest_samples.jsonl`. Adzuna and Ladders parsers are built from real IMAP samples fetched in Task 2.
|
||||||
|
|
||||||
|
**Tech Stack:** Python stdlib only — `re`, `json`, `pathlib`. No new dependencies.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Create `digest_parsers.py` with dispatcher + LinkedIn parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `peregrine/scripts/digest_parsers.py`
|
||||||
|
- Create: `peregrine/tests/test_digest_parsers.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
`parse_linkedin_alert()` currently lives inline in `imap_sync.py`. We move it here (renamed
|
||||||
|
`parse_linkedin`) and wrap it in a dispatcher. All other parsers plug into the same registry.
|
||||||
|
|
||||||
|
Run all tests with:
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write the failing tests**
|
||||||
|
|
||||||
|
Create `peregrine/tests/test_digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for digest email parser registry."""
|
||||||
|
import pytest
|
||||||
|
from scripts.digest_parsers import parse_digest, parse_linkedin
|
||||||
|
|
||||||
|
# ── LinkedIn fixture ──────────────────────────────────────────────────────────
|
||||||
|
# Mirrors the plain-text format LinkedIn Job Alert emails actually send.
|
||||||
|
# Each job block is separated by a line of 10+ dashes.
|
||||||
|
LINKEDIN_BODY = """\
|
||||||
|
Software Engineer
|
||||||
|
Acme Corp
|
||||||
|
San Francisco, CA
|
||||||
|
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/1111111111/?refId=abc&trackingId=xyz
|
||||||
|
|
||||||
|
--------------------------------------------------
|
||||||
|
Senior Developer
|
||||||
|
Widget Inc
|
||||||
|
Remote
|
||||||
|
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/2222222222/?refId=def
|
||||||
|
"""
|
||||||
|
|
||||||
|
LINKEDIN_BODY_EMPTY = "No jobs matched your alert this week."
|
||||||
|
|
||||||
|
LINKEDIN_BODY_NO_URL = """\
|
||||||
|
Software Engineer
|
||||||
|
Acme Corp
|
||||||
|
San Francisco, CA
|
||||||
|
|
||||||
|
--------------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_linkedin_sender():
|
||||||
|
cards = parse_digest("LinkedIn <jobalerts@linkedin.com>", LINKEDIN_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_unknown_sender_returns_none():
|
||||||
|
result = parse_digest("noreply@randomboard.com", LINKEDIN_BODY)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatcher_case_insensitive_sender():
|
||||||
|
cards = parse_digest("JOBALERTS@LINKEDIN.COM", LINKEDIN_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_returns_correct_fields():
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY)
|
||||||
|
assert cards[0]["title"] == "Software Engineer"
|
||||||
|
assert cards[0]["company"] == "Acme Corp"
|
||||||
|
assert cards[0]["location"] == "San Francisco, CA"
|
||||||
|
assert cards[0]["source"] == "linkedin"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_url_canonicalized():
|
||||||
|
"""Tracking params stripped; canonical jobs/view/<id>/ form."""
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY)
|
||||||
|
assert cards[0]["url"] == "https://www.linkedin.com/jobs/view/1111111111/"
|
||||||
|
assert "refId" not in cards[0]["url"]
|
||||||
|
assert "trackingId" not in cards[0]["url"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_empty_body_returns_empty_list():
|
||||||
|
assert parse_linkedin(LINKEDIN_BODY_EMPTY) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_block_without_url_skipped():
|
||||||
|
cards = parse_linkedin(LINKEDIN_BODY_NO_URL)
|
||||||
|
assert cards == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name 'parse_digest'`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 3: Write `digest_parsers.py`**
|
||||||
|
|
||||||
|
Create `peregrine/scripts/digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Digest email parser registry for Peregrine.
|
||||||
|
|
||||||
|
Each parser extracts job listings from a known digest sender's plain-text body.
|
||||||
|
New parsers are added by decorating with @_register(sender_substring, source_name).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from scripts.digest_parsers import parse_digest
|
||||||
|
|
||||||
|
cards = parse_digest(from_addr, body)
|
||||||
|
# None → unknown sender (fall through to LLM path)
|
||||||
|
# [] → known sender, nothing extractable
|
||||||
|
# [...] → list of {title, company, location, url, source} dicts
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
# ── Registry ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Maps sender substring (lowercased) → (source_name, parse_fn)
|
||||||
|
DIGEST_PARSERS: dict[str, tuple[str, Callable[[str], list[dict]]]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _register(sender: str, source: str):
|
||||||
|
"""Decorator to register a parser for a given sender substring."""
|
||||||
|
def decorator(fn: Callable[[str], list[dict]]):
|
||||||
|
DIGEST_PARSERS[sender.lower()] = (source, fn)
|
||||||
|
return fn
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def parse_digest(from_addr: str, body: str) -> list[dict] | None:
|
||||||
|
"""Dispatch to the appropriate parser based on sender address.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None — no parser matched (caller should use LLM fallback)
|
||||||
|
[] — known sender, no extractable jobs
|
||||||
|
[dict, ...] — one dict per job card with keys:
|
||||||
|
title, company, location, url, source
|
||||||
|
"""
|
||||||
|
addr = from_addr.lower()
|
||||||
|
for sender, (source, parse_fn) in DIGEST_PARSERS.items():
|
||||||
|
if sender in addr:
|
||||||
|
return parse_fn(body)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Shared helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_LINKEDIN_SKIP_PHRASES = {
|
||||||
|
"promoted", "easily apply", "apply now", "job alert",
|
||||||
|
"unsubscribe", "linkedin corporation",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── LinkedIn Job Alert ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("jobalerts@linkedin.com", "linkedin")
|
||||||
|
def parse_linkedin(body: str) -> list[dict]:
|
||||||
|
"""Parse LinkedIn Job Alert digest email body.
|
||||||
|
|
||||||
|
Blocks are separated by lines of 10+ dashes. Each block contains:
|
||||||
|
Line 0: job title
|
||||||
|
Line 1: company
|
||||||
|
Line 2: location (optional)
|
||||||
|
'View job: <url>' → canonicalized to /jobs/view/<id>/
|
||||||
|
"""
|
||||||
|
jobs = []
|
||||||
|
blocks = re.split(r"\n\s*-{10,}\s*\n", body)
|
||||||
|
for block in blocks:
|
||||||
|
lines = [ln.strip() for ln in block.strip().splitlines() if ln.strip()]
|
||||||
|
|
||||||
|
url = None
|
||||||
|
for line in lines:
|
||||||
|
m = re.search(r"View job:\s*(https?://\S+)", line, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
raw_url = m.group(1)
|
||||||
|
job_id_m = re.search(r"/jobs/view/(\d+)", raw_url)
|
||||||
|
if job_id_m:
|
||||||
|
url = f"https://www.linkedin.com/jobs/view/{job_id_m.group(1)}/"
|
||||||
|
break
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
content = [
|
||||||
|
ln for ln in lines
|
||||||
|
if not any(p in ln.lower() for p in _LINKEDIN_SKIP_PHRASES)
|
||||||
|
and not ln.lower().startswith("view job:")
|
||||||
|
and not ln.startswith("http")
|
||||||
|
]
|
||||||
|
if len(content) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
jobs.append({
|
||||||
|
"title": content[0],
|
||||||
|
"company": content[1],
|
||||||
|
"location": content[2] if len(content) > 2 else "",
|
||||||
|
"url": url,
|
||||||
|
"source": "linkedin",
|
||||||
|
})
|
||||||
|
return jobs
|
||||||
|
|
||||||
|
|
||||||
|
# ── Adzuna Job Alert ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("noreply@adzuna.com", "adzuna")
|
||||||
|
def parse_adzuna(body: str) -> list[dict]:
|
||||||
|
"""Parse Adzuna job alert digest email body.
|
||||||
|
|
||||||
|
TODO: implement after reviewing samples in avocet/data/digest_samples.jsonl
|
||||||
|
See Task 3 in docs/plans/2026-03-05-digest-parsers-plan.md
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
# ── The Ladders Job Alert ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@_register("noreply@theladders.com", "theladders")
|
||||||
|
def parse_theladders(body: str) -> list[dict]:
|
||||||
|
"""Parse The Ladders job alert digest email body.
|
||||||
|
|
||||||
|
TODO: implement after reviewing samples in avocet/data/digest_samples.jsonl
|
||||||
|
See Task 4 in docs/plans/2026-03-05-digest-parsers-plan.md
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run tests to verify they pass**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all 7 tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: digest parser registry + LinkedIn parser (moved from imap_sync)"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Fetch digest samples from IMAP
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `avocet/scripts/fetch_digest_samples.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
We need real Adzuna and Ladders email bodies to write parsers against. This one-off script
|
||||||
|
searches the configured IMAP account by sender domain and writes results to
|
||||||
|
`data/digest_samples.jsonl`. Run it once; the output file feeds Tasks 3 and 4.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Create the fetch script**
|
||||||
|
|
||||||
|
Create `avocet/scripts/fetch_digest_samples.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Fetch digest email samples from IMAP into data/digest_samples.jsonl.
|
||||||
|
|
||||||
|
Searches for emails from known digest sender domains, deduplicates against
|
||||||
|
any existing samples, and appends new ones.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/fetch_digest_samples.py
|
||||||
|
|
||||||
|
Reads config/label_tool.yaml for IMAP credentials (first account used).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import imaplib
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
ROOT = Path(__file__).parent.parent
|
||||||
|
CONFIG = ROOT / "config" / "label_tool.yaml"
|
||||||
|
OUTPUT = ROOT / "data" / "digest_samples.jsonl"
|
||||||
|
|
||||||
|
# Sender domains to search — add new ones here as needed
|
||||||
|
DIGEST_SENDERS = [
|
||||||
|
"adzuna.com",
|
||||||
|
"theladders.com",
|
||||||
|
"jobalerts@linkedin.com",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Import shared helpers from avocet
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
from app.imap_fetch import _decode_str, _extract_body, entry_key # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _load_existing_keys() -> set[str]:
|
||||||
|
if not OUTPUT.exists():
|
||||||
|
return set()
|
||||||
|
keys = set()
|
||||||
|
for line in OUTPUT.read_text().splitlines():
|
||||||
|
try:
|
||||||
|
keys.add(entry_key(json.loads(line)))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return keys
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
cfg = yaml.safe_load(CONFIG.read_text())
|
||||||
|
accounts = cfg.get("accounts", [])
|
||||||
|
if not accounts:
|
||||||
|
print("No accounts configured in config/label_tool.yaml")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
acc = accounts[0]
|
||||||
|
host = acc.get("host", "imap.gmail.com")
|
||||||
|
port = int(acc.get("port", 993))
|
||||||
|
use_ssl = acc.get("use_ssl", True)
|
||||||
|
username = acc["username"]
|
||||||
|
password = acc["password"]
|
||||||
|
folder = acc.get("folder", "INBOX")
|
||||||
|
days_back = int(acc.get("days_back", 90))
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import email as _email_lib
|
||||||
|
|
||||||
|
since = (datetime.now() - timedelta(days=days_back)).strftime("%d-%b-%Y")
|
||||||
|
|
||||||
|
conn = (imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4)(host, port)
|
||||||
|
conn.login(username, password)
|
||||||
|
conn.select(folder, readonly=True)
|
||||||
|
|
||||||
|
known_keys = _load_existing_keys()
|
||||||
|
found: list[dict] = []
|
||||||
|
seen_uids: dict[bytes, None] = {}
|
||||||
|
|
||||||
|
for sender in DIGEST_SENDERS:
|
||||||
|
try:
|
||||||
|
_, data = conn.search(None, f'(FROM "{sender}" SINCE "{since}")')
|
||||||
|
for uid in (data[0] or b"").split():
|
||||||
|
seen_uids[uid] = None
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" search error for {sender!r}: {exc}")
|
||||||
|
|
||||||
|
print(f"Found {len(seen_uids)} candidate UIDs across {len(DIGEST_SENDERS)} senders")
|
||||||
|
|
||||||
|
for uid in seen_uids:
|
||||||
|
try:
|
||||||
|
_, raw_data = conn.fetch(uid, "(RFC822)")
|
||||||
|
if not raw_data or not raw_data[0]:
|
||||||
|
continue
|
||||||
|
msg = _email_lib.message_from_bytes(raw_data[0][1])
|
||||||
|
entry = {
|
||||||
|
"subject": _decode_str(msg.get("Subject", "")),
|
||||||
|
"body": _extract_body(msg)[:2000], # larger cap for parser dev
|
||||||
|
"from_addr": _decode_str(msg.get("From", "")),
|
||||||
|
"date": _decode_str(msg.get("Date", "")),
|
||||||
|
"account": acc.get("name", username),
|
||||||
|
}
|
||||||
|
k = entry_key(entry)
|
||||||
|
if k not in known_keys:
|
||||||
|
known_keys.add(k)
|
||||||
|
found.append(entry)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f" fetch error uid {uid}: {exc}")
|
||||||
|
|
||||||
|
conn.logout()
|
||||||
|
|
||||||
|
if not found:
|
||||||
|
print("No new digest samples found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
OUTPUT.parent.mkdir(exist_ok=True)
|
||||||
|
with OUTPUT.open("a", encoding="utf-8") as f:
|
||||||
|
for entry in found:
|
||||||
|
f.write(json.dumps(entry) + "\n")
|
||||||
|
|
||||||
|
print(f"Wrote {len(found)} new samples to {OUTPUT}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run the fetch script**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
conda run -n job-seeker python scripts/fetch_digest_samples.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected output: `Wrote N new samples to data/digest_samples.jsonl`
|
||||||
|
|
||||||
|
**Step 3: Inspect the samples**
|
||||||
|
|
||||||
|
```
|
||||||
|
# View first few entries — look at from_addr and body for Adzuna and Ladders format
|
||||||
|
conda run -n job-seeker python -c "
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
for line in Path('data/digest_samples.jsonl').read_text().splitlines()[:10]:
|
||||||
|
e = json.loads(line)
|
||||||
|
print('FROM:', e['from_addr'])
|
||||||
|
print('SUBJECT:', e['subject'])
|
||||||
|
print('BODY[:500]:', e['body'][:500])
|
||||||
|
print('---')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Note down:
|
||||||
|
- The exact sender addresses for Adzuna and Ladders (update `DIGEST_PARSERS` in `digest_parsers.py` if different from `noreply@adzuna.com` / `noreply@theladders.com`)
|
||||||
|
- The structure of each job block in the body (separator lines, field order, URL format)
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
git add scripts/fetch_digest_samples.py
|
||||||
|
git commit -m "feat: fetch_digest_samples script for building new parsers"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: Build and test Adzuna parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/digest_parsers.py` — implement `parse_adzuna`
|
||||||
|
- Modify: `peregrine/tests/test_digest_parsers.py` — add Adzuna fixtures + tests
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
After running Task 2, you have real Adzuna email bodies in `avocet/data/digest_samples.jsonl`.
|
||||||
|
Inspect them (see Task 2 Step 3), identify the structure, then write the test fixture from
|
||||||
|
a real sample before implementing the parser.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write a failing Adzuna test**
|
||||||
|
|
||||||
|
Inspect a real Adzuna sample from `data/digest_samples.jsonl` and identify:
|
||||||
|
- How job blocks are separated (blank lines? dashes? headers?)
|
||||||
|
- Field order (title first? company first?)
|
||||||
|
- Where the job URL appears and what format it uses
|
||||||
|
- Any noise lines to filter (unsubscribe, promo text, etc.)
|
||||||
|
|
||||||
|
Add to `peregrine/tests/test_digest_parsers.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_adzuna
|
||||||
|
|
||||||
|
# Replace ADZUNA_BODY with a real excerpt from avocet/data/digest_samples.jsonl
|
||||||
|
# Copy 2-3 job blocks verbatim; replace real company names with "Test Co" etc. if desired
|
||||||
|
ADZUNA_BODY = """
|
||||||
|
<paste real Adzuna body excerpt here — 2-3 job blocks>
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_dispatcher_adzuna_sender():
|
||||||
|
# Update sender string if real sender differs from noreply@adzuna.com
|
||||||
|
cards = parse_digest("noreply@adzuna.com", ADZUNA_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) >= 1
|
||||||
|
|
||||||
|
def test_parse_adzuna_fields():
|
||||||
|
cards = parse_adzuna(ADZUNA_BODY)
|
||||||
|
assert cards[0]["title"] # non-empty
|
||||||
|
assert cards[0]["company"] # non-empty
|
||||||
|
assert cards[0]["url"].startswith("http")
|
||||||
|
assert cards[0]["source"] == "adzuna"
|
||||||
|
|
||||||
|
def test_parse_adzuna_url_no_tracking():
|
||||||
|
"""Adzuna URLs often contain tracking params — strip them."""
|
||||||
|
cards = parse_adzuna(ADZUNA_BODY)
|
||||||
|
# Adjust assertion to match actual URL format once you've seen real samples
|
||||||
|
for card in cards:
|
||||||
|
assert "utm_" not in card["url"]
|
||||||
|
|
||||||
|
def test_parse_adzuna_empty_body():
|
||||||
|
assert parse_adzuna("No jobs this week.") == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py::test_parse_adzuna_fields -v
|
||||||
|
```
|
||||||
|
Expected: FAIL (stub returns `[]`)
|
||||||
|
|
||||||
|
**Step 3: Implement `parse_adzuna` in `digest_parsers.py`**
|
||||||
|
|
||||||
|
Replace the stub body of `parse_adzuna` based on the actual email structure you observed.
|
||||||
|
Pattern to follow (adapt field positions to match Adzuna's actual format):
|
||||||
|
|
||||||
|
```python
|
||||||
|
@_register("noreply@adzuna.com", "adzuna") # update sender if needed
|
||||||
|
def parse_adzuna(body: str) -> list[dict]:
|
||||||
|
jobs = []
|
||||||
|
# Split on whatever delimiter Adzuna uses between blocks
|
||||||
|
# e.g.: blocks = re.split(r"\n\s*\n{2,}", body) # double blank line
|
||||||
|
# For each block, extract title, company, location, url
|
||||||
|
# Strip tracking params from URL: re.sub(r"\?.*", "", url) or parse with urllib
|
||||||
|
return jobs
|
||||||
|
```
|
||||||
|
|
||||||
|
If Adzuna sender differs from `noreply@adzuna.com`, update the `@_register` decorator
|
||||||
|
**and** the `DIGEST_PARSERS` key in the registry (they're set by the decorator — just change
|
||||||
|
the decorator argument).
|
||||||
|
|
||||||
|
**Step 4: Run all digest tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/peregrine
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: Adzuna digest email parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Build and test The Ladders parser
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/digest_parsers.py` — implement `parse_theladders`
|
||||||
|
- Modify: `peregrine/tests/test_digest_parsers.py` — add Ladders fixtures + tests
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
Same approach as Task 3. The Ladders already has a web scraper in
|
||||||
|
`scripts/custom_boards/theladders.py` — check it for URL patterns that may apply here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write failing Ladders tests**
|
||||||
|
|
||||||
|
Inspect a real Ladders sample from `avocet/data/digest_samples.jsonl`. Add to test file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_theladders
|
||||||
|
|
||||||
|
# Replace with real Ladders body excerpt
|
||||||
|
LADDERS_BODY = """
|
||||||
|
<paste real Ladders body excerpt here — 2-3 job blocks>
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_dispatcher_ladders_sender():
|
||||||
|
cards = parse_digest("noreply@theladders.com", LADDERS_BODY)
|
||||||
|
assert cards is not None
|
||||||
|
assert len(cards) >= 1
|
||||||
|
|
||||||
|
def test_parse_theladders_fields():
|
||||||
|
cards = parse_theladders(LADDERS_BODY)
|
||||||
|
assert cards[0]["title"]
|
||||||
|
assert cards[0]["company"]
|
||||||
|
assert cards[0]["url"].startswith("http")
|
||||||
|
assert cards[0]["source"] == "theladders"
|
||||||
|
|
||||||
|
def test_parse_theladders_empty_body():
|
||||||
|
assert parse_theladders("No new jobs.") == []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py::test_parse_theladders_fields -v
|
||||||
|
```
|
||||||
|
Expected: FAIL
|
||||||
|
|
||||||
|
**Step 3: Implement `parse_theladders`**
|
||||||
|
|
||||||
|
Replace the stub. The Ladders URLs often use redirect wrappers — canonicalize to the
|
||||||
|
`theladders.com/job/<id>` form if possible, otherwise just strip tracking params.
|
||||||
|
|
||||||
|
**Step 4: Run all digest tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_parsers.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/digest_parsers.py tests/test_digest_parsers.py
|
||||||
|
git commit -m "feat: The Ladders digest email parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: Update `imap_sync.py` to use the dispatcher
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `peregrine/scripts/imap_sync.py`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
The LinkedIn-specific block in `_scan_unmatched_leads()` (search for
|
||||||
|
`_LINKEDIN_ALERT_SENDER`) gets replaced with a generic `parse_digest()` call.
|
||||||
|
The existing behavior is preserved — only the dispatch mechanism changes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Add the import**
|
||||||
|
|
||||||
|
At the top of `imap_sync.py`, alongside other local imports, add:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scripts.digest_parsers import parse_digest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Find the LinkedIn-specific block**
|
||||||
|
|
||||||
|
Search for `_LINKEDIN_ALERT_SENDER` in `imap_sync.py`. The block looks like:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
|
||||||
|
cards = parse_linkedin_alert(parsed["body"])
|
||||||
|
for card in cards:
|
||||||
|
...
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Replace with the generic dispatcher**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ── Digest email — dispatch to parser registry ────────────────────────
|
||||||
|
cards = parse_digest(parsed["from_addr"], parsed["body"])
|
||||||
|
if cards is not None:
|
||||||
|
for card in cards:
|
||||||
|
if card["url"] in existing_urls:
|
||||||
|
continue
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": card["title"],
|
||||||
|
"company": card["company"],
|
||||||
|
"url": card["url"],
|
||||||
|
"source": card["source"],
|
||||||
|
"location": card["location"],
|
||||||
|
"is_remote": 0,
|
||||||
|
"salary": "",
|
||||||
|
"description": "",
|
||||||
|
"date_found": datetime.now().isoformat()[:10],
|
||||||
|
})
|
||||||
|
if job_id:
|
||||||
|
submit_task(db_path, "scrape_url", job_id)
|
||||||
|
existing_urls.add(card["url"])
|
||||||
|
new_leads += 1
|
||||||
|
print(f"[imap] digest ({card['source']}) → {card['company']} — {card['title']}")
|
||||||
|
known_message_ids.add(mid)
|
||||||
|
continue
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Remove the now-unused `parse_linkedin_alert` import/definition**
|
||||||
|
|
||||||
|
`parse_linkedin_alert` was defined in `imap_sync.py`. It's now `parse_linkedin` in
|
||||||
|
`digest_parsers.py`. Delete the old function from `imap_sync.py`. Also remove
|
||||||
|
`_LINKEDIN_ALERT_SENDER` constant if it's no longer referenced.
|
||||||
|
|
||||||
|
**Step 5: Run the full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||||
|
```
|
||||||
|
Expected: all existing tests still pass; no regressions
|
||||||
|
|
||||||
|
**Step 6: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/imap_sync.py
|
||||||
|
git commit -m "refactor: imap_sync uses digest_parsers dispatcher; remove inline LinkedIn parser"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 6: Avocet digest bucket
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `avocet/app/label_tool.py`
|
||||||
|
- Modify: `avocet/app/api.py`
|
||||||
|
- Create: `avocet/tests/test_digest_bucket.py`
|
||||||
|
- Create: `avocet/data/digest_samples.jsonl.example`
|
||||||
|
|
||||||
|
**Context:**
|
||||||
|
When either label path (`_do_label` in the Streamlit UI or `POST /api/label` in the FastAPI
|
||||||
|
app) assigns the `digest` label, the full email record is appended to
|
||||||
|
`data/digest_samples.jsonl`. This is the sample corpus for building future parsers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 1: Write failing tests**
|
||||||
|
|
||||||
|
Create `avocet/tests/test_digest_bucket.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for digest sample bucket write behavior."""
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _read_bucket(tmp_path: Path) -> list[dict]:
|
||||||
|
bucket = tmp_path / "data" / "digest_samples.jsonl"
|
||||||
|
if not bucket.exists():
|
||||||
|
return []
|
||||||
|
return [json.loads(line) for line in bucket.read_text().splitlines() if line.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_ENTRY = {
|
||||||
|
"subject": "10 new jobs for you",
|
||||||
|
"body": "Software Engineer\nAcme Corp\nRemote\nView job: https://example.com/123",
|
||||||
|
"from_addr": "noreply@adzuna.com",
|
||||||
|
"date": "Mon, 03 Mar 2026 09:00:00 +0000",
|
||||||
|
"account": "test@example.com",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── api.py bucket tests ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_api_digest_label_writes_to_bucket(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["from_addr"] == "noreply@adzuna.com"
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_non_digest_label_does_not_write(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
# _append_digest_sample should only be called for digest; confirm it writes when called
|
||||||
|
# Confirm that callers gate on label == "digest" — tested via integration below
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 1 # called directly, always writes
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_digest_creates_data_dir(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "nonexistent" / "data"
|
||||||
|
assert not data_dir.exists()
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
assert data_dir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_digest_appends_multiple(tmp_path):
|
||||||
|
from app.api import _append_digest_sample
|
||||||
|
data_dir = tmp_path / "data"
|
||||||
|
_append_digest_sample(SAMPLE_ENTRY, data_dir=data_dir)
|
||||||
|
_append_digest_sample({**SAMPLE_ENTRY, "subject": "5 more jobs"}, data_dir=data_dir)
|
||||||
|
rows = _read_bucket(tmp_path)
|
||||||
|
assert len(rows) == 2
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to verify they fail**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_digest_bucket.py -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name '_append_digest_sample'`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Step 3: Add `_append_digest_sample` to `api.py`**
|
||||||
|
|
||||||
|
In `avocet/app/api.py`, add this helper (near the top, after the imports and `_DATA_DIR`
|
||||||
|
constant):
|
||||||
|
|
||||||
|
```python
|
||||||
|
_DIGEST_SAMPLES_FILE = _DATA_DIR / "digest_samples.jsonl"
|
||||||
|
|
||||||
|
|
||||||
|
def _append_digest_sample(entry: dict, data_dir: Path | None = None) -> None:
|
||||||
|
"""Append a digest-labeled email to the sample corpus."""
|
||||||
|
target_dir = data_dir if data_dir is not None else _DATA_DIR
|
||||||
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
bucket = target_dir / "digest_samples.jsonl"
|
||||||
|
record = {
|
||||||
|
"subject": entry.get("subject", ""),
|
||||||
|
"body": entry.get("body", ""),
|
||||||
|
"from_addr": entry.get("from_addr", entry.get("from", "")),
|
||||||
|
"date": entry.get("date", ""),
|
||||||
|
"account": entry.get("account", entry.get("source", "")),
|
||||||
|
}
|
||||||
|
with bucket.open("a", encoding="utf-8") as f:
|
||||||
|
f.write(json.dumps(record) + "\n")
|
||||||
|
```
|
||||||
|
|
||||||
|
Then in `post_label()` (around line 127, after `_append_jsonl(_score_file(), record)`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if req.label == "digest":
|
||||||
|
_append_digest_sample(match)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Add the same write to `label_tool.py`**
|
||||||
|
|
||||||
|
In `avocet/app/label_tool.py`, add a module-level constant after `_SCORE_FILE`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_DIGEST_SAMPLES_FILE = _ROOT / "data" / "digest_samples.jsonl"
|
||||||
|
```
|
||||||
|
|
||||||
|
In `_do_label()` (around line 728, after `_append_jsonl(_SCORE_FILE, row)`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
if label == "digest":
|
||||||
|
_append_jsonl(
|
||||||
|
_DIGEST_SAMPLES_FILE,
|
||||||
|
{
|
||||||
|
"subject": entry.get("subject", ""),
|
||||||
|
"body": (entry.get("body", ""))[:2000],
|
||||||
|
"from_addr": entry.get("from_addr", ""),
|
||||||
|
"date": entry.get("date", ""),
|
||||||
|
"account": entry.get("account", ""),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
(`_append_jsonl` already exists in label_tool.py at line ~396 — reuse it.)
|
||||||
|
|
||||||
|
**Step 5: Create the example file**
|
||||||
|
|
||||||
|
Create `avocet/data/digest_samples.jsonl.example`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"subject": "10 new Software Engineer jobs for you", "body": "Software Engineer\nAcme Corp\nSan Francisco, CA\n\nView job: https://www.linkedin.com/jobs/view/1234567890/\n", "from_addr": "LinkedIn <jobalerts@linkedin.com>", "date": "Mon, 03 Mar 2026 09:00:00 +0000", "account": "example@gmail.com"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 6: Update `.gitignore` in avocet**
|
||||||
|
|
||||||
|
Verify `data/digest_samples.jsonl` is gitignored. Open `avocet/.gitignore` — it should
|
||||||
|
already have `data/*.jsonl`. If not, add:
|
||||||
|
|
||||||
|
```
|
||||||
|
data/digest_samples.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 7: Run all avocet tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS
|
||||||
|
|
||||||
|
**Step 8: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Library/Development/CircuitForge/avocet
|
||||||
|
git add app/api.py app/label_tool.py tests/test_digest_bucket.py data/digest_samples.jsonl.example
|
||||||
|
git commit -m "feat: digest sample bucket — write digest-labeled emails to digest_samples.jsonl"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
| Task | Repo | Commit message |
|
||||||
|
|------|------|----------------|
|
||||||
|
| 1 | peregrine | `feat: digest parser registry + LinkedIn parser (moved from imap_sync)` |
|
||||||
|
| 2 | avocet | `feat: fetch_digest_samples script for building new parsers` |
|
||||||
|
| 3 | peregrine | `feat: Adzuna digest email parser` |
|
||||||
|
| 4 | peregrine | `feat: The Ladders digest email parser` |
|
||||||
|
| 5 | peregrine | `refactor: imap_sync uses digest_parsers dispatcher; remove inline LinkedIn parser` |
|
||||||
|
| 6 | avocet | `feat: digest sample bucket — write digest-labeled emails to digest_samples.jsonl` |
|
||||||
|
|
||||||
|
Tasks 1, 2, and 6 are independent and can be done in any order.
|
||||||
|
Tasks 3 and 4 depend on Task 2 (samples needed before implementing parsers).
|
||||||
|
Task 5 depends on Tasks 1, 3, and 4 (all parsers should be ready before switching imap_sync).
|
||||||
277
scripts/backup.py
Normal file
277
scripts/backup.py
Normal file
|
|
@ -0,0 +1,277 @@
|
||||||
|
"""Config backup / restore / teleport for Peregrine.
|
||||||
|
|
||||||
|
Creates a portable zip of all gitignored configs + optionally the staging DB.
|
||||||
|
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
|
||||||
|
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
|
||||||
|
|
||||||
|
Usage (CLI):
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db
|
||||||
|
conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
|
||||||
|
conda run -n job-seeker python scripts/backup.py --restore backup.zip
|
||||||
|
conda run -n job-seeker python scripts/backup.py --list backup.zip
|
||||||
|
|
||||||
|
Usage (programmatic — called from Settings UI):
|
||||||
|
from scripts.backup import create_backup, restore_backup, list_backup_contents
|
||||||
|
zip_bytes = create_backup(base_dir, include_db=True)
|
||||||
|
info = list_backup_contents(zip_bytes)
|
||||||
|
result = restore_backup(zip_bytes, base_dir, include_db=True)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Files included in every backup (relative to repo root)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Gitignored config files that hold secrets / personal data
|
||||||
|
_SECRET_CONFIGS: list[str] = [
    "config/notion.yaml",             # Notion API token
    "config/tokens.yaml",
    "config/email.yaml",              # mail account credentials
    "config/adzuna.yaml",
    "config/craigslist.yaml",
    "config/user.yaml",
    "config/plain_text_resume.yaml",  # personal resume content
    "config/license.json",
    "config/user.yaml.working",       # presumably an in-progress copy of user.yaml — confirm
]

# Gitignored integration configs (glob pattern — each matching file is added)
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"

# Non-secret committed configs worth preserving for portability
# (also present in the legacy /devl/job-seeker instance)
_EXTRA_CONFIGS: list[str] = [
    "config/llm.yaml",
    "config/search_profiles.yaml",
    "config/resume_keywords.yaml",  # personal keyword list — present in both instances
    "config/skills_suggestions.yaml",
    "config/blocklist.yaml",
    "config/server.yaml",  # deployment config (base URL path, port) — Peregrine only
]

# Candidate DB paths relative to base_dir (first one that exists wins):
# data/staging.db is the Peregrine layout, root-level staging.db the legacy one.
_DB_CANDIDATES: list[str] = ["data/staging.db", "staging.db"]

# Archive member name of the JSON manifest written by create_backup().
_MANIFEST_NAME = "backup-manifest.json"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Source detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _detect_source_label(base_dir: Path) -> str:
|
||||||
|
"""Return a human-readable label for the instance being backed up.
|
||||||
|
|
||||||
|
Uses the directory name — stable as long as the repo root isn't renamed,
|
||||||
|
which is the normal case for both the Docker install (peregrine/) and the
|
||||||
|
legacy Conda install (job-seeker/).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_dir: The root directory being backed up.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A short identifier string, e.g. "peregrine" or "job-seeker".
|
||||||
|
"""
|
||||||
|
return base_dir.name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
) -> bytes:
    """Build a backup zip in memory and return its raw bytes.

    Collects the gitignored secret configs, any integration configs, the
    non-secret extra configs, and (optionally) the staging DB, then appends
    a JSON manifest describing what was captured.

    Args:
        base_dir: Repo root (parent of config/ and staging.db).
        include_db: If True, include staging.db in the archive.
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected if None.

    Returns:
        The zip archive as bytes.
    """
    archive = io.BytesIO()
    captured: list[str] = []

    with zipfile.ZipFile(archive, "w", compression=zipfile.ZIP_DEFLATED) as zf:

        def _add(path: Path, arcname: str) -> None:
            # Record each archived member so the manifest can list it.
            zf.write(path, arcname)
            captured.append(arcname)

        # Gitignored secret configs — missing files are silently skipped.
        for rel in _SECRET_CONFIGS:
            candidate = base_dir / rel
            if candidate.exists():
                _add(candidate, rel)

        # Integration configs (glob) — sorted for a deterministic archive order.
        for match in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            _add(match, str(match.relative_to(base_dir)))

        # Extra non-secret configs.
        for rel in _EXTRA_CONFIGS:
            candidate = base_dir / rel
            if candidate.exists():
                _add(candidate, rel)

        # Staging DB — first existing candidate path wins.
        if include_db:
            for rel in _DB_CANDIDATES:
                candidate = base_dir / rel
                if candidate.exists():
                    _add(candidate, rel)
                    break

        # Manifest goes in last so it can describe every captured member.
        manifest = {
            "created_at": datetime.now().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": captured,
            "includes_db": include_db and any(f.endswith(".db") for f in captured),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))

    return archive.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Describe a backup zip without extracting it.

    Args:
        zip_bytes: Raw bytes of a backup zip.

    Returns:
        {"manifest": dict, "files": [names], "sizes": {name: bytes},
         "total_bytes": int} — the manifest entry itself is excluded from
        "files"; an archive with no manifest yields an empty manifest dict.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        entries = zf.infolist()
        sizes = {entry.filename: entry.file_size for entry in entries}
        files = [entry.filename for entry in entries if entry.filename != _MANIFEST_NAME]
        try:
            manifest: dict = json.loads(zf.read(_MANIFEST_NAME))
        except KeyError:
            # Archive predates (or omits) the manifest — report it as empty.
            manifest = {}
    return {
        "manifest": manifest,
        "files": files,
        "sizes": sizes,
        "total_bytes": sum(sizes[name] for name in files if name in sizes),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.

    Returns:
        {"restored": [...], "skipped": [...]} — "skipped" also lists any
        archive entries whose names would escape base_dir (see below).
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            if name == _MANIFEST_NAME:
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip guard: a hostile archive can carry names with ".."
            # components or absolute paths that would write outside base_dir.
            # Refuse any entry whose resolved destination escapes the root.
            try:
                dest.resolve().relative_to(root)
            except ValueError:
                skipped.append(name)
                continue
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(zf.read(name))
            restored.append(name)

    return {"restored": restored, "skipped": skipped}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: create, restore, or list a Peregrine backup zip.

    Exactly one of --create / --restore / --list is required; --no-db,
    --no-overwrite and --base-dir modify the chosen action. Exits with
    status 1 when a named input zip does not exist.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()

    # Default base is this repo's root (scripts/ sits one level below it).
    base_dir = Path(args.base_dir).resolve() if args.base_dir else Path(__file__).parent.parent

    if args.create:
        out = Path(args.create)
        data = create_backup(base_dir, include_db=not args.no_db)
        out.write_bytes(data)
        # Re-read the archive we just wrote so the summary reflects reality.
        info = list_backup_contents(data)
        m = info["manifest"]
        print(f"Backup created: {out} ({len(data):,} bytes)")
        print(f"  Source: {m.get('source', '?')} ({base_dir})")
        print(f"  {len(info['files'])} files archived:")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"    {name} ({size:,} bytes)")

    elif args.restore:
        in_path = Path(args.restore)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        result = restore_backup(data, base_dir,
                                include_db=not args.no_db,
                                overwrite=not args.no_overwrite)
        print(f"Restored {len(result['restored'])} files:")
        for name in result["restored"]:
            print(f"  ✓ {name}")
        if result["skipped"]:
            print(f"Skipped {len(result['skipped'])} files:")
            for name in result["skipped"]:
                print(f"  - {name}")

    elif args.list:
        in_path = Path(args.list)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        info = list_backup_contents(data)
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source:  {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB:  {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"  {name} ({size:,} bytes)")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -3,12 +3,13 @@ SQLite staging layer for job listings.
|
||||||
Jobs flow: pending → approved/rejected → applied → synced
|
Jobs flow: pending → approved/rejected → applied → synced
|
||||||
applied → phone_screen → interviewing → offer → hired (or rejected)
|
applied → phone_screen → interviewing → offer → hired (or rejected)
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"
|
DEFAULT_DB = Path(os.environ.get("STAGING_DB", Path(__file__).parent.parent / "staging.db"))
|
||||||
|
|
||||||
CREATE_JOBS = """
|
CREATE_JOBS = """
|
||||||
CREATE TABLE IF NOT EXISTS jobs (
|
CREATE TABLE IF NOT EXISTS jobs (
|
||||||
|
|
|
||||||
160
scripts/suggest_helpers.py
Normal file
160
scripts/suggest_helpers.py
Normal file
|
|
@ -0,0 +1,160 @@
|
||||||
|
"""
|
||||||
|
LLM-powered suggestion helpers for Settings UI.
|
||||||
|
Two functions, each makes one LLMRouter call:
|
||||||
|
- suggest_search_terms: enhanced title + three-angle exclude suggestions
|
||||||
|
- suggest_resume_keywords: skills/domains/keywords gap analysis
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
|
||||||
|
|
||||||
|
def _load_resume_context(resume_path: Path) -> str:
    """Summarise the 3 most recent positions from plain_text_resume.yaml.

    Returns one "- <position> at <company>: <up to 5 skills>" line per role,
    or "" when the resume file is missing.
    """
    import yaml

    if not resume_path.exists():
        return ""
    parsed = yaml.safe_load(resume_path.read_text()) or {}
    summary: list[str] = []
    for job in (parsed.get("experience_details") or [])[:3]:
        top_skills = ", ".join((job.get("skills_acquired") or [])[:5])
        summary.append(f"- {job.get('position', '')} at {job.get('company', '')}: {top_skills}")
    return "\n".join(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json(text: str) -> dict[str, Any]:
|
||||||
|
"""Extract the first JSON object from LLM output. Returns {} on failure."""
|
||||||
|
m = re.search(r"\{.*\}", text, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
return json.loads(m.group())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def suggest_search_terms(
    current_titles: list[str],
    resume_path: Path,
    blocklist: dict[str, Any],
    user_profile: dict[str, Any],
) -> dict:
    """
    Suggest additional job titles and exclude keywords via one LLM call.

    Three-angle exclude analysis:
      A: Blocklist alias expansion (blocked companies/industries → keyword variants)
      B: Values misalignment (mission preferences → industries/culture to avoid)
      C: Role-type filter (career summary → role types that don't fit)

    Args:
        current_titles: Titles already in the search config.
        resume_path: Path to plain_text_resume.yaml (may not exist).
        blocklist: Dict with "companies"/"industries" lists (may be empty).
        user_profile: Dict read for "nda_companies", "career_summary",
            and "mission_preferences".

    Returns: {"suggested_titles": [...], "suggested_excludes": [...]}
        Both lists are empty when the LLM output is not parseable JSON.
    """
    resume_context = _load_resume_context(resume_path)
    titles_str = "\n".join(f"- {t}" for t in current_titles) or "(none yet)"

    # Fall back to readable placeholders so the prompt never shows blank fields.
    bl_companies = ", ".join(blocklist.get("companies", [])) or "none"
    bl_industries = ", ".join(blocklist.get("industries", [])) or "none"
    nda = ", ".join(user_profile.get("nda_companies", [])) or "none"
    career_summary = user_profile.get("career_summary", "") or "Not provided"
    mission_raw = user_profile.get("mission_preferences", {}) or {}
    # Three exclude angles are intentionally collapsed into one flat suggested_excludes list
    mission_str = "\n".join(
        f" - {k}: {v}" for k, v in mission_raw.items() if v and isinstance(v, str) and v.strip()
    ) or " (none specified)"

    prompt = f"""You are helping a job seeker optimise their search configuration.

--- RESUME BACKGROUND ---
{resume_context or "Not provided"}

--- CAREER SUMMARY ---
{career_summary}

--- CURRENT TITLES BEING SEARCHED ---
{titles_str}

--- BLOCKED ENTITIES ---
Companies blocked: {bl_companies}
Industries blocked: {bl_industries}
NDA / confidential employers: {nda}

--- MISSION & VALUES ---
{mission_str}

Provide all four of the following:

1. TITLE SUGGESTIONS
5-8 additional job titles they may be missing: alternative names, adjacent roles, or senior variants of their current titles.

2. EXCLUDE KEYWORDS — BLOCKLIST ALIASES
The user has blocked the companies/industries above. Suggest keyword variants that would also catch their aliases, subsidiaries, or related brands.
Example: blocking "Meta" → also exclude "facebook", "instagram", "metaverse", "oculus".

3. EXCLUDE KEYWORDS — VALUES MISALIGNMENT
Based on the user's mission and values above, suggest industry or culture keywords to exclude.
Examples: "tobacco", "gambling", "fossil fuel", "defense contractor", "MLM", "commission-only", "pyramid".

4. EXCLUDE KEYWORDS — ROLE TYPE FILTER
Based on the user's career background, suggest role-type terms that don't match their trajectory.
Examples for a CS/TAM leader: "cold calling", "door to door", "quota-driven", "SDR", "sales development rep".

Return ONLY valid JSON in exactly this format (no extra text):
{{"suggested_titles": ["Title 1", "Title 2"],
"suggested_excludes": ["keyword 1", "keyword 2", "keyword 3"]}}"""

    raw = LLMRouter().complete(prompt).strip()
    # _parse_json returns {} on malformed output, so .get defaults apply.
    parsed = _parse_json(raw)
    return {
        "suggested_titles": parsed.get("suggested_titles", []),
        "suggested_excludes": parsed.get("suggested_excludes", []),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def suggest_resume_keywords(
    resume_path: Path,
    current_kw: dict[str, list[str]],
) -> dict:
    """
    Suggest skills, domains, and keywords not already in resume_keywords.yaml.

    Makes one LLM call; the already-selected tags are embedded in the prompt
    so the model is told not to repeat them.

    Args:
        resume_path: Path to plain_text_resume.yaml (may not exist).
        current_kw: Existing selections, read for "skills", "domains",
            and "keywords" lists.

    Returns: {"skills": [...], "domains": [...], "keywords": [...]}
        All lists are empty when the LLM output is not parseable JSON.
    """
    resume_context = _load_resume_context(resume_path)

    # Placeholder "none" keeps the prompt readable when a category is empty.
    already_skills = ", ".join(current_kw.get("skills", [])) or "none"
    already_domains = ", ".join(current_kw.get("domains", [])) or "none"
    already_keywords = ", ".join(current_kw.get("keywords", [])) or "none"

    prompt = f"""You are helping a job seeker build a keyword profile used to score job description matches.

--- RESUME BACKGROUND ---
{resume_context or "Not provided"}

--- ALREADY SELECTED (do not repeat these) ---
Skills: {already_skills}
Domains: {already_domains}
Keywords: {already_keywords}

Suggest additional tags in each of the three categories below. Only suggest tags NOT already in the lists above.

SKILLS — specific technical or soft skills (e.g. "Salesforce", "Executive Communication", "SQL", "Stakeholder Management")
DOMAINS — industry verticals, company types, or functional areas (e.g. "B2B SaaS", "EdTech", "Non-profit", "Series A-C")
KEYWORDS — specific terms, methodologies, metrics, or JD phrases (e.g. "NPS", "churn prevention", "QBR", "cross-functional")

Return ONLY valid JSON in exactly this format (no extra text):
{{"skills": ["Skill A", "Skill B"],
"domains": ["Domain A"],
"keywords": ["Keyword A", "Keyword B"]}}"""

    raw = LLMRouter().complete(prompt).strip()
    # _parse_json returns {} on malformed output, so .get defaults apply.
    parsed = _parse_json(raw)
    return {
        "skills": parsed.get("skills", []),
        "domains": parsed.get("domains", []),
        "keywords": parsed.get("keywords", []),
    }
|
||||||
231
tests/test_backup.py
Normal file
231
tests/test_backup.py
Normal file
|
|
@ -0,0 +1,231 @@
|
||||||
|
"""Tests for scripts/backup.py — create, list, restore, and multi-instance support."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scripts.backup import (
|
||||||
|
_detect_source_label,
|
||||||
|
create_backup,
|
||||||
|
list_backup_contents,
|
||||||
|
restore_backup,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_instance(tmp_path: Path, name: str, *, root_db: bool = False) -> Path:
|
||||||
|
"""Build a minimal fake instance directory for testing."""
|
||||||
|
base = tmp_path / name
|
||||||
|
base.mkdir()
|
||||||
|
|
||||||
|
# Secret configs
|
||||||
|
(base / "config").mkdir()
|
||||||
|
(base / "config" / "notion.yaml").write_text("token: secret")
|
||||||
|
(base / "config" / "email.yaml").write_text("user: test@example.com")
|
||||||
|
|
||||||
|
# Extra config
|
||||||
|
(base / "config" / "llm.yaml").write_text("backend: ollama")
|
||||||
|
(base / "config" / "resume_keywords.yaml").write_text("keywords: [python]")
|
||||||
|
(base / "config" / "server.yaml").write_text("port: 8502")
|
||||||
|
|
||||||
|
# DB — either at data/staging.db (Peregrine) or staging.db root (legacy)
|
||||||
|
if root_db:
|
||||||
|
(base / "staging.db").write_bytes(b"SQLite legacy")
|
||||||
|
else:
|
||||||
|
(base / "data").mkdir()
|
||||||
|
(base / "data" / "staging.db").write_bytes(b"SQLite peregrine")
|
||||||
|
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# create_backup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCreateBackup:
    """create_backup: archive validity, config selection, DB flag, manifest."""

    def test_returns_valid_zip(self, tmp_path):
        import io

        instance = _make_instance(tmp_path, "peregrine")
        blob = create_backup(instance)
        assert zipfile.is_zipfile(io.BytesIO(blob))

    def test_includes_secret_configs(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/notion.yaml" in contents["files"]
        assert "config/email.yaml" in contents["files"]

    def test_includes_extra_configs(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/llm.yaml" in contents["files"]
        assert "config/resume_keywords.yaml" in contents["files"]
        assert "config/server.yaml" in contents["files"]

    def test_includes_db_by_default(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["includes_db"] is True
        assert any(name.endswith(".db") for name in contents["files"])

    def test_excludes_db_when_flag_false(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance, include_db=False))
        assert contents["manifest"]["includes_db"] is False
        assert not any(name.endswith(".db") for name in contents["files"])

    def test_silently_skips_missing_files(self, tmp_path):
        # The fixture never writes tokens.yaml — its absence must not raise.
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert "config/tokens.yaml" not in contents["files"]

    def test_manifest_contains_source_label(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["source"] == "peregrine"

    def test_source_label_override(self, tmp_path):
        instance = _make_instance(tmp_path, "peregrine")
        contents = list_backup_contents(create_backup(instance, source_label="custom-label"))
        assert contents["manifest"]["source"] == "custom-label"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Legacy instance (staging.db at repo root)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestLegacyInstance:
    """Legacy /devl/job-seeker layout: staging.db at repo root, fewer configs."""

    def test_picks_up_root_db(self, tmp_path):
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        contents = list_backup_contents(create_backup(instance))
        assert "staging.db" in contents["files"]
        assert "data/staging.db" not in contents["files"]

    def test_source_label_is_job_seeker(self, tmp_path):
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        contents = list_backup_contents(create_backup(instance))
        assert contents["manifest"]["source"] == "job-seeker"

    def test_missing_peregrine_only_configs_skipped(self, tmp_path):
        """Legacy doesn't have server.yaml, user.yaml, etc. — should not error."""
        instance = _make_instance(tmp_path, "job-seeker", root_db=True)
        # Remove server.yaml to simulate the legacy layout (it won't exist there).
        (instance / "config" / "server.yaml").unlink()
        contents = list_backup_contents(create_backup(instance))
        assert "config/server.yaml" not in contents["files"]
        assert "config/notion.yaml" in contents["files"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# list_backup_contents
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestListBackupContents:
    """list_backup_contents: return shape, size accounting, manifest exclusion."""

    def test_returns_manifest_and_files(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        for key in ("manifest", "files", "sizes", "total_bytes"):
            assert key in contents

    def test_total_bytes_is_sum_of_file_sizes(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        expected = sum(contents["sizes"][f] for f in contents["files"] if f in contents["sizes"])
        assert contents["total_bytes"] == expected

    def test_manifest_not_in_files_list(self, tmp_path):
        contents = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        assert "backup-manifest.json" not in contents["files"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# restore_backup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRestoreBackup:
    """restore_backup(): full restore, the include_db flag, and overwrite rules."""

    def test_restores_all_files(self, tmp_path):
        """A plain restore writes at least one file, including config files."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target)
        assert outcome["restored"]
        assert (target / "config" / "notion.yaml").exists()

    def test_skips_db_when_flag_false(self, tmp_path):
        """include_db=False routes every .db file to 'skipped', none to 'restored'."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target, include_db=False)
        restored_dbs = [name for name in outcome["restored"] if name.endswith(".db")]
        skipped_dbs = [name for name in outcome["skipped"] if name.endswith(".db")]
        assert not restored_dbs
        assert skipped_dbs

    def test_no_overwrite_skips_existing(self, tmp_path):
        """overwrite=False leaves pre-existing files untouched and reports them."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        pre_existing = target / "config" / "notion.yaml"
        pre_existing.write_text("original content")
        outcome = restore_backup(create_backup(source), target, overwrite=False)
        assert "config/notion.yaml" in outcome["skipped"]
        assert pre_existing.read_text() == "original content"

    def test_overwrite_replaces_existing(self, tmp_path):
        """overwrite=True replaces a stale file with the backed-up content."""
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        (target / "config" / "notion.yaml").write_text("stale content")
        restore_backup(create_backup(source), target, overwrite=True)
        assert (target / "config" / "notion.yaml").read_text() == "token: secret"

    def test_roundtrip_preserves_content(self, tmp_path):
        """Backup then restore reproduces file content byte-for-byte."""
        source = _make_instance(tmp_path, "peregrine")
        original_text = (source / "config" / "notion.yaml").read_text()
        target = tmp_path / "restored"
        target.mkdir()
        restore_backup(create_backup(source), target)
        assert (target / "config" / "notion.yaml").read_text() == original_text
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _detect_source_label
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDetectSourceLabel:
    """_detect_source_label() reports the instance directory's own name."""

    def test_returns_directory_name(self, tmp_path):
        instance_dir = tmp_path / "peregrine"
        instance_dir.mkdir()
        assert _detect_source_label(instance_dir) == "peregrine"

    def test_legacy_label(self, tmp_path):
        # A legacy "job-seeker" directory is labeled by its name as well.
        legacy_dir = tmp_path / "job-seeker"
        legacy_dir.mkdir()
        assert _detect_source_label(legacy_dir) == "job-seeker"
148
tests/test_suggest_helpers.py
Normal file
148
tests/test_suggest_helpers.py
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
"""Tests for scripts/suggest_helpers.py."""
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
# Resume fixture shipped in the repo's config/ directory; passed as the
# resume_path argument throughout these tests.
RESUME_PATH = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
# ── _parse_json ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_parse_json_extracts_valid_object():
    """_parse_json pulls the embedded JSON object out of surrounding chatter."""
    from scripts.suggest_helpers import _parse_json

    noisy_reply = 'Here is the result: {"a": [1, 2], "b": "hello"} done.'
    parsed = _parse_json(noisy_reply)
    assert parsed == {"a": [1, 2], "b": "hello"}
||||||
|
|
||||||
|
def test_parse_json_returns_empty_on_invalid():
    """Unparseable input degrades to an empty dict instead of raising."""
    from scripts.suggest_helpers import _parse_json

    for bad_input in ("no json here", '{"broken": '):
        assert _parse_json(bad_input) == {}
||||||
|
|
||||||
|
# ── suggest_search_terms ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Shared fixtures passed to suggest_search_terms() in the tests below.
# BLOCKLIST companies ("Meta", "Amazon") are asserted to appear in the prompt.
BLOCKLIST = {
    "companies": ["Meta", "Amazon"],
    "industries": ["gambling"],
    "locations": [],
}
# USER_PROFILE: career_summary and the non-empty mission preference are
# asserted to appear in the prompt sent to the LLM.
USER_PROFILE = {
    "career_summary": "Customer success leader with 10 years in B2B SaaS.",
    "mission_preferences": {
        "animal_welfare": "I volunteer at my local shelter.",
        "education": "",  # intentionally empty entry
    },
    "nda_companies": ["Acme Corp"],
}
|
|
||||||
|
|
||||||
|
def _mock_llm(response_dict: dict):
    """Build a patcher so that, while active, LLMRouter().complete() returns
    *response_dict* serialized as a JSON string."""
    canned_reply = json.dumps(response_dict)
    fake_router = MagicMock()
    fake_router.complete.return_value = canned_reply
    return patch("scripts.suggest_helpers.LLMRouter", return_value=fake_router)
||||||
|
|
||||||
|
def test_suggest_search_terms_returns_titles_and_excludes():
    """The LLM's suggested titles/excludes pass through to the caller."""
    from scripts.suggest_helpers import suggest_search_terms

    canned = {
        "suggested_titles": ["VP Customer Success"],
        "suggested_excludes": ["cold calling"],
    }
    with _mock_llm(canned):
        out = suggest_search_terms(
            ["Customer Success Manager"], RESUME_PATH, BLOCKLIST, USER_PROFILE
        )
    assert out["suggested_titles"] == ["VP Customer Success"]
    assert out["suggested_excludes"] == ["cold calling"]
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_blocklist_companies():
    """Blocked company names must be present in the prompt sent to the LLM."""
    from scripts.suggest_helpers import suggest_search_terms

    empty_reply = {"suggested_titles": [], "suggested_excludes": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    # First positional arg of complete() is the prompt string.
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    for company in ("Meta", "Amazon"):
        assert company in sent_prompt
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_mission():
    """A filled-in mission preference should appear in the LLM prompt.

    Fix: the original checked the underscored form ("animal_welfare")
    case-sensitively while checking the spaced form case-insensitively.
    Both alternatives are now matched against the lowercased prompt, so a
    prompt containing e.g. "Animal_Welfare" no longer fails spuriously.
    """
    from scripts.suggest_helpers import suggest_search_terms

    with _mock_llm({"suggested_titles": [], "suggested_excludes": []}) as mock_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    # First positional arg of complete() is the prompt string.
    prompt_lower = mock_cls.return_value.complete.call_args[0][0].lower()
    assert "animal_welfare" in prompt_lower or "animal welfare" in prompt_lower
||||||
|
|
||||||
|
def test_suggest_search_terms_prompt_contains_career_summary():
    """The user's career summary text appears in the prompt sent to the LLM."""
    from scripts.suggest_helpers import suggest_search_terms

    empty_reply = {"suggested_titles": [], "suggested_excludes": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    assert "Customer success leader" in sent_prompt
||||||
|
|
||||||
|
def test_suggest_search_terms_returns_empty_on_bad_json():
    """A refusal / non-JSON reply maps to empty suggestion lists."""
    from scripts.suggest_helpers import suggest_search_terms

    refusing_router = MagicMock()
    refusing_router.complete.return_value = "sorry, I cannot help with that"
    with patch("scripts.suggest_helpers.LLMRouter", return_value=refusing_router):
        out = suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
    assert out == {"suggested_titles": [], "suggested_excludes": []}
||||||
|
|
||||||
|
def test_suggest_search_terms_raises_on_llm_exhausted():
    """Backend-exhaustion errors from the router propagate to the caller."""
    from scripts.suggest_helpers import suggest_search_terms

    failing_router = MagicMock()
    failing_router.complete.side_effect = RuntimeError("All LLM backends exhausted")
    with patch("scripts.suggest_helpers.LLMRouter", return_value=failing_router), \
            pytest.raises(RuntimeError, match="All LLM backends exhausted"):
        suggest_search_terms(["CSM"], RESUME_PATH, BLOCKLIST, USER_PROFILE)
||||||
|
|
||||||
|
# ── suggest_resume_keywords ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Keywords the user has already selected; the tests below assert these are
# included in the prompt so the LLM can avoid re-suggesting them.
CURRENT_KW = {
    "skills": ["Customer Success", "SQL"],
    "domains": ["B2B SaaS"],
    "keywords": ["NPS"],
}
||||||
|
|
||||||
|
def test_suggest_resume_keywords_returns_all_three_categories():
    """The result always carries the skills/domains/keywords buckets."""
    from scripts.suggest_helpers import suggest_resume_keywords

    canned = {
        "skills": ["Project Management"],
        "domains": ["EdTech"],
        "keywords": ["churn prevention"],
    }
    with _mock_llm(canned):
        out = suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    for bucket in ("skills", "domains", "keywords"):
        assert bucket in out
||||||
|
|
||||||
|
def test_suggest_resume_keywords_excludes_already_selected():
    """Already-selected tags are surfaced in the prompt so the LLM skips them."""
    from scripts.suggest_helpers import suggest_resume_keywords

    empty_reply = {"skills": [], "domains": [], "keywords": []}
    with _mock_llm(empty_reply) as patched_cls:
        suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    sent_prompt = patched_cls.return_value.complete.call_args[0][0]
    assert "Customer Success" in sent_prompt
    assert "NPS" in sent_prompt
||||||
|
|
||||||
|
def test_suggest_resume_keywords_returns_empty_on_bad_json():
    """A refusal / non-JSON reply maps to empty keyword categories."""
    from scripts.suggest_helpers import suggest_resume_keywords

    refusing_router = MagicMock()
    refusing_router.complete.return_value = "I cannot assist."
    with patch("scripts.suggest_helpers.LLMRouter", return_value=refusing_router):
        out = suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
    assert out == {"skills": [], "domains": [], "keywords": []}
||||||
|
|
||||||
|
def test_suggest_resume_keywords_raises_on_llm_exhausted():
    """Backend-exhaustion errors from the router propagate to the caller."""
    from scripts.suggest_helpers import suggest_resume_keywords

    failing_router = MagicMock()
    failing_router.complete.side_effect = RuntimeError("All LLM backends exhausted")
    with patch("scripts.suggest_helpers.LLMRouter", return_value=failing_router), \
            pytest.raises(RuntimeError, match="All LLM backends exhausted"):
        suggest_resume_keywords(RESUME_PATH, CURRENT_KW)
|
||||||
Loading…
Reference in a new issue