From db127848a1f142ac818d40670eccc9c358225414 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 26 Feb 2026 12:32:28 -0800 Subject: [PATCH] fix: resume CID glyphs, resume YAML path, PyJWT dep, candidate voice & mission UI - resume_parser: add _clean_cid() to strip (cid:NNN) glyph refs from ATS PDFs; CIDs 127/149/183 become bullets, unknowns are stripped; applied to PDF/DOCX/ODT - resume YAML: canonicalize plain_text_resume.yaml path to config/ across all references (Settings, Apply, Setup, company_research, migrate); was pointing at unmounted aihawk/data_folder/ in Docker - requirements/environment: add PyJWT>=2.8 (was missing; broke Settings page) - user_profile: add candidate_voice field - generate_cover_letter: inject candidate_voice into SYSTEM_CONTEXT; add social_impact mission signal category (nonprofit, community, equity, etc.) - Settings: add Voice & Personality textarea to Identity expander; add Mission & Values expander with editable fields for all 4 mission categories - .gitignore: exclude CLAUDE.md, config/plain_text_resume.yaml, config/user.yaml.working - search_profiles: add default profile --- .gitignore | 5 +++++ app/pages/0_Setup.py | 2 +- app/pages/2_Settings.py | 37 +++++++++++++++++++++++++++++--- app/pages/4_Apply.py | 2 +- config/search_profiles.yaml | 11 ++++++++++ environment.yml | 5 ++++- requirements.txt | 5 ++++- scripts/company_research.py | 2 +- scripts/generate_cover_letter.py | 30 +++++++++++++++++++++----- scripts/migrate.py | 10 +++++---- scripts/resume_parser.py | 18 +++++++++++++--- scripts/user_profile.py | 2 ++ 12 files changed, 109 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 0787951..edf6c8c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ unsloth_compiled_cache/ data/survey_screenshots/* !data/survey_screenshots/.gitkeep config/user.yaml +config/plain_text_resume.yaml config/.backup-* config/integrations/*.yaml !config/integrations/*.yaml.example @@ -30,3 +31,7 @@ scrapers/raw_scrapes/ 
compose.override.yml config/license.json +config/user.yaml.working + +# Claude context files — kept out of version control +CLAUDE.md diff --git a/app/pages/0_Setup.py b/app/pages/0_Setup.py index dce06b2..89670f3 100644 --- a/app/pages/0_Setup.py +++ b/app/pages/0_Setup.py @@ -405,7 +405,7 @@ elif step == 4: if errs: st.error("\n".join(errs)) else: - resume_yaml_path = _ROOT / "aihawk" / "data_folder" / "plain_text_resume.yaml" + resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml" resume_yaml_path.parent.mkdir(parents=True, exist_ok=True) resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience} resume_yaml_path.write_text( diff --git a/app/pages/2_Settings.py b/app/pages/2_Settings.py index 2c5aae7..9922cb8 100644 --- a/app/pages/2_Settings.py +++ b/app/pages/2_Settings.py @@ -24,7 +24,7 @@ SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml" BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml" LLM_CFG = CONFIG_DIR / "llm.yaml" NOTION_CFG = CONFIG_DIR / "notion.yaml" -RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" +RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml" KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml" def load_yaml(path: Path) -> dict: @@ -113,6 +113,36 @@ with tab_profile: u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", "")) u_summary = st.text_area("Career Summary (used in LLM prompts)", _u.get("career_summary", ""), height=100) + u_voice = st.text_area( + "Voice & Personality (shapes cover letter tone)", + _u.get("candidate_voice", ""), + height=80, + help="Personality traits and writing voice that the LLM uses to write authentically in your style. Never disclosed in applications.", + ) + + with st.expander("🎯 Mission & Values"): + st.caption("Industry passions and causes you care about. Used to inject authentic Para 3 alignment when a company matches. 
Never disclosed in applications.") + _mission = dict(_u.get("mission_preferences", {})) + _mission_keys = ["animal_welfare", "education", "music", "social_impact"] + _mission_labels = { + "animal_welfare": "🐾 Animal Welfare", + "education": "📚 Education / EdTech / Kids", + "music": "🎵 Music Industry", + "social_impact": "šŸŒ Social Impact / Nonprofits", + } + _mission_updated = {} + for key in _mission_keys: + _mission_updated[key] = st.text_area( + _mission_labels[key], + _mission.get(key, ""), + height=68, + key=f"mission_{key}", + help=f"Your personal connection to this domain. Leave blank to use the default prompt hint.", + ) + # Preserve any extra keys the user may have added manually in YAML + for k, v in _mission.items(): + if k not in _mission_keys: + _mission_updated[k] = v with st.expander("🔒 Sensitive Employers (NDA)"): st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.") @@ -180,10 +210,11 @@ with tab_profile: new_data = { "name": u_name, "email": u_email, "phone": u_phone, "linkedin": u_linkedin, "career_summary": u_summary, + "candidate_voice": u_voice, "nda_companies": nda_list, "docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm, "inference_profile": u_inf_profile, - "mission_preferences": _u.get("mission_preferences", {}), + "mission_preferences": {k: v for k, v in _mission_updated.items() if v.strip()}, "candidate_accessibility_focus": u_access_focus, "candidate_lgbtq_focus": u_lgbtq_focus, "services": { @@ -673,7 +704,7 @@ with tab_resume: ) if not RESUME_PATH.exists(): - st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?") + st.error(f"Resume YAML not found at `{RESUME_PATH}`. 
Copy or create `config/plain_text_resume.yaml`.") st.stop() _data = yaml.safe_load(RESUME_PATH.read_text()) or {} diff --git a/app/pages/4_Apply.py b/app/pages/4_Apply.py index 2c6bcef..41d98b9 100644 --- a/app/pages/4_Apply.py +++ b/app/pages/4_Apply.py @@ -28,7 +28,7 @@ from scripts.db import ( from scripts.task_runner import submit_task DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" -RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" +RESUME_YAML = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml" st.title("🚀 Apply Workspace") diff --git a/config/search_profiles.yaml b/config/search_profiles.yaml index bada59a..8ab44dc 100644 --- a/config/search_profiles.yaml +++ b/config/search_profiles.yaml @@ -1,4 +1,15 @@ profiles: +- boards: + - linkedin + - indeed + - glassdoor + - zip_recruiter + job_titles: + - Customer Service Specialist + locations: + - San Francisco CA + name: default + remote_only: false - boards: - linkedin - indeed diff --git a/environment.yml b/environment.yml index 8839279..703118f 100644 --- a/environment.yml +++ b/environment.yml @@ -28,7 +28,7 @@ dependencies: - fake-useragent # company scraper rotation # ── LLM / AI backends ───────────────────────────────────────────────────── - - openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers) + - openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai - anthropic>=0.80 # direct Anthropic API fallback - ollama # Python client for Ollama management - langchain>=0.2 @@ -54,6 +54,9 @@ dependencies: - pyyaml>=6.0 - python-dotenv + # ── Auth / licensing ────────────────────────────────────────────────────── + - PyJWT>=2.8 + # ── Utilities ───────────────────────────────────────────────────────────── - sqlalchemy - tqdm diff --git a/requirements.txt b/requirements.txt index e31b83e..1b0b597 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -22,7 +22,7 @@ curl_cffi fake-useragent # ── LLM / AI backends ───────────────────────────────────────────────────── -openai>=1.0 +openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai anthropic>=0.80 ollama langchain>=0.2 @@ -51,6 +51,9 @@ json-repair pyyaml>=6.0 python-dotenv +# ── Auth / licensing ────────────────────────────────────────────────────── +PyJWT>=2.8 + # ── Utilities ───────────────────────────────────────────────────────────── sqlalchemy tqdm diff --git a/scripts/company_research.py b/scripts/company_research.py index bdab12b..32fde8f 100644 --- a/scripts/company_research.py +++ b/scripts/company_research.py @@ -193,7 +193,7 @@ def _parse_sections(text: str) -> dict[str, str]: return sections -_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" +_RESUME_YAML = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml" _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml" diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py index 4f0da15..481c263 100644 --- a/scripts/generate_cover_letter.py +++ b/scripts/generate_cover_letter.py @@ -26,11 +26,19 @@ LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "Jo LETTER_GLOB = "*Cover Letter*.md" # Background injected into every prompt so the model has the candidate's facts -SYSTEM_CONTEXT = ( - f"You are writing cover letters for {_profile.name}. {_profile.career_summary}" - if _profile else - "You are a professional cover letter writer. Write in first person." -) +def _build_system_context() -> str: + if not _profile: + return "You are a professional cover letter writer. Write in first person." + parts = [f"You are writing cover letters for {_profile.name}. 
{_profile.career_summary}"] + if _profile.candidate_voice: + parts.append( + f"Voice and personality: {_profile.candidate_voice} " + "Write in a way that reflects these authentic traits — not as a checklist, " + "but as a natural expression of who this person is." + ) + return " ".join(parts) + +SYSTEM_CONTEXT = _build_system_context() # ── Mission-alignment detection ─────────────────────────────────────────────── @@ -58,6 +66,13 @@ _MISSION_SIGNALS: dict[str, list[str]] = { "instructure", "canvas lms", "clever", "district", "teacher", "k-12", "k12", "grade", "pedagogy", ], + "social_impact": [ + "nonprofit", "non-profit", "501(c)", "social impact", "mission-driven", + "public benefit", "community", "underserved", "equity", "justice", + "humanitarian", "advocacy", "charity", "foundation", "ngo", + "social good", "civic", "public health", "mental health", "food security", + "housing", "homelessness", "poverty", "workforce development", + ], } _candidate = _profile.name if _profile else "the candidate" @@ -79,6 +94,11 @@ _MISSION_DEFAULTS: dict[str, str] = { f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically " "and warmly." ), + "social_impact": ( + f"This organization is mission-driven / social impact focused — exactly the kind of " + f"cause {_candidate} cares deeply about. Para 3 should warmly reflect their genuine " + "desire to apply their skills to work that makes a real difference in people's lives." 
+ ), } diff --git a/scripts/migrate.py b/scripts/migrate.py index d370fb6..67cfad8 100644 --- a/scripts/migrate.py +++ b/scripts/migrate.py @@ -84,9 +84,9 @@ def _extract_career_summary(source: Path) -> str: def _extract_personal_info(source: Path) -> dict: """Extract personal info from aihawk resume yaml.""" - resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" + resume = source / "config" / "plain_text_resume.yaml" if not resume.exists(): - resume = source / "config" / "plain_text_resume.yaml" + resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" if not resume.exists(): return {} data = _load_yaml(resume) @@ -197,8 +197,10 @@ def _copy_configs(source: Path, dest: Path, apply: bool) -> None: def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None: print("\n── Copying AIHawk resume profile") - src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" - dst = dest / "aihawk" / "data_folder" / "plain_text_resume.yaml" + src = source / "config" / "plain_text_resume.yaml" + if not src.exists(): + src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" + dst = dest / "config" / "plain_text_resume.yaml" _copy_file(src, dst, apply) diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py index 4450dbb..ed9f74b 100644 --- a/scripts/resume_parser.py +++ b/scripts/resume_parser.py @@ -92,6 +92,18 @@ def _find_column_split(page) -> float | None: return split_x if split_x and best_gap > page.width * 0.03 else None +_CID_BULLETS = {127, 149, 183} # common bullet CIDs across ATS-reembedded fonts + +def _clean_cid(text: str) -> str: + """Replace (cid:NNN) glyph references emitted by pdfplumber when a PDF font + lacks a ToUnicode map. 
Known bullet CIDs become '•'; everything else is + stripped so downstream section parsing sees clean text.""" + def _replace(m: re.Match) -> str: + n = int(m.group(1)) + return "•" if n in _CID_BULLETS else "" + return re.sub(r"\(cid:(\d+)\)", _replace, text) + + def extract_text_from_pdf(file_bytes: bytes) -> str: """Extract text from PDF, handling two-column layouts via gutter detection. @@ -116,12 +128,12 @@ def extract_text_from_pdf(file_bytes: bytes) -> str: pages.append("\n".join(filter(None, [header_text, left_text, right_text]))) continue pages.append(page.extract_text() or "") - return "\n".join(pages) + return _clean_cid("\n".join(pages)) def extract_text_from_docx(file_bytes: bytes) -> str: doc = Document(io.BytesIO(file_bytes)) - return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) + return _clean_cid("\n".join(p.text for p in doc.paragraphs if p.text.strip())) def extract_text_from_odt(file_bytes: bytes) -> str: @@ -139,7 +151,7 @@ def extract_text_from_odt(file_bytes: bytes) -> str: text = "".join(elem.itertext()).strip() if text: lines.append(text) - return "\n".join(lines) + return _clean_cid("\n".join(lines)) # ── Section splitter ────────────────────────────────────────────────────────── diff --git a/scripts/user_profile.py b/scripts/user_profile.py index 1e4981b..fa2678f 100644 --- a/scripts/user_profile.py +++ b/scripts/user_profile.py @@ -15,6 +15,7 @@ _DEFAULTS = { "phone": "", "linkedin": "", "career_summary": "", + "candidate_voice": "", "nda_companies": [], "docs_dir": "~/Documents/JobSearch", "ollama_models_dir": "~/models/ollama", @@ -61,6 +62,7 @@ class UserProfile: self.phone: str = data["phone"] self.linkedin: str = data["linkedin"] self.career_summary: str = data["career_summary"] + self.candidate_voice: str = data.get("candidate_voice", "") self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]] self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve() self.ollama_models_dir: Path = 
Path(data["ollama_models_dir"]).expanduser().resolve()