From db127848a1f142ac818d40670eccc9c358225414 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Thu, 26 Feb 2026 12:32:28 -0800
Subject: [PATCH] fix: resume CID glyphs, resume YAML path, PyJWT dep,
 candidate voice & mission UI

- resume_parser: add _clean_cid() to strip (cid:NNN) glyph refs from ATS PDFs;
  CIDs 127/149/183 become bullets, unknowns are stripped; applied to PDF/DOCX/ODT
- resume YAML: canonicalize plain_text_resume.yaml path to config/ across all
  references (Settings, Apply, Setup, company_research, migrate); was pointing at
  unmounted aihawk/data_folder/ in Docker
- requirements/environment: add PyJWT>=2.8 (was missing; broke Settings page);
  pin openai>=1.55.0,<2.0.0 (httpx 0.28 compat; <2.0 for langchain-openai)
- user_profile: add candidate_voice field
- generate_cover_letter: inject candidate_voice into SYSTEM_CONTEXT; add
  social_impact mission signal category (nonprofit, community, equity, etc.)
- Settings: add Voice & Personality textarea to Identity expander; add
  Mission & Values expander with editable fields for all 4 mission categories
- .gitignore: exclude CLAUDE.md, config/plain_text_resume.yaml,
  config/user.yaml.working
- search_profiles: add default profile
---
 .gitignore                       |  5 +++++
 app/pages/0_Setup.py             |  2 +-
 app/pages/2_Settings.py          | 37 +++++++++++++++++++++++++++++---
 app/pages/4_Apply.py             |  2 +-
 config/search_profiles.yaml      | 11 ++++++++++
 environment.yml                  |  5 ++++-
 requirements.txt                 |  5 ++++-
 scripts/company_research.py      |  2 +-
 scripts/generate_cover_letter.py | 30 +++++++++++++++++++++-----
 scripts/migrate.py               | 10 +++++----
 scripts/resume_parser.py         | 18 +++++++++++++---
 scripts/user_profile.py          |  2 ++
 12 files changed, 109 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0787951..edf6c8c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ unsloth_compiled_cache/
 data/survey_screenshots/*
 !data/survey_screenshots/.gitkeep
 config/user.yaml
+config/plain_text_resume.yaml
 config/.backup-*
 config/integrations/*.yaml
 !config/integrations/*.yaml.example
@@ -30,3 +31,7 @@ scrapers/raw_scrapes/
 
 compose.override.yml
 config/license.json
+config/user.yaml.working
+
+# Claude context files — kept out of version control
+CLAUDE.md
diff --git a/app/pages/0_Setup.py b/app/pages/0_Setup.py
index dce06b2..89670f3 100644
--- a/app/pages/0_Setup.py
+++ b/app/pages/0_Setup.py
@@ -405,7 +405,7 @@ elif step == 4:
         if errs:
             st.error("\n".join(errs))
         else:
-            resume_yaml_path = _ROOT / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+            resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml"
             resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
             resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
             resume_yaml_path.write_text(
diff --git a/app/pages/2_Settings.py b/app/pages/2_Settings.py
index 2c5aae7..9922cb8 100644
--- a/app/pages/2_Settings.py
+++ b/app/pages/2_Settings.py
@@ -24,7 +24,7 @@ SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
 BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
 LLM_CFG = CONFIG_DIR / "llm.yaml"
 NOTION_CFG = CONFIG_DIR / "notion.yaml"
-RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
 KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
 
 def load_yaml(path: Path) -> dict:
@@ -113,6 +113,36 @@ with tab_profile:
         u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", ""))
         u_summary  = st.text_area("Career Summary (used in LLM prompts)",
                                    _u.get("career_summary", ""), height=100)
+        u_voice = st.text_area(
+            "Voice & Personality (shapes cover letter tone)",
+            _u.get("candidate_voice", ""),
+            height=80,
+            help="Personality traits and writing voice that the LLM uses to write authentically in your style. Never disclosed in applications.",
+        )
+
+    with st.expander("🎯 Mission & Values"):
+        st.caption("Industry passions and causes you care about. Used to inject authentic Para 3 alignment when a company matches. Never disclosed in applications.")
+        _mission = dict(_u.get("mission_preferences") or {})  # key may be present but null in YAML
+        _mission_keys = ["animal_welfare", "education", "music", "social_impact"]
+        _mission_labels = {
+            "animal_welfare": "🐾 Animal Welfare",
+            "education": "📚 Education / EdTech / Kids",
+            "music": "🎵 Music Industry",
+            "social_impact": "🌍 Social Impact / Nonprofits",
+        }
+        _mission_updated = {}
+        for key in _mission_keys:
+            _mission_updated[key] = st.text_area(
+                _mission_labels[key],
+                _mission.get(key) or "",  # a null entry is shown as an empty box
+                height=68,
+                key=f"mission_{key}",
+                help="Your personal connection to this domain. Leave blank to use the default prompt hint.",
+            )
+        # Preserve any extra keys the user may have added manually in YAML
+        for k, v in _mission.items():
+            if k not in _mission_keys:
+                _mission_updated[k] = v
 
     with st.expander("🔒 Sensitive Employers (NDA)"):
         st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.")
@@ -180,10 +210,11 @@ with tab_profile:
         new_data = {
             "name": u_name, "email": u_email, "phone": u_phone,
             "linkedin": u_linkedin, "career_summary": u_summary,
+            "candidate_voice": u_voice,
             "nda_companies": nda_list,
             "docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm,
             "inference_profile": u_inf_profile,
-            "mission_preferences": _u.get("mission_preferences", {}),
+            "mission_preferences": {k: v for k, v in _mission_updated.items() if not isinstance(v, str) or v.strip()},  # drop blank text; keep non-string extras as-is
             "candidate_accessibility_focus": u_access_focus,
             "candidate_lgbtq_focus": u_lgbtq_focus,
             "services": {
@@ -673,7 +704,7 @@ with tab_resume:
     )
 
     if not RESUME_PATH.exists():
-        st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?")
+        st.error(f"Resume YAML not found at `{RESUME_PATH}`. Copy or create `config/plain_text_resume.yaml`.")
         st.stop()
 
     _data = yaml.safe_load(RESUME_PATH.read_text()) or {}
diff --git a/app/pages/4_Apply.py b/app/pages/4_Apply.py
index 2c6bcef..41d98b9 100644
--- a/app/pages/4_Apply.py
+++ b/app/pages/4_Apply.py
@@ -28,7 +28,7 @@ from scripts.db import (
 from scripts.task_runner import submit_task
 
 DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
-RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+RESUME_YAML = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
 
 st.title("🚀 Apply Workspace")
 
diff --git a/config/search_profiles.yaml b/config/search_profiles.yaml
index bada59a..8ab44dc 100644
--- a/config/search_profiles.yaml
+++ b/config/search_profiles.yaml
@@ -1,4 +1,15 @@
 profiles:
+- boards:
+  - linkedin
+  - indeed
+  - glassdoor
+  - zip_recruiter
+  job_titles:
+  - Customer Service Specialist
+  locations:
+  - San Francisco CA
+  name: default
+  remote_only: false
 - boards:
   - linkedin
   - indeed
diff --git a/environment.yml b/environment.yml
index 8839279..703118f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -28,7 +28,7 @@ dependencies:
     - fake-useragent      # company scraper rotation
 
     # ── LLM / AI backends ─────────────────────────────────────────────────────
-    - openai>=1.0         # used for OpenAI-compat backends (ollama, vllm, wrappers)
+    - openai>=1.55.0,<2.0.0  # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
     - anthropic>=0.80     # direct Anthropic API fallback
     - ollama              # Python client for Ollama management
     - langchain>=0.2
@@ -54,6 +54,9 @@ dependencies:
     - pyyaml>=6.0
     - python-dotenv
 
+    # ── Auth / licensing ──────────────────────────────────────────────────────
+    - PyJWT>=2.8
+
     # ── Utilities ─────────────────────────────────────────────────────────────
     - sqlalchemy
     - tqdm
diff --git a/requirements.txt b/requirements.txt
index e31b83e..1b0b597 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ curl_cffi
 fake-useragent
 
 # ── LLM / AI backends ─────────────────────────────────────────────────────
-openai>=1.0
+openai>=1.55.0,<2.0.0  # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
 anthropic>=0.80
 ollama
 langchain>=0.2
@@ -51,6 +51,9 @@ json-repair
 pyyaml>=6.0
 python-dotenv
 
+# ── Auth / licensing ──────────────────────────────────────────────────────
+PyJWT>=2.8
+
 # ── Utilities ─────────────────────────────────────────────────────────────
 sqlalchemy
 tqdm
diff --git a/scripts/company_research.py b/scripts/company_research.py
index bdab12b..32fde8f 100644
--- a/scripts/company_research.py
+++ b/scripts/company_research.py
@@ -193,7 +193,7 @@ def _parse_sections(text: str) -> dict[str, str]:
     return sections
 
 
-_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+_RESUME_YAML = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml"
 _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
 
 
diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py
index 4f0da15..481c263 100644
--- a/scripts/generate_cover_letter.py
+++ b/scripts/generate_cover_letter.py
@@ -26,11 +26,19 @@ LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "Jo
 LETTER_GLOB = "*Cover Letter*.md"
 
 # Background injected into every prompt so the model has the candidate's facts
-SYSTEM_CONTEXT = (
-    f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"
-    if _profile else
-    "You are a professional cover letter writer. Write in first person."
-)
+def _build_system_context() -> str:
+    """Return the shared system prompt; voice guidance is added only if non-blank."""
+    if not _profile:
+        return "You are a professional cover letter writer. Write in first person."
+    parts = [f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"]
+    if voice := (_profile.candidate_voice or "").strip():  # None/whitespace-only = not set
+        parts.append(
+            f"Voice and personality: {voice} Write in a way that reflects these authentic "
+            "traits — not as a checklist, but as a natural expression of who this person is."
+        )
+    return " ".join(parts)
+
+SYSTEM_CONTEXT = _build_system_context()
 
 
 # ── Mission-alignment detection ───────────────────────────────────────────────
@@ -58,6 +66,13 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
         "instructure", "canvas lms", "clever", "district", "teacher",
         "k-12", "k12", "grade", "pedagogy",
     ],
+    "social_impact": [
+        "nonprofit", "non-profit", "501(c)", "social impact", "mission-driven",
+        "public benefit", "community", "underserved", "equity", "justice",
+        "humanitarian", "advocacy", "charity", "foundation", "ngo",
+        "social good", "civic", "public health", "mental health", "food security",
+        "housing", "homelessness", "poverty", "workforce development",
+    ],
 }
 
 _candidate = _profile.name if _profile else "the candidate"
@@ -79,6 +94,11 @@ _MISSION_DEFAULTS: dict[str, str] = {
         f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically "
         "and warmly."
     ),
+    "social_impact": (
+        f"This organization is mission-driven / social impact focused — exactly the kind of "
+        f"cause {_candidate} cares deeply about. Para 3 should warmly reflect their genuine "
+        "desire to apply their skills to work that makes a real difference in people's lives."
+    ),
 }
 
 
diff --git a/scripts/migrate.py b/scripts/migrate.py
index d370fb6..67cfad8 100644
--- a/scripts/migrate.py
+++ b/scripts/migrate.py
@@ -84,9 +84,9 @@ def _extract_career_summary(source: Path) -> str:
 
 def _extract_personal_info(source: Path) -> dict:
     """Extract personal info from aihawk resume yaml."""
-    resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+    resume = source / "config" / "plain_text_resume.yaml"
     if not resume.exists():
-        resume = source / "config" / "plain_text_resume.yaml"
+        resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
     if not resume.exists():
         return {}
     data = _load_yaml(resume)
@@ -197,8 +197,10 @@ def _copy_configs(source: Path, dest: Path, apply: bool) -> None:
 
 def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None:
     print("\n── Copying AIHawk resume profile")
-    src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
-    dst = dest / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+    src = source / "config" / "plain_text_resume.yaml"
+    if not src.exists():
+        src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+    dst = dest / "config" / "plain_text_resume.yaml"
     _copy_file(src, dst, apply)
 
 
diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py
index 4450dbb..ed9f74b 100644
--- a/scripts/resume_parser.py
+++ b/scripts/resume_parser.py
@@ -92,6 +92,18 @@ def _find_column_split(page) -> float | None:
     return split_x if split_x and best_gap > page.width * 0.03 else None
 
 
+_CID_BULLETS = {127, 149, 183}  # common bullet CIDs across ATS-reembedded fonts
+
+def _clean_cid(text: str) -> str:
+    """Normalize ``(cid:NNN)`` glyph placeholders found in extracted text.
+
+    A placeholder whose number is listed in ``_CID_BULLETS`` is rewritten to
+    '•'; every other placeholder is deleted. All other text passes through."""
+    def _glyph_for(hit: re.Match) -> str:
+        return "•" if int(hit.group(1)) in _CID_BULLETS else ""
+    return re.sub(r"\(cid:(\d+)\)", _glyph_for, text)
+
+
 def extract_text_from_pdf(file_bytes: bytes) -> str:
     """Extract text from PDF, handling two-column layouts via gutter detection.
 
@@ -116,12 +128,12 @@ def extract_text_from_pdf(file_bytes: bytes) -> str:
                     pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
                     continue
             pages.append(page.extract_text() or "")
-    return "\n".join(pages)
+    return _clean_cid("\n".join(pages))
 
 
 def extract_text_from_docx(file_bytes: bytes) -> str:
     doc = Document(io.BytesIO(file_bytes))
-    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+    return _clean_cid("\n".join(p.text for p in doc.paragraphs if p.text.strip()))
 
 
 def extract_text_from_odt(file_bytes: bytes) -> str:
@@ -139,7 +151,7 @@ def extract_text_from_odt(file_bytes: bytes) -> str:
             text = "".join(elem.itertext()).strip()
             if text:
                 lines.append(text)
-    return "\n".join(lines)
+    return _clean_cid("\n".join(lines))
 
 
 # ── Section splitter ──────────────────────────────────────────────────────────
diff --git a/scripts/user_profile.py b/scripts/user_profile.py
index 1e4981b..fa2678f 100644
--- a/scripts/user_profile.py
+++ b/scripts/user_profile.py
@@ -15,6 +15,7 @@ _DEFAULTS = {
     "phone": "",
     "linkedin": "",
     "career_summary": "",
+    "candidate_voice": "",
     "nda_companies": [],
     "docs_dir": "~/Documents/JobSearch",
     "ollama_models_dir": "~/models/ollama",
@@ -61,6 +62,7 @@ class UserProfile:
         self.phone: str = data["phone"]
         self.linkedin: str = data["linkedin"]
         self.career_summary: str = data["career_summary"]
+        self.candidate_voice: str = data.get("candidate_voice", "")
         self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]]
         self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve()
         self.ollama_models_dir: Path = Path(data["ollama_models_dir"]).expanduser().resolve()