fix: resume CID glyphs, resume YAML path, PyJWT dep, candidate voice & mission UI

- resume_parser: add _clean_cid() to strip (cid:NNN) glyph refs from ATS PDFs;
  CIDs 127/149/183 become bullets, unknowns are stripped; applied to PDF/DOCX/ODT
- resume YAML: canonicalize plain_text_resume.yaml path to config/ across all
  references (Settings, Apply, Setup, company_research, migrate); was pointing at
  unmounted aihawk/data_folder/ in Docker
- requirements/environment: add PyJWT>=2.8 (was missing; broke Settings page)
- user_profile: add candidate_voice field
- generate_cover_letter: inject candidate_voice into SYSTEM_CONTEXT; add
  social_impact mission signal category (nonprofit, community, equity, etc.)
- Settings: add Voice & Personality textarea to Identity expander; add
  Mission & Values expander with editable fields for all 4 mission categories
- .gitignore: exclude CLAUDE.md, config/plain_text_resume.yaml,
  config/user.yaml.working
- search_profiles: add default profile
This commit is contained in:
pyr0ball 2026-02-26 12:32:28 -08:00
parent 07bdac6302
commit db127848a1
12 changed files with 109 additions and 20 deletions

5
.gitignore vendored
View file

@@ -19,6 +19,7 @@ unsloth_compiled_cache/
data/survey_screenshots/* data/survey_screenshots/*
!data/survey_screenshots/.gitkeep !data/survey_screenshots/.gitkeep
config/user.yaml config/user.yaml
config/plain_text_resume.yaml
config/.backup-* config/.backup-*
config/integrations/*.yaml config/integrations/*.yaml
!config/integrations/*.yaml.example !config/integrations/*.yaml.example
@@ -30,3 +31,7 @@ scrapers/raw_scrapes/
compose.override.yml compose.override.yml
config/license.json config/license.json
config/user.yaml.working
# Claude context files — kept out of version control
CLAUDE.md

View file

@@ -405,7 +405,7 @@ elif step == 4:
if errs: if errs:
st.error("\n".join(errs)) st.error("\n".join(errs))
else: else:
resume_yaml_path = _ROOT / "aihawk" / "data_folder" / "plain_text_resume.yaml" resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml"
resume_yaml_path.parent.mkdir(parents=True, exist_ok=True) resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience} resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
resume_yaml_path.write_text( resume_yaml_path.write_text(

View file

@@ -24,7 +24,7 @@ SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml" BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
LLM_CFG = CONFIG_DIR / "llm.yaml" LLM_CFG = CONFIG_DIR / "llm.yaml"
NOTION_CFG = CONFIG_DIR / "notion.yaml" NOTION_CFG = CONFIG_DIR / "notion.yaml"
RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml" KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
def load_yaml(path: Path) -> dict: def load_yaml(path: Path) -> dict:
@@ -113,6 +113,36 @@ with tab_profile:
u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", "")) u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", ""))
u_summary = st.text_area("Career Summary (used in LLM prompts)", u_summary = st.text_area("Career Summary (used in LLM prompts)",
_u.get("career_summary", ""), height=100) _u.get("career_summary", ""), height=100)
u_voice = st.text_area(
"Voice & Personality (shapes cover letter tone)",
_u.get("candidate_voice", ""),
height=80,
help="Personality traits and writing voice that the LLM uses to write authentically in your style. Never disclosed in applications.",
)
with st.expander("🎯 Mission & Values"):
st.caption("Industry passions and causes you care about. Used to inject authentic Para 3 alignment when a company matches. Never disclosed in applications.")
_mission = dict(_u.get("mission_preferences", {}))
_mission_keys = ["animal_welfare", "education", "music", "social_impact"]
_mission_labels = {
"animal_welfare": "🐾 Animal Welfare",
"education": "📚 Education / EdTech / Kids",
"music": "🎵 Music Industry",
"social_impact": "🌍 Social Impact / Nonprofits",
}
_mission_updated = {}
for key in _mission_keys:
_mission_updated[key] = st.text_area(
_mission_labels[key],
_mission.get(key, ""),
height=68,
key=f"mission_{key}",
help=f"Your personal connection to this domain. Leave blank to use the default prompt hint.",
)
# Preserve any extra keys the user may have added manually in YAML
for k, v in _mission.items():
if k not in _mission_keys:
_mission_updated[k] = v
with st.expander("🔒 Sensitive Employers (NDA)"): with st.expander("🔒 Sensitive Employers (NDA)"):
st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.") st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.")
@@ -180,10 +210,11 @@ with tab_profile:
new_data = { new_data = {
"name": u_name, "email": u_email, "phone": u_phone, "name": u_name, "email": u_email, "phone": u_phone,
"linkedin": u_linkedin, "career_summary": u_summary, "linkedin": u_linkedin, "career_summary": u_summary,
"candidate_voice": u_voice,
"nda_companies": nda_list, "nda_companies": nda_list,
"docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm, "docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm,
"inference_profile": u_inf_profile, "inference_profile": u_inf_profile,
"mission_preferences": _u.get("mission_preferences", {}), "mission_preferences": {k: v for k, v in _mission_updated.items() if v.strip()},
"candidate_accessibility_focus": u_access_focus, "candidate_accessibility_focus": u_access_focus,
"candidate_lgbtq_focus": u_lgbtq_focus, "candidate_lgbtq_focus": u_lgbtq_focus,
"services": { "services": {
@@ -673,7 +704,7 @@ with tab_resume:
) )
if not RESUME_PATH.exists(): if not RESUME_PATH.exists():
st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?") st.error(f"Resume YAML not found at `{RESUME_PATH}`. Copy or create `config/plain_text_resume.yaml`.")
st.stop() st.stop()
_data = yaml.safe_load(RESUME_PATH.read_text()) or {} _data = yaml.safe_load(RESUME_PATH.read_text()) or {}

View file

@@ -28,7 +28,7 @@ from scripts.db import (
from scripts.task_runner import submit_task from scripts.task_runner import submit_task
DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" RESUME_YAML = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
st.title("🚀 Apply Workspace") st.title("🚀 Apply Workspace")

View file

@@ -1,4 +1,15 @@
profiles: profiles:
- boards:
- linkedin
- indeed
- glassdoor
- zip_recruiter
job_titles:
- Customer Service Specialist
locations:
- San Francisco CA
name: default
remote_only: false
- boards: - boards:
- linkedin - linkedin
- indeed - indeed

View file

@@ -28,7 +28,7 @@ dependencies:
- fake-useragent # company scraper rotation - fake-useragent # company scraper rotation
# ── LLM / AI backends ───────────────────────────────────────────────────── # ── LLM / AI backends ─────────────────────────────────────────────────────
- openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers) - openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
- anthropic>=0.80 # direct Anthropic API fallback - anthropic>=0.80 # direct Anthropic API fallback
- ollama # Python client for Ollama management - ollama # Python client for Ollama management
- langchain>=0.2 - langchain>=0.2
@@ -54,6 +54,9 @@ dependencies:
- pyyaml>=6.0 - pyyaml>=6.0
- python-dotenv - python-dotenv
# ── Auth / licensing ──────────────────────────────────────────────────────
- PyJWT>=2.8
# ── Utilities ───────────────────────────────────────────────────────────── # ── Utilities ─────────────────────────────────────────────────────────────
- sqlalchemy - sqlalchemy
- tqdm - tqdm

View file

@@ -22,7 +22,7 @@ curl_cffi
fake-useragent fake-useragent
# ── LLM / AI backends ───────────────────────────────────────────────────── # ── LLM / AI backends ─────────────────────────────────────────────────────
openai>=1.0 openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
anthropic>=0.80 anthropic>=0.80
ollama ollama
langchain>=0.2 langchain>=0.2
@@ -51,6 +51,9 @@ json-repair
pyyaml>=6.0 pyyaml>=6.0
python-dotenv python-dotenv
# ── Auth / licensing ──────────────────────────────────────────────────────
PyJWT>=2.8
# ── Utilities ───────────────────────────────────────────────────────────── # ── Utilities ─────────────────────────────────────────────────────────────
sqlalchemy sqlalchemy
tqdm tqdm

View file

@@ -193,7 +193,7 @@ def _parse_sections(text: str) -> dict[str, str]:
return sections return sections
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" _RESUME_YAML = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml"
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml" _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"

View file

@@ -26,11 +26,19 @@ LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "Jo
LETTER_GLOB = "*Cover Letter*.md" LETTER_GLOB = "*Cover Letter*.md"
# Background injected into every prompt so the model has the candidate's facts # Background injected into every prompt so the model has the candidate's facts
SYSTEM_CONTEXT = ( def _build_system_context() -> str:
f"You are writing cover letters for {_profile.name}. {_profile.career_summary}" if not _profile:
if _profile else return "You are a professional cover letter writer. Write in first person."
"You are a professional cover letter writer. Write in first person." parts = [f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"]
) if _profile.candidate_voice:
parts.append(
f"Voice and personality: {_profile.candidate_voice} "
"Write in a way that reflects these authentic traits — not as a checklist, "
"but as a natural expression of who this person is."
)
return " ".join(parts)
SYSTEM_CONTEXT = _build_system_context()
# ── Mission-alignment detection ─────────────────────────────────────────────── # ── Mission-alignment detection ───────────────────────────────────────────────
@@ -58,6 +66,13 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
"instructure", "canvas lms", "clever", "district", "teacher", "instructure", "canvas lms", "clever", "district", "teacher",
"k-12", "k12", "grade", "pedagogy", "k-12", "k12", "grade", "pedagogy",
], ],
"social_impact": [
"nonprofit", "non-profit", "501(c)", "social impact", "mission-driven",
"public benefit", "community", "underserved", "equity", "justice",
"humanitarian", "advocacy", "charity", "foundation", "ngo",
"social good", "civic", "public health", "mental health", "food security",
"housing", "homelessness", "poverty", "workforce development",
],
} }
_candidate = _profile.name if _profile else "the candidate" _candidate = _profile.name if _profile else "the candidate"
@@ -79,6 +94,11 @@ _MISSION_DEFAULTS: dict[str, str] = {
f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically " f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically "
"and warmly." "and warmly."
), ),
"social_impact": (
f"This organization is mission-driven / social impact focused — exactly the kind of "
f"cause {_candidate} cares deeply about. Para 3 should warmly reflect their genuine "
"desire to apply their skills to work that makes a real difference in people's lives."
),
} }

View file

@@ -84,9 +84,9 @@ def _extract_career_summary(source: Path) -> str:
def _extract_personal_info(source: Path) -> dict: def _extract_personal_info(source: Path) -> dict:
"""Extract personal info from aihawk resume yaml.""" """Extract personal info from aihawk resume yaml."""
resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" resume = source / "config" / "plain_text_resume.yaml"
if not resume.exists(): if not resume.exists():
resume = source / "config" / "plain_text_resume.yaml" resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
if not resume.exists(): if not resume.exists():
return {} return {}
data = _load_yaml(resume) data = _load_yaml(resume)
@@ -197,8 +197,10 @@ def _copy_configs(source: Path, dest: Path, apply: bool) -> None:
def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None: def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None:
print("\n── Copying AIHawk resume profile") print("\n── Copying AIHawk resume profile")
src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" src = source / "config" / "plain_text_resume.yaml"
dst = dest / "aihawk" / "data_folder" / "plain_text_resume.yaml" if not src.exists():
src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
dst = dest / "config" / "plain_text_resume.yaml"
_copy_file(src, dst, apply) _copy_file(src, dst, apply)

View file

@@ -92,6 +92,18 @@ def _find_column_split(page) -> float | None:
return split_x if split_x and best_gap > page.width * 0.03 else None return split_x if split_x and best_gap > page.width * 0.03 else None
_CID_BULLETS = {127, 149, 183} # common bullet CIDs across ATS-reembedded fonts
def _clean_cid(text: str) -> str:
"""Replace (cid:NNN) glyph references emitted by pdfplumber when a PDF font
lacks a ToUnicode map. Known bullet CIDs become '• '; everything else is
stripped so downstream section parsing sees clean text."""
def _replace(m: re.Match) -> str:
n = int(m.group(1))
return "• " if n in _CID_BULLETS else ""
return re.sub(r"\(cid:(\d+)\)", _replace, text)
def extract_text_from_pdf(file_bytes: bytes) -> str: def extract_text_from_pdf(file_bytes: bytes) -> str:
"""Extract text from PDF, handling two-column layouts via gutter detection. """Extract text from PDF, handling two-column layouts via gutter detection.
@@ -116,12 +128,12 @@ def extract_text_from_pdf(file_bytes: bytes) -> str:
pages.append("\n".join(filter(None, [header_text, left_text, right_text]))) pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
continue continue
pages.append(page.extract_text() or "") pages.append(page.extract_text() or "")
return "\n".join(pages) return _clean_cid("\n".join(pages))
def extract_text_from_docx(file_bytes: bytes) -> str: def extract_text_from_docx(file_bytes: bytes) -> str:
doc = Document(io.BytesIO(file_bytes)) doc = Document(io.BytesIO(file_bytes))
return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) return _clean_cid("\n".join(p.text for p in doc.paragraphs if p.text.strip()))
def extract_text_from_odt(file_bytes: bytes) -> str: def extract_text_from_odt(file_bytes: bytes) -> str:
@@ -139,7 +151,7 @@ def extract_text_from_odt(file_bytes: bytes) -> str:
text = "".join(elem.itertext()).strip() text = "".join(elem.itertext()).strip()
if text: if text:
lines.append(text) lines.append(text)
return "\n".join(lines) return _clean_cid("\n".join(lines))
# ── Section splitter ────────────────────────────────────────────────────────── # ── Section splitter ──────────────────────────────────────────────────────────

View file

@@ -15,6 +15,7 @@ _DEFAULTS = {
"phone": "", "phone": "",
"linkedin": "", "linkedin": "",
"career_summary": "", "career_summary": "",
"candidate_voice": "",
"nda_companies": [], "nda_companies": [],
"docs_dir": "~/Documents/JobSearch", "docs_dir": "~/Documents/JobSearch",
"ollama_models_dir": "~/models/ollama", "ollama_models_dir": "~/models/ollama",
@@ -61,6 +62,7 @@ class UserProfile:
self.phone: str = data["phone"] self.phone: str = data["phone"]
self.linkedin: str = data["linkedin"] self.linkedin: str = data["linkedin"]
self.career_summary: str = data["career_summary"] self.career_summary: str = data["career_summary"]
self.candidate_voice: str = data.get("candidate_voice", "")
self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]] self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]]
self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve() self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve()
self.ollama_models_dir: Path = Path(data["ollama_models_dir"]).expanduser().resolve() self.ollama_models_dir: Path = Path(data["ollama_models_dir"]).expanduser().resolve()