fix: resume CID glyphs, resume YAML path, PyJWT dep, candidate voice & mission UI
- resume_parser: add _clean_cid() to strip (cid:NNN) glyph refs from ATS PDFs; CIDs 127/149/183 become bullets, unknowns are stripped; applied to PDF/DOCX/ODT
- resume YAML: canonicalize plain_text_resume.yaml path to config/ across all references (Settings, Apply, Setup, company_research, migrate); was pointing at unmounted aihawk/data_folder/ in Docker
- requirements/environment: add PyJWT>=2.8 (was missing; broke Settings page)
- user_profile: add candidate_voice field
- generate_cover_letter: inject candidate_voice into SYSTEM_CONTEXT; add social_impact mission signal category (nonprofit, community, equity, etc.)
- Settings: add Voice & Personality textarea to Identity expander; add Mission & Values expander with editable fields for all 4 mission categories
- .gitignore: exclude CLAUDE.md, config/plain_text_resume.yaml, config/user.yaml.working
- search_profiles: add default profile
This commit is contained in:
parent
07bdac6302
commit
db127848a1
12 changed files with 109 additions and 20 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -19,6 +19,7 @@ unsloth_compiled_cache/
|
||||||
data/survey_screenshots/*
|
data/survey_screenshots/*
|
||||||
!data/survey_screenshots/.gitkeep
|
!data/survey_screenshots/.gitkeep
|
||||||
config/user.yaml
|
config/user.yaml
|
||||||
|
config/plain_text_resume.yaml
|
||||||
config/.backup-*
|
config/.backup-*
|
||||||
config/integrations/*.yaml
|
config/integrations/*.yaml
|
||||||
!config/integrations/*.yaml.example
|
!config/integrations/*.yaml.example
|
||||||
|
|
@ -30,3 +31,7 @@ scrapers/raw_scrapes/
|
||||||
|
|
||||||
compose.override.yml
|
compose.override.yml
|
||||||
config/license.json
|
config/license.json
|
||||||
|
config/user.yaml.working
|
||||||
|
|
||||||
|
# Claude context files — kept out of version control
|
||||||
|
CLAUDE.md
|
||||||
|
|
|
||||||
|
|
@ -405,7 +405,7 @@ elif step == 4:
|
||||||
if errs:
|
if errs:
|
||||||
st.error("\n".join(errs))
|
st.error("\n".join(errs))
|
||||||
else:
|
else:
|
||||||
resume_yaml_path = _ROOT / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml"
|
||||||
resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
|
resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
|
resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
|
||||||
resume_yaml_path.write_text(
|
resume_yaml_path.write_text(
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
|
||||||
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
|
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
|
||||||
LLM_CFG = CONFIG_DIR / "llm.yaml"
|
LLM_CFG = CONFIG_DIR / "llm.yaml"
|
||||||
NOTION_CFG = CONFIG_DIR / "notion.yaml"
|
NOTION_CFG = CONFIG_DIR / "notion.yaml"
|
||||||
RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
|
||||||
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
||||||
|
|
||||||
def load_yaml(path: Path) -> dict:
|
def load_yaml(path: Path) -> dict:
|
||||||
|
|
@ -113,6 +113,36 @@ with tab_profile:
|
||||||
u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", ""))
|
u_linkedin = c2.text_input("LinkedIn URL", _u.get("linkedin", ""))
|
||||||
u_summary = st.text_area("Career Summary (used in LLM prompts)",
|
u_summary = st.text_area("Career Summary (used in LLM prompts)",
|
||||||
_u.get("career_summary", ""), height=100)
|
_u.get("career_summary", ""), height=100)
|
||||||
|
u_voice = st.text_area(
|
||||||
|
"Voice & Personality (shapes cover letter tone)",
|
||||||
|
_u.get("candidate_voice", ""),
|
||||||
|
height=80,
|
||||||
|
help="Personality traits and writing voice that the LLM uses to write authentically in your style. Never disclosed in applications.",
|
||||||
|
)
|
||||||
|
|
||||||
|
with st.expander("🎯 Mission & Values"):
|
||||||
|
st.caption("Industry passions and causes you care about. Used to inject authentic Para 3 alignment when a company matches. Never disclosed in applications.")
|
||||||
|
_mission = dict(_u.get("mission_preferences", {}))
|
||||||
|
_mission_keys = ["animal_welfare", "education", "music", "social_impact"]
|
||||||
|
_mission_labels = {
|
||||||
|
"animal_welfare": "🐾 Animal Welfare",
|
||||||
|
"education": "📚 Education / EdTech / Kids",
|
||||||
|
"music": "🎵 Music Industry",
|
||||||
|
"social_impact": "🌍 Social Impact / Nonprofits",
|
||||||
|
}
|
||||||
|
_mission_updated = {}
|
||||||
|
for key in _mission_keys:
|
||||||
|
_mission_updated[key] = st.text_area(
|
||||||
|
_mission_labels[key],
|
||||||
|
_mission.get(key, ""),
|
||||||
|
height=68,
|
||||||
|
key=f"mission_{key}",
|
||||||
|
help=f"Your personal connection to this domain. Leave blank to use the default prompt hint.",
|
||||||
|
)
|
||||||
|
# Preserve any extra keys the user may have added manually in YAML
|
||||||
|
for k, v in _mission.items():
|
||||||
|
if k not in _mission_keys:
|
||||||
|
_mission_updated[k] = v
|
||||||
|
|
||||||
with st.expander("🔒 Sensitive Employers (NDA)"):
|
with st.expander("🔒 Sensitive Employers (NDA)"):
|
||||||
st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.")
|
st.caption("Companies listed here appear as 'previous employer (NDA)' in research briefs.")
|
||||||
|
|
@ -180,10 +210,11 @@ with tab_profile:
|
||||||
new_data = {
|
new_data = {
|
||||||
"name": u_name, "email": u_email, "phone": u_phone,
|
"name": u_name, "email": u_email, "phone": u_phone,
|
||||||
"linkedin": u_linkedin, "career_summary": u_summary,
|
"linkedin": u_linkedin, "career_summary": u_summary,
|
||||||
|
"candidate_voice": u_voice,
|
||||||
"nda_companies": nda_list,
|
"nda_companies": nda_list,
|
||||||
"docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm,
|
"docs_dir": u_docs, "ollama_models_dir": u_ollama, "vllm_models_dir": u_vllm,
|
||||||
"inference_profile": u_inf_profile,
|
"inference_profile": u_inf_profile,
|
||||||
"mission_preferences": _u.get("mission_preferences", {}),
|
"mission_preferences": {k: v for k, v in _mission_updated.items() if v.strip()},
|
||||||
"candidate_accessibility_focus": u_access_focus,
|
"candidate_accessibility_focus": u_access_focus,
|
||||||
"candidate_lgbtq_focus": u_lgbtq_focus,
|
"candidate_lgbtq_focus": u_lgbtq_focus,
|
||||||
"services": {
|
"services": {
|
||||||
|
|
@ -673,7 +704,7 @@ with tab_resume:
|
||||||
)
|
)
|
||||||
|
|
||||||
if not RESUME_PATH.exists():
|
if not RESUME_PATH.exists():
|
||||||
st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?")
|
st.error(f"Resume YAML not found at `{RESUME_PATH}`. Copy or create `config/plain_text_resume.yaml`.")
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
_data = yaml.safe_load(RESUME_PATH.read_text()) or {}
|
_data = yaml.safe_load(RESUME_PATH.read_text()) or {}
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ from scripts.db import (
|
||||||
from scripts.task_runner import submit_task
|
from scripts.task_runner import submit_task
|
||||||
|
|
||||||
DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
|
DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
|
||||||
RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
RESUME_YAML = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
|
||||||
|
|
||||||
st.title("🚀 Apply Workspace")
|
st.title("🚀 Apply Workspace")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,15 @@
|
||||||
profiles:
|
profiles:
|
||||||
|
- boards:
|
||||||
|
- linkedin
|
||||||
|
- indeed
|
||||||
|
- glassdoor
|
||||||
|
- zip_recruiter
|
||||||
|
job_titles:
|
||||||
|
- Customer Service Specialist
|
||||||
|
locations:
|
||||||
|
- San Francisco CA
|
||||||
|
name: default
|
||||||
|
remote_only: false
|
||||||
- boards:
|
- boards:
|
||||||
- linkedin
|
- linkedin
|
||||||
- indeed
|
- indeed
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ dependencies:
|
||||||
- fake-useragent # company scraper rotation
|
- fake-useragent # company scraper rotation
|
||||||
|
|
||||||
# ── LLM / AI backends ─────────────────────────────────────────────────────
|
# ── LLM / AI backends ─────────────────────────────────────────────────────
|
||||||
- openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers)
|
- openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
|
||||||
- anthropic>=0.80 # direct Anthropic API fallback
|
- anthropic>=0.80 # direct Anthropic API fallback
|
||||||
- ollama # Python client for Ollama management
|
- ollama # Python client for Ollama management
|
||||||
- langchain>=0.2
|
- langchain>=0.2
|
||||||
|
|
@ -54,6 +54,9 @@ dependencies:
|
||||||
- pyyaml>=6.0
|
- pyyaml>=6.0
|
||||||
- python-dotenv
|
- python-dotenv
|
||||||
|
|
||||||
|
# ── Auth / licensing ──────────────────────────────────────────────────────
|
||||||
|
- PyJWT>=2.8
|
||||||
|
|
||||||
# ── Utilities ─────────────────────────────────────────────────────────────
|
# ── Utilities ─────────────────────────────────────────────────────────────
|
||||||
- sqlalchemy
|
- sqlalchemy
|
||||||
- tqdm
|
- tqdm
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ curl_cffi
|
||||||
fake-useragent
|
fake-useragent
|
||||||
|
|
||||||
# ── LLM / AI backends ─────────────────────────────────────────────────────
|
# ── LLM / AI backends ─────────────────────────────────────────────────────
|
||||||
openai>=1.0
|
openai>=1.55.0,<2.0.0 # >=1.55 required for httpx 0.28 compat; <2.0 for langchain-openai
|
||||||
anthropic>=0.80
|
anthropic>=0.80
|
||||||
ollama
|
ollama
|
||||||
langchain>=0.2
|
langchain>=0.2
|
||||||
|
|
@ -51,6 +51,9 @@ json-repair
|
||||||
pyyaml>=6.0
|
pyyaml>=6.0
|
||||||
python-dotenv
|
python-dotenv
|
||||||
|
|
||||||
|
# ── Auth / licensing ──────────────────────────────────────────────────────
|
||||||
|
PyJWT>=2.8
|
||||||
|
|
||||||
# ── Utilities ─────────────────────────────────────────────────────────────
|
# ── Utilities ─────────────────────────────────────────────────────────────
|
||||||
sqlalchemy
|
sqlalchemy
|
||||||
tqdm
|
tqdm
|
||||||
|
|
|
||||||
|
|
@ -193,7 +193,7 @@ def _parse_sections(text: str) -> dict[str, str]:
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
|
|
||||||
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
_RESUME_YAML = Path(__file__).parent.parent / "config" / "plain_text_resume.yaml"
|
||||||
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,11 +26,19 @@ LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "Jo
|
||||||
LETTER_GLOB = "*Cover Letter*.md"
|
LETTER_GLOB = "*Cover Letter*.md"
|
||||||
|
|
||||||
# Background injected into every prompt so the model has the candidate's facts
|
# Background injected into every prompt so the model has the candidate's facts
|
||||||
SYSTEM_CONTEXT = (
|
def _build_system_context() -> str:
|
||||||
f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"
|
if not _profile:
|
||||||
if _profile else
|
return "You are a professional cover letter writer. Write in first person."
|
||||||
"You are a professional cover letter writer. Write in first person."
|
parts = [f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"]
|
||||||
)
|
if _profile.candidate_voice:
|
||||||
|
parts.append(
|
||||||
|
f"Voice and personality: {_profile.candidate_voice} "
|
||||||
|
"Write in a way that reflects these authentic traits — not as a checklist, "
|
||||||
|
"but as a natural expression of who this person is."
|
||||||
|
)
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
SYSTEM_CONTEXT = _build_system_context()
|
||||||
|
|
||||||
|
|
||||||
# ── Mission-alignment detection ───────────────────────────────────────────────
|
# ── Mission-alignment detection ───────────────────────────────────────────────
|
||||||
|
|
@ -58,6 +66,13 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
|
||||||
"instructure", "canvas lms", "clever", "district", "teacher",
|
"instructure", "canvas lms", "clever", "district", "teacher",
|
||||||
"k-12", "k12", "grade", "pedagogy",
|
"k-12", "k12", "grade", "pedagogy",
|
||||||
],
|
],
|
||||||
|
"social_impact": [
|
||||||
|
"nonprofit", "non-profit", "501(c)", "social impact", "mission-driven",
|
||||||
|
"public benefit", "community", "underserved", "equity", "justice",
|
||||||
|
"humanitarian", "advocacy", "charity", "foundation", "ngo",
|
||||||
|
"social good", "civic", "public health", "mental health", "food security",
|
||||||
|
"housing", "homelessness", "poverty", "workforce development",
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
_candidate = _profile.name if _profile else "the candidate"
|
_candidate = _profile.name if _profile else "the candidate"
|
||||||
|
|
@ -79,6 +94,11 @@ _MISSION_DEFAULTS: dict[str, str] = {
|
||||||
f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically "
|
f"{_candidate}'s values. Para 3 should reflect this authentic connection specifically "
|
||||||
"and warmly."
|
"and warmly."
|
||||||
),
|
),
|
||||||
|
"social_impact": (
|
||||||
|
f"This organization is mission-driven / social impact focused — exactly the kind of "
|
||||||
|
f"cause {_candidate} cares deeply about. Para 3 should warmly reflect their genuine "
|
||||||
|
"desire to apply their skills to work that makes a real difference in people's lives."
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -84,9 +84,9 @@ def _extract_career_summary(source: Path) -> str:
|
||||||
|
|
||||||
def _extract_personal_info(source: Path) -> dict:
|
def _extract_personal_info(source: Path) -> dict:
|
||||||
"""Extract personal info from aihawk resume yaml."""
|
"""Extract personal info from aihawk resume yaml."""
|
||||||
resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
|
||||||
if not resume.exists():
|
|
||||||
resume = source / "config" / "plain_text_resume.yaml"
|
resume = source / "config" / "plain_text_resume.yaml"
|
||||||
|
if not resume.exists():
|
||||||
|
resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||||
if not resume.exists():
|
if not resume.exists():
|
||||||
return {}
|
return {}
|
||||||
data = _load_yaml(resume)
|
data = _load_yaml(resume)
|
||||||
|
|
@ -197,8 +197,10 @@ def _copy_configs(source: Path, dest: Path, apply: bool) -> None:
|
||||||
|
|
||||||
def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None:
|
def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None:
|
||||||
print("\n── Copying AIHawk resume profile")
|
print("\n── Copying AIHawk resume profile")
|
||||||
|
src = source / "config" / "plain_text_resume.yaml"
|
||||||
|
if not src.exists():
|
||||||
src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||||
dst = dest / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
dst = dest / "config" / "plain_text_resume.yaml"
|
||||||
_copy_file(src, dst, apply)
|
_copy_file(src, dst, apply)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -92,6 +92,18 @@ def _find_column_split(page) -> float | None:
|
||||||
return split_x if split_x and best_gap > page.width * 0.03 else None
|
return split_x if split_x and best_gap > page.width * 0.03 else None
|
||||||
|
|
||||||
|
|
||||||
|
_CID_BULLETS = {127, 149, 183} # common bullet CIDs across ATS-reembedded fonts
|
||||||
|
|
||||||
|
def _clean_cid(text: str) -> str:
|
||||||
|
"""Replace (cid:NNN) glyph references emitted by pdfplumber when a PDF font
|
||||||
|
lacks a ToUnicode map. Known bullet CIDs become '•'; everything else is
|
||||||
|
stripped so downstream section parsing sees clean text."""
|
||||||
|
def _replace(m: re.Match) -> str:
|
||||||
|
n = int(m.group(1))
|
||||||
|
return "•" if n in _CID_BULLETS else ""
|
||||||
|
return re.sub(r"\(cid:(\d+)\)", _replace, text)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
||||||
"""Extract text from PDF, handling two-column layouts via gutter detection.
|
"""Extract text from PDF, handling two-column layouts via gutter detection.
|
||||||
|
|
||||||
|
|
@ -116,12 +128,12 @@ def extract_text_from_pdf(file_bytes: bytes) -> str:
|
||||||
pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
|
pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
|
||||||
continue
|
continue
|
||||||
pages.append(page.extract_text() or "")
|
pages.append(page.extract_text() or "")
|
||||||
return "\n".join(pages)
|
return _clean_cid("\n".join(pages))
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_docx(file_bytes: bytes) -> str:
|
def extract_text_from_docx(file_bytes: bytes) -> str:
|
||||||
doc = Document(io.BytesIO(file_bytes))
|
doc = Document(io.BytesIO(file_bytes))
|
||||||
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
|
return _clean_cid("\n".join(p.text for p in doc.paragraphs if p.text.strip()))
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_odt(file_bytes: bytes) -> str:
|
def extract_text_from_odt(file_bytes: bytes) -> str:
|
||||||
|
|
@ -139,7 +151,7 @@ def extract_text_from_odt(file_bytes: bytes) -> str:
|
||||||
text = "".join(elem.itertext()).strip()
|
text = "".join(elem.itertext()).strip()
|
||||||
if text:
|
if text:
|
||||||
lines.append(text)
|
lines.append(text)
|
||||||
return "\n".join(lines)
|
return _clean_cid("\n".join(lines))
|
||||||
|
|
||||||
|
|
||||||
# ── Section splitter ──────────────────────────────────────────────────────────
|
# ── Section splitter ──────────────────────────────────────────────────────────
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ _DEFAULTS = {
|
||||||
"phone": "",
|
"phone": "",
|
||||||
"linkedin": "",
|
"linkedin": "",
|
||||||
"career_summary": "",
|
"career_summary": "",
|
||||||
|
"candidate_voice": "",
|
||||||
"nda_companies": [],
|
"nda_companies": [],
|
||||||
"docs_dir": "~/Documents/JobSearch",
|
"docs_dir": "~/Documents/JobSearch",
|
||||||
"ollama_models_dir": "~/models/ollama",
|
"ollama_models_dir": "~/models/ollama",
|
||||||
|
|
@ -61,6 +62,7 @@ class UserProfile:
|
||||||
self.phone: str = data["phone"]
|
self.phone: str = data["phone"]
|
||||||
self.linkedin: str = data["linkedin"]
|
self.linkedin: str = data["linkedin"]
|
||||||
self.career_summary: str = data["career_summary"]
|
self.career_summary: str = data["career_summary"]
|
||||||
|
self.candidate_voice: str = data.get("candidate_voice", "")
|
||||||
self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]]
|
self.nda_companies: list[str] = [c.lower() for c in data["nda_companies"]]
|
||||||
self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve()
|
self.docs_dir: Path = Path(data["docs_dir"]).expanduser().resolve()
|
||||||
self.ollama_models_dir: Path = Path(data["ollama_models_dir"]).expanduser().resolve()
|
self.ollama_models_dir: Path = Path(data["ollama_models_dir"]).expanduser().resolve()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue