peregrine/scripts/resume_parser.py
pyr0ball 82c26074d8 fix: search prefs wizard data loss, resume sync link, docs + GUI help links
Bug fixes (filed as #125–#128):
- Wizard step 7 read data.titles instead of data.search.titles — user-entered
  job titles and locations were silently dropped on every wizard run (#125)
- GET /api/settings/search returned "titles" key but store expected "job_titles" —
  Settings → Search Prefs always showed empty even when data existed (#126)
- remote_only preference not persisted during wizard setup (#127)
- apply-to-profile didn't set default_resume_id in user.yaml, so future
  Resume Profile saves never synced back to the library entry (#128)

Also:
- Wizard step headings corrected (off-by-one after Training step was inserted)
- Ollama host in wizard inference step now reads from saved wizard state
- Resume upload during wizard now creates a library entry and sets it as default

Docs:
- New: docs/user-guide/daily-workflow.md — end-to-end daily usage guide
- Updated: docs/user-guide/settings.md — rewritten for Vue SPA (was Streamlit)
- mkdocs.yml nav: Daily Workflow added as first User Guide entry

GUI help links:
- web/src/composables/useDocsUrl.ts — shared docs base URL composable
- Home: "Daily Workflow guide ↗" link in subtitle
- Job Review: "? Docs" link in title row
- Resume Library: "? Help" link in header
- Settings → Resume Profile: "? Help" link in page header
- Settings → Search Prefs: "? Help" link in page header
2026-06-15 16:52:56 -07:00

509 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Resume parser — extract text from PDF/DOCX and structure via section parsing.
Primary path: regex + section detection (no LLM, no token limits).
Optional enhancement: LLM-generated career_summary if a capable backend is configured.
Falls back to empty dict on unrecoverable errors — caller shows the form builder.
"""
from __future__ import annotations
import io
import logging
import re
import zipfile
from xml.etree import ElementTree as ET
import pdfplumber
from docx import Document
log = logging.getLogger(__name__)
# Lines a browser injects when a page is printed to PDF. They carry no resume
# content and are skipped by the section splitter:
#   * print header: "MM/DD/YY, H:MM AM/PM <title>"
#   * print footer: "file:///... N/N"
_BROWSER_ARTIFACT_RE = re.compile(
    r"^file:///"  # footer: local file URL
    r"|^\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{2}\s+[AP]M\b",  # header: print timestamp
    flags=re.IGNORECASE,
)
# ── Section header detection ──────────────────────────────────────────────────
# Maps a canonical section key to the pattern recognising its heading line.
# Every pattern is anchored at both ends, so it only matches a line that consists
# of the heading itself (optionally followed by a colon). Insertion order matters:
# the section splitter takes the first pattern that matches.
_SECTION_NAMES = {
    "summary": re.compile(
        r"^(summary|objective|profile|about me|professional summary"
        r"|career summary|career objective|personal statement)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "experience": re.compile(
        r"^(experience|work experience|employment|work history|professional experience"
        r"|career history|relevant experience|professional history|employment history"
        r"|positions? held)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "education": re.compile(
        r"^(education|academic|qualifications|degrees?"
        r"|educational background|academic background)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "skills": re.compile(
        r"^(skills?|technical skills?|core competencies|competencies|expertise"
        r"|areas? of expertise|key skills?|proficiencies|tools? & technologies)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "achievements": re.compile(
        r"^(achievements?|accomplishments?|awards?|honors?"
        r"|certifications?|publications?|volunteer)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "projects": re.compile(
        r"^(projects?|independent development|independent projects?|side projects?"
        r"|personal projects?|open.?source|portfolio)\s*:?\s*$",
        re.IGNORECASE,
    ),
    "references": re.compile(r"^references?\s*:?\s*$", re.IGNORECASE),
}
# Degree keywords, abbreviated ("B.S.", "MBA", "PhD") or spelled out ("Bachelor").
# A hit on a line in the education section marks the start of an education entry.
_DEGREE_RE = re.compile(
    r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?"
    r"|bachelor|master|associate|doctorate|diploma)\b",
    flags=re.IGNORECASE,
)
# A single date as it appears in resume entries:
#   "Jan 2020" / "January 2020"  (optional month name followed by a 4-digit year)
#   "2020"                       (bare year)
#   "01/2020"                    (numeric month/year)
_DATE_RE = re.compile(
    r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec"
    r"|january|february|march|april|june|july|august"
    r"|september|october|november|december)?\s*\d{4}\b"
    r"|\b\d{1,2}/\d{4}\b",
    flags=re.IGNORECASE,
)
# A date range such as "Jan 2019 - Mar 2022", "03/2018 to Present" or "2015–2017".
#   group(1): start date  (month-name + year, MM/YYYY, or bare year)
#   group(2): end date    (same forms, or "present" / "current" / "now")
# The separator is one or more dash characters (hyphen, figure dash, en dash,
# em dash, minus sign) or the word "to". Dashes are written as \uXXXX escapes so
# look-alike characters cannot be confused or lost when the file is edited.
_DATE_RANGE_RE = re.compile(
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r")"
    r"\s*(?:[-\u2012\u2013\u2014\u2212]+|to)\s*"
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r"|present|current|now"
    r")",
    re.I,
)
# Contact info
# Email: the domain is one or more dot-separated labels, so the match can never
# end on a "." (a sentence-ending period after an address is not part of it).
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+(?:\.[\w\-]+)+")
# North-American style phone number with optional "+1" prefix and optional
# parentheses around the area code; separators may be space, dot or hyphen.
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
# LinkedIn public-profile path ("linkedin.com/in/<slug>"), case-insensitive.
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ───────────────────────────────────────────────────────────
def _find_column_split(page) -> float | None:
    """Locate the gutter of a two-column page.

    Looks at the left edges (``x0``, truncated to whole units) of all words that
    start within the middle half of the page (25%-75% of its width) and picks the
    widest gap between two neighbouring edges.

    Returns the x-coordinate at the centre of that gap, or ``None`` when the page
    has fewer than 10 words, fewer than two distinct edges in the band, or the
    widest gap is not larger than 3% of the page width (single-column page).
    """
    page_words = page.extract_words()
    if len(page_words) < 10:
        return None

    band_lo = page.width * 0.25
    band_hi = page.width * 0.75
    # Distinct integer left edges of the words that begin inside the middle band.
    edges = sorted({int(wd["x0"]) for wd in page_words if band_lo <= wd["x0"] <= band_hi})
    if len(edges) < 2:
        return None

    # Walk neighbouring edges; the first strictly-widest gap wins.
    widest, gutter_x = 0.0, None
    for left, right in zip(edges, edges[1:]):
        if right - left > widest:
            widest = right - left
            gutter_x = (left + right) / 2

    if gutter_x and widest > page.width * 0.03:
        return gutter_x
    return None
_CID_BULLETS = {127, 149, 183}  # common bullet CIDs across ATS-reembedded fonts


def _clean_cid(text: str) -> str:
    """Normalise ``(cid:NNN)`` glyph references left in extracted text.

    pdfplumber emits ``(cid:NNN)`` when a PDF font lacks a ToUnicode map.
    References whose number is in ``_CID_BULLETS`` are replaced with a real
    bullet character (U+2022) so the experience/skills parsers still recognise
    bullet lines; every other reference is removed.

    Args:
        text: Raw extracted text.

    Returns:
        ``text`` with all ``(cid:NNN)`` references replaced or removed.
    """
    def _replace(m: re.Match) -> str:
        # Previously both branches returned "", which silently deleted bullets
        # and made _CID_BULLETS dead code.
        return "\u2022" if int(m.group(1)) in _CID_BULLETS else ""

    return re.sub(r"\(cid:(\d+)\)", _replace, text)
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of a PDF, reading two-column pages column by column.

    For each page a column gutter is looked up with ``_find_column_split``. When
    one is found, the page is cut into three regions: a full-width header above
    the first word that starts right of the gutter, then the left and the right
    column below it. The regions are emitted in that order so the header
    (name, contact) is not clipped by the column crop. The split is only used
    when both columns hold more than 60 characters of text; otherwise the page is
    extracted as a whole. ``(cid:NNN)`` references are cleaned from the result.
    """
    page_texts: list[str] = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            gutter_x = _find_column_split(page)
            if gutter_x:
                width, height = page.width, page.height
                # The topmost word starting right of the gutter marks where the
                # columnar body begins; everything above it is the header.
                right_side = [wd for wd in page.extract_words() if wd["x0"] >= gutter_x]
                body_top = min(wd["top"] for wd in right_side) if right_side else 0
                header_text, left_text, right_text = (
                    page.within_bbox(bbox).extract_text() or ""
                    for bbox in (
                        (0, 0, width, body_top),
                        (0, body_top, gutter_x, height),
                        (gutter_x, body_top, width, height),
                    )
                )
                if len(left_text.strip()) > 60 and len(right_text.strip()) > 60:
                    regions = (header_text, left_text, right_text)
                    page_texts.append("\n".join(part for part in regions if part))
                    continue
            # Single-column page, or a split that left one side nearly empty.
            page_texts.append(page.extract_text() or "")
    return _clean_cid("\n".join(page_texts))
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the text of a DOCX file, one line per non-blank paragraph.

    Only the paragraphs exposed by ``Document.paragraphs`` are read; the joined
    text is passed through ``_clean_cid`` before being returned.
    """
    document = Document(io.BytesIO(file_bytes))
    non_blank = [para.text for para in document.paragraphs if para.text.strip()]
    return _clean_cid("\n".join(non_blank))
def extract_text_from_odt(file_bytes: bytes) -> str:
    """Extract plain text from an ODT file (ZIP + XML, no external deps required).

    ODT is a ZIP archive whose ``content.xml`` holds the document body. Every
    ``text:p`` (paragraph) and ``text:h`` (heading) element becomes one output
    line, in document order. ODF encodes tabs, runs of spaces and soft line
    breaks as empty elements (``text:tab``, ``text:s``, ``text:line-break``)
    rather than as characters; these are expanded back into whitespace so that
    neighbouring words (e.g. "Title<tab>2005-2006") are not glued together.

    Args:
        file_bytes: Raw bytes of the ``.odt`` file.

    Returns:
        The document text, one paragraph/heading per line, with ``(cid:NNN)``
        references cleaned.

    Raises:
        zipfile.BadZipFile: ``file_bytes`` is not a ZIP archive.
        KeyError: the archive has no ``content.xml`` member.
        xml.etree.ElementTree.ParseError: ``content.xml`` is not well-formed.
    """
    ns = "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}"
    block_tags = (ns + "p", ns + "h")
    tab_tag, space_tag, break_tag = ns + "tab", ns + "s", ns + "line-break"
    count_attr = ns + "c"  # text:s repeat count; a missing attribute means one space

    def _text_of(elem) -> str:
        # Like "".join(elem.itertext()), but expands ODF whitespace elements.
        parts = [elem.text or ""]
        for child in elem:
            if child.tag == tab_tag:
                parts.append("\t")
            elif child.tag == break_tag:
                parts.append("\n")
            elif child.tag == space_tag:
                raw_count = child.get(count_attr, "1")
                parts.append(" " * (int(raw_count) if raw_count.isdigit() else 1))
            else:
                parts.append(_text_of(child))
            parts.append(child.tail or "")
        return "".join(parts)

    with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
        with zf.open("content.xml") as f:
            tree = ET.parse(f)

    lines: list[str] = []
    # Walk all text:p and text:h elements in document order
    for elem in tree.iter():
        if elem.tag in block_tags:
            text = _text_of(elem).strip()
            if text:
                lines.append(text)
    return _clean_cid("\n".join(lines))
# ── Section splitter ──────────────────────────────────────────────────────────
def _split_sections(text: str) -> dict[str, list[str]]:
    """Group the lines of a resume by the section they belong to.

    Lines are stripped; blank lines and browser print artifacts are dropped. A
    line of at most five words that matches one of the ``_SECTION_NAMES`` patterns
    switches the active section and is not stored itself. Every other line is
    appended to the active section, which starts out as ``"header"`` (the
    name/contact block at the top of the resume).

    Returns a dict mapping section key to its lines; ``"header"`` is always
    present, other keys only once a line has been stored under them.
    """
    buckets: dict[str, list[str]] = {"header": []}
    active = "header"
    for raw in text.splitlines():
        content = raw.strip()
        if not content or _BROWSER_ARTIFACT_RE.match(content):
            continue
        # First section whose heading pattern matches this (short) line, if any.
        heading = next(
            (
                key
                for key, rx in _SECTION_NAMES.items()
                if rx.match(content) and len(content.split()) <= 5
            ),
            None,
        )
        if heading is None:
            buckets.setdefault(active, []).append(content)
        else:
            active = heading
    return buckets
# ── Contact info ──────────────────────────────────────────────────────────────
def _looks_like_name(candidate: str) -> bool:
    """Return True when *candidate* is 2-5 words made only of letters (ignoring . - ' and U+2019)."""
    letters = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
    return 2 <= len(candidate.split()) <= 5 and letters.isalpha()


def _parse_header(lines: list[str]) -> dict:
    """Extract name, email and phone from the top-of-resume block.

    Email and phone are the first matches of ``_EMAIL_RE`` / ``_PHONE_RE`` in the
    whole block. The name is searched for in the first eight lines and handles
    two layouts:
      (A) the name on its own line;
      (B) "email@example.com Firstname Lastname" on one line.

    Args:
        lines: Lines of the header section.

    Returns:
        ``{"name": ..., "email": ..., "phone": ...}``; a value is ``""`` when
        nothing was found.
    """
    full_text = "\n".join(lines)
    email_m = _EMAIL_RE.search(full_text)
    phone_m = _PHONE_RE.search(full_text)

    name = ""
    for line in lines[:8]:
        stripped = line.strip()
        if not stripped:
            continue
        # Layout B: line contains an email; the text after it may be the name.
        if "@" in stripped:
            # Use a separate variable: reassigning email_m here used to discard
            # the block-wide match whenever a line held "@" but no valid address
            # (e.g. a social handle), leaving the returned email empty.
            line_email_m = _EMAIL_RE.search(stripped)
            if line_email_m:
                after = stripped[line_email_m.end():].strip(" |•,")
                after_clean = re.sub(r"\s{2,}", " ", after)
                if _looks_like_name(after_clean):
                    name = after_clean
                    break
            continue
        # Skip phone/URL/city lines
        if re.match(r"^\d", stripped):
            continue
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", stripped) or re.search(
            r"https?://|linkedin|github", stripped, re.I
        ):
            continue
        # Layout A: plain name line (separators collapsed to single spaces)
        candidate = re.sub(r"[|•·,]+", " ", stripped).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        if _looks_like_name(candidate):
            name = candidate
            break

    return {
        "name": name,
        "email": email_m.group(0) if email_m else "",
        "phone": phone_m.group(0) if phone_m else "",
    }
# ── Experience ────────────────────────────────────────────────────────────────
def _parse_experience(lines: list[str]) -> list[dict]:
    """Turn the lines of the experience section into a list of job entries.

    A line containing a date range starts a new entry. Title and company are
    taken from that line and/or the line held just before it:
      (A) title on the held line, "Company | Location | Dates" on the date line;
      (B) "Title | Company" on the held line, only dates on the date line;
      (C) "Title | Company | Dates" all on the date line, or the title with the
          dates and the company on the following line.
    Lines after the date line become bullets of the current entry, except a
    short date-free line before the first bullet (used as the company when none
    is known yet) and date-free lines containing " | " (held as the header of
    the next entry).

    Each entry is a dict with keys ``title``, ``company``, ``start_date``,
    ``end_date`` and ``bullets``.
    """
    bullet_prefix = re.compile(r"^[•\-–—*◦▪▸►]\s*")
    jobs: list[dict] = []
    job: dict | None = None
    pending = ""  # candidate title/header line waiting for its date line
    bullets_started = False  # True once the current job has at least one bullet

    for line in lines:
        span = _DATE_RANGE_RE.search(line)

        if span is None:
            if job is None:
                # Nothing open yet: remember the line as a possible title.
                pending = line
                continue

            bulleted = bool(bullet_prefix.match(line))
            plain = not bulleted and not _DATE_RE.search(line)
            if plain and not bullets_started and not job["company"] and len(line.strip()) < 80:
                # Layout C: company/location directly under the date line.
                job["company"] = re.split(r"\s{2,}|[|,]\s*", line.strip(), maxsplit=1)[0].strip()
                pending = ""
            elif plain and " | " in line:
                # Probably "Title | Company" of the next entry; keep it for later.
                pending = line
            else:
                text = bullet_prefix.sub("", line).strip()
                if text:
                    job["bullets"].append(text)
                    bullets_started = True
                # Body text must never be mistaken for the next entry's title.
                pending = ""
            continue

        # --- date line: close the open entry and start a new one ---
        if job:
            jobs.append(job)

        remainder = _DATE_RANGE_RE.sub("", line)
        # Drop bracket/whitespace leftovers such as "()" once the dates are gone.
        remainder = re.sub(r"[()[\]{}\s]+$", "", remainder).strip(" –—|-•")

        if pending and remainder.strip():
            # Layout A: held line is the title, date line starts with the company.
            title = pending.strip()
            company = re.split(r"\s{2,}|[|,]\s*", remainder.strip(), maxsplit=1)[0].strip()
        else:
            # Layout B/C: title and company share one line.
            header = remainder if remainder.strip() else pending
            pieces = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
            title = pieces[0].strip() if pieces else ""
            company = pieces[1].strip() if len(pieces) > 1 else ""

        job = {
            "title": title,
            "company": company,
            "start_date": span.group(1),
            "end_date": span.group(2),
            "bullets": [],
        }
        pending = ""
        bullets_started = False

    if job:
        jobs.append(job)
    return jobs
# ── Education ─────────────────────────────────────────────────────────────────
# Words that identify a line as naming an educational institution.
_INSTITUTION_RE = re.compile(r"\b(university|college|institute|school|academy)\b", re.IGNORECASE)


def _parse_education(lines: list[str]) -> list[dict]:
    """Turn the lines of the education section into a list of entries.

    A new entry starts on a line that contains a degree keyword (B.S., Master,
    ...) or, failing that, a year range. The year-range fallback covers courses
    and non-degree study, e.g. "San Jose State University 2005-2006".
    Lines that start no entry fill in the open entry's institution first and
    then its field; before any entry exists they are held as a possible
    institution name for the entry that follows.

    Each entry is a dict with keys ``institution``, ``degree``, ``field`` and
    ``graduation_year`` (the first 19xx/20xx year on the starting line).
    """
    year_rx = r"\b(19|20)\d{2}\b"
    records: list[dict] = []
    record: dict | None = None
    carry = ""  # previous unconsumed line; may name the institution

    for line in lines:
        degree_m = _DEGREE_RE.search(line)
        range_m = _DATE_RANGE_RE.search(line)
        year_m = re.search(year_rx, line)
        starts_entry = bool(degree_m) or (bool(year_m) and bool(range_m))

        if not starts_entry:
            if record is None:
                carry = line.strip()
                continue
            value = line.strip(" ,–—|•\t")
            if value and record["institution"] and not record["field"]:
                record["field"] = value
                carry = ""  # consumed as the field; must not seed the next entry
                continue
            if value and not record["institution"]:
                record["institution"] = value
            carry = line.strip()
            continue

        # --- line starts a new entry ---
        if record:
            records.append(record)
        record = {"institution": "", "degree": "", "field": "", "graduation_year": ""}
        if year_m:
            record["graduation_year"] = year_m.group(0)

        if degree_m:
            record["degree"] = degree_m.group(0).upper()
            # Whatever is left after removing dates and the degree is the field.
            leftover = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            leftover = re.sub(year_rx, "", leftover)
            record["field"] = leftover.strip(" ,–—|•.")
            if carry and not _DEGREE_RE.search(carry) and not _DATE_RE.search(carry):
                record["institution"] = carry.strip(" ,–—|•")
        else:
            # Year range without a degree keyword. Two layouts:
            #   (A) "Graphic Design, 2005-2006" with the institution on the held line
            #   (B) "San Jose State University\t2005-2006", institution on this line
            same = _DATE_RANGE_RE.sub("", line)
            same = re.sub(year_rx, "", same).strip(" ,–—|•\t")
            prior = carry.strip(" ,–—|•") if carry else ""
            if same and _INSTITUTION_RE.search(prior):
                record["institution"] = prior
                record["field"] = same
            elif same:
                record["institution"] = same
            elif prior:
                record["institution"] = prior
        carry = ""  # consumed; must not leak into the next entry

    if record:
        records.append(record)
    return records
# ── Skills ────────────────────────────────────────────────────────────────────
def _split_skill_tokens(line: str) -> list[str]:
    """Split a skills line on delimiters, but not on commas inside parentheses.

    The line is first split on ``|``, ``•``, ``·`` and tab (always separators);
    each segment is then split on commas that sit at parenthesis depth zero, so
    "CRM Ticketing (Jira, Salesforce)" stays intact. Tokens are returned
    untrimmed and may be empty; the caller cleans and filters them.

    An unmatched ``)`` is kept in the token but does not lower the depth below
    zero. A stray closing parenthesis would otherwise leave the depth negative
    and stop every later comma in that segment from splitting.

    Args:
        line: One line from the skills section.

    Returns:
        The raw tokens in order of appearance.
    """
    tokens: list[str] = []
    for part in re.split(r"[|•·\t]+", line):
        depth = 0
        buf: list[str] = []
        for ch in part:
            if ch == "," and depth == 0:
                tokens.append("".join(buf))
                buf = []
                continue
            if ch == "(":
                depth += 1
            elif ch == ")" and depth > 0:
                depth -= 1
            buf.append(ch)
        tokens.append("".join(buf))
    return tokens
def _parse_skills(lines: list[str]) -> list[str]:
    """Collect individual skills from the lines of the skills section.

    Each line is tokenised with ``_split_skill_tokens``. Tokens are trimmed of
    surrounding whitespace and bullet/dash markers only; parentheses are kept
    because many skills contain them, e.g. "C++ (Arduino / Embedded)". Tokens of
    2 to 60 characters are kept, in order of appearance.
    """
    trimmed = (
        token.strip(" -–—*◦▪▸►")
        for line in lines
        for token in _split_skill_tokens(line)
    )
    return [skill for skill in trimmed if 1 < len(skill) <= 60]
# ── Main parser ───────────────────────────────────────────────────────────────
def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Build a structured resume dict from plain text (section detection + regex).

    Returns ``(result, error)``. On success ``result`` holds the contact fields
    from the header plus ``career_summary``, ``experience``, ``education``,
    ``skills`` and ``achievements``, and ``error`` is ``""``. When the text is
    blank or parsing raises, ``result`` is ``{}`` and ``error`` says why; the
    exception itself is logged, never propagated.
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."

    try:
        sections = _split_sections(raw_text)
        # Contact fields come first; the section-derived fields follow.
        parsed = dict(_parse_header(sections.get("header", [])))
        parsed.update(
            career_summary=" ".join(sections.get("summary", [])),
            experience=_parse_experience(sections.get("experience", [])),
            education=_parse_education(sections.get("education", [])),
            skills=_parse_skills(sections.get("skills", [])),
            achievements=sections.get("achievements", []),
        )
    except Exception as e:
        import traceback

        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(e)
    return parsed, ""
# ── LLM enhancement (career summary only, optional) ──────────────────────────
def _llm_career_summary(raw_text: str) -> str:
    """Ask the LLM router for a short career summary of the resume.

    Only the first 1500 characters of ``raw_text`` are sent. This is best-effort:
    any failure (router not importable, backend error, ...) yields ``""``.
    """
    try:
        from scripts.llm_router import LLMRouter

        excerpt = raw_text[:1500]
        prompt = (
            "Write a 2-3 sentence professional career summary for this candidate "
            "based on their resume. Return only the summary text, no labels.\n\n"
            f"Resume:\n{excerpt}"
        )
        router = LLMRouter()
        return router.complete(prompt)
    except Exception:
        return ""
# ── Public entry point ────────────────────────────────────────────────────────
def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Parse a resume and, if it has no summary section, try to generate one.

    Returns ``(result, error)``. When ``parse_resume`` yields an empty result,
    its result and error are passed through unchanged. Otherwise ``error`` is
    ``""`` and ``result["career_summary"]`` is filled from the LLM only when the
    document provided none and the LLM returned a non-empty text.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error

    # A summary found in the document always wins over a generated one.
    if parsed.get("career_summary"):
        return parsed, ""

    try:
        generated = _llm_career_summary(raw_text)
    except Exception:
        generated = ""
    if generated:
        parsed["career_summary"] = generated.strip()
    return parsed, ""