From 01a341e4c5ad58d694b47e59c2b3437d2b0c85bb Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 26 Feb 2026 09:28:31 -0800 Subject: [PATCH] =?UTF-8?q?fix:=20harden=20resume=20section=20detection=20?= =?UTF-8?q?=E2=80=94=20anchor=20patterns=20to=20full=20line,=20expand=20he?= =?UTF-8?q?ader=20synonyms,=20fix=20name=20heuristic=20for=20hyphenated/mi?= =?UTF-8?q?ddle-initial=20names,=20add=20parse=20diagnostics=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/pages/0_Setup.py | 23 +++++++++++++++++++++-- scripts/resume_parser.py | 21 ++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/app/pages/0_Setup.py b/app/pages/0_Setup.py index a31bf4b..dcf804c 100644 --- a/app/pages/0_Setup.py +++ b/app/pages/0_Setup.py @@ -317,14 +317,33 @@ elif step == 4: else extract_text_from_docx(file_bytes) ) with st.spinner("Parsing\u2026"): - parsed = structure_resume(raw_text) - if parsed: + parsed, parse_err = structure_resume(raw_text) + + # Diagnostic: show raw extraction + detected fields regardless of outcome + with st.expander("🔍 Parse diagnostics", expanded=not bool(parsed and any( + parsed.get(k) for k in ("name", "experience", "skills") + ))): + st.caption("**Raw extracted text (first 800 chars)**") + st.code(raw_text[:800] if raw_text else "(empty)", language="text") + if parsed: + st.caption("**Detected fields**") + st.json({k: (v[:3] if isinstance(v, list) else v) for k, v in parsed.items()}) + + if parsed and any(parsed.get(k) for k in ("name", "experience", "skills")): st.session_state["_parsed_resume"] = parsed st.session_state["_raw_resume_text"] = raw_text _save_yaml({"_raw_resume_text": raw_text[:8000]}) st.success("Parsed! Review the builder tab to edit entries.") + elif parsed: + # Parsed but empty — show what we got and let them proceed or build manually + st.session_state["_parsed_resume"] = parsed + st.warning("Resume text was extracted but no fields were recognised. 
" + "Check the diagnostics above — the section headers may use unusual labels. " + "You can still fill in the Build tab manually.") else: st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.") + if parse_err: + st.caption(f"Reason: {parse_err}") with tab_builder: parsed = st.session_state.get("_parsed_resume", {}) diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py index 6644779..e5bddad 100644 --- a/scripts/resume_parser.py +++ b/scripts/resume_parser.py @@ -22,11 +22,11 @@ log = logging.getLogger(__name__) # ── Section header detection ────────────────────────────────────────────────── _SECTION_NAMES = { - "summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I), - "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I), - "education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I), - "skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I), - "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I), + "summary": re.compile(r"^(summary|objective|profile|about me|professional summary|career summary|career objective|personal statement)\s*:?\s*$", re.I), + "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience|career history|relevant experience|professional history|employment history|positions? held)\s*:?\s*$", re.I), + "education": re.compile(r"^(education|academic|qualifications|degrees?|educational background|academic background)\s*:?\s*$", re.I), + "skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise|areas? of expertise|key skills?|proficiencies|tools? 
& technologies)\s*:?\s*$", re.I), + "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?|publications?|volunteer)\s*:?\s*$", re.I), } # Degrees — used to detect education lines @@ -108,17 +108,20 @@ def _parse_header(lines: list[str]) -> dict: email_m = _EMAIL_RE.search(full_text) phone_m = _PHONE_RE.search(full_text) - # Name heuristic: first non-empty line that has no @ and no digits-only tokens + # Name heuristic: first non-empty line that looks like a person's name name = "" for line in lines[:5]: if "@" in line or re.match(r"^\d", line.strip()): continue - # Skip lines that look like city/state/zip - if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line): + # Skip lines that look like city/state/zip or URLs + if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line) or re.search(r"https?://|linkedin|github", line, re.I): continue + # Strip separators and credential suffixes (MBA, PhD, etc.) for the alpha check candidate = re.sub(r"[|•·,]+", " ", line).strip() candidate = re.sub(r"\s{2,}", " ", candidate) - if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha(): + # Normalise: remove periods, hyphens for the alpha-only check + alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", "")) + if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha(): name = candidate break