fix: harden resume section detection — anchor patterns to full line, expand header synonyms, fix name heuristic for hyphenated/middle-initial names, add parse diagnostics UI
This commit is contained in:
parent
d6545cf496
commit
01a341e4c5
2 changed files with 33 additions and 11 deletions
|
|
@ -317,14 +317,33 @@ elif step == 4:
|
||||||
else extract_text_from_docx(file_bytes)
|
else extract_text_from_docx(file_bytes)
|
||||||
)
|
)
|
||||||
with st.spinner("Parsing\u2026"):
|
with st.spinner("Parsing\u2026"):
|
||||||
parsed = structure_resume(raw_text)
|
parsed, parse_err = structure_resume(raw_text)
|
||||||
if parsed:
|
|
||||||
|
# Diagnostic: show raw extraction + detected fields regardless of outcome
|
||||||
|
with st.expander("🔍 Parse diagnostics", expanded=not bool(parsed and any(
|
||||||
|
parsed.get(k) for k in ("name", "experience", "skills")
|
||||||
|
))):
|
||||||
|
st.caption("**Raw extracted text (first 800 chars)**")
|
||||||
|
st.code(raw_text[:800] if raw_text else "(empty)", language="text")
|
||||||
|
if parsed:
|
||||||
|
st.caption("**Detected fields**")
|
||||||
|
st.json({k: (v[:3] if isinstance(v, list) else v) for k, v in parsed.items()})
|
||||||
|
|
||||||
|
if parsed and any(parsed.get(k) for k in ("name", "experience", "skills")):
|
||||||
st.session_state["_parsed_resume"] = parsed
|
st.session_state["_parsed_resume"] = parsed
|
||||||
st.session_state["_raw_resume_text"] = raw_text
|
st.session_state["_raw_resume_text"] = raw_text
|
||||||
_save_yaml({"_raw_resume_text": raw_text[:8000]})
|
_save_yaml({"_raw_resume_text": raw_text[:8000]})
|
||||||
st.success("Parsed! Review the builder tab to edit entries.")
|
st.success("Parsed! Review the builder tab to edit entries.")
|
||||||
|
elif parsed:
|
||||||
|
# Parsed but empty — show what we got and let them proceed or build manually
|
||||||
|
st.session_state["_parsed_resume"] = parsed
|
||||||
|
st.warning("Resume text was extracted but no fields were recognised. "
|
||||||
|
"Check the diagnostics above — the section headers may use unusual labels. "
|
||||||
|
"You can still fill in the Build tab manually.")
|
||||||
else:
|
else:
|
||||||
st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.")
|
st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.")
|
||||||
|
if parse_err:
|
||||||
|
st.caption(f"Reason: {parse_err}")
|
||||||
|
|
||||||
with tab_builder:
|
with tab_builder:
|
||||||
parsed = st.session_state.get("_parsed_resume", {})
|
parsed = st.session_state.get("_parsed_resume", {})
|
||||||
|
|
|
||||||
|
|
@ -22,11 +22,11 @@ log = logging.getLogger(__name__)
|
||||||
# ── Section header detection ──────────────────────────────────────────────────
|
# ── Section header detection ──────────────────────────────────────────────────
|
||||||
|
|
||||||
_SECTION_NAMES = {
|
_SECTION_NAMES = {
|
||||||
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
|
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary|career summary|career objective|personal statement)\s*:?\s*$", re.I),
|
||||||
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
|
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience|career history|relevant experience|professional history|employment history|positions? held)\s*:?\s*$", re.I),
|
||||||
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
|
"education": re.compile(r"^(education|academic|qualifications|degrees?|educational background|academic background)\s*:?\s*$", re.I),
|
||||||
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
|
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise|areas? of expertise|key skills?|proficiencies|tools? & technologies)\s*:?\s*$", re.I),
|
||||||
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
|
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?|publications?|volunteer)\s*:?\s*$", re.I),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Degrees — used to detect education lines
|
# Degrees — used to detect education lines
|
||||||
|
|
@ -108,17 +108,20 @@ def _parse_header(lines: list[str]) -> dict:
|
||||||
email_m = _EMAIL_RE.search(full_text)
|
email_m = _EMAIL_RE.search(full_text)
|
||||||
phone_m = _PHONE_RE.search(full_text)
|
phone_m = _PHONE_RE.search(full_text)
|
||||||
|
|
||||||
# Name heuristic: first non-empty line that has no @ and no digits-only tokens
|
# Name heuristic: first non-empty line that looks like a person's name
|
||||||
name = ""
|
name = ""
|
||||||
for line in lines[:5]:
|
for line in lines[:5]:
|
||||||
if "@" in line or re.match(r"^\d", line.strip()):
|
if "@" in line or re.match(r"^\d", line.strip()):
|
||||||
continue
|
continue
|
||||||
# Skip lines that look like city/state/zip
|
# Skip lines that look like city/state/zip or URLs
|
||||||
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
|
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line) or re.search(r"https?://|linkedin|github", line, re.I):
|
||||||
continue
|
continue
|
||||||
|
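# Replace separators (| • · ,) with spaces before the alpha check; credential suffixes (MBA, PhD, etc.) are NOT removed here, so they stay in the candidate and count toward the 2–5 word limit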
|
||||||
candidate = re.sub(r"[|•·,]+", " ", line).strip()
|
candidate = re.sub(r"[|•·,]+", " ", line).strip()
|
||||||
candidate = re.sub(r"\s{2,}", " ", candidate)
|
candidate = re.sub(r"\s{2,}", " ", candidate)
|
||||||
if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha():
|
# Normalise: remove periods, hyphens and apostrophes (straight ' and curly \u2019) for the alpha-only check
|
||||||
|
alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
|
||||||
|
if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha():
|
||||||
name = candidate
|
name = candidate
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue