Compare commits

..

2 commits

2 changed files with 147 additions and 27 deletions

View file

@ -305,26 +305,48 @@ elif step == 4:
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"]) tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
with tab_upload: with tab_upload:
uploaded = st.file_uploader("Upload PDF or DOCX", type=["pdf", "docx"]) uploaded = st.file_uploader("Upload PDF, DOCX, or ODT", type=["pdf", "docx", "odt"])
if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"): if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"):
from scripts.resume_parser import ( from scripts.resume_parser import (
extract_text_from_pdf, extract_text_from_docx, structure_resume, extract_text_from_pdf, extract_text_from_docx,
extract_text_from_odt, structure_resume,
) )
file_bytes = uploaded.read() file_bytes = uploaded.read()
ext = uploaded.name.rsplit(".", 1)[-1].lower() ext = uploaded.name.rsplit(".", 1)[-1].lower()
raw_text = ( if ext == "pdf":
extract_text_from_pdf(file_bytes) if ext == "pdf" raw_text = extract_text_from_pdf(file_bytes)
else extract_text_from_docx(file_bytes) elif ext == "odt":
) raw_text = extract_text_from_odt(file_bytes)
else:
raw_text = extract_text_from_docx(file_bytes)
with st.spinner("Parsing\u2026"): with st.spinner("Parsing\u2026"):
parsed = structure_resume(raw_text) parsed, parse_err = structure_resume(raw_text)
if parsed:
# Diagnostic: show raw extraction + detected fields regardless of outcome
with st.expander("🔍 Parse diagnostics", expanded=not bool(parsed and any(
parsed.get(k) for k in ("name", "experience", "skills")
))):
st.caption("**Raw extracted text (first 800 chars)**")
st.code(raw_text[:800] if raw_text else "(empty)", language="text")
if parsed:
st.caption("**Detected fields**")
st.json({k: (v[:3] if isinstance(v, list) else v) for k, v in parsed.items()})
if parsed and any(parsed.get(k) for k in ("name", "experience", "skills")):
st.session_state["_parsed_resume"] = parsed st.session_state["_parsed_resume"] = parsed
st.session_state["_raw_resume_text"] = raw_text st.session_state["_raw_resume_text"] = raw_text
_save_yaml({"_raw_resume_text": raw_text[:8000]}) _save_yaml({"_raw_resume_text": raw_text[:8000]})
st.success("Parsed! Review the builder tab to edit entries.") st.success("Parsed! Review the builder tab to edit entries.")
elif parsed:
# Parsed but empty — show what we got and let them proceed or build manually
st.session_state["_parsed_resume"] = parsed
st.warning("Resume text was extracted but no fields were recognised. "
"Check the diagnostics above — the section headers may use unusual labels. "
"You can still fill in the Build tab manually.")
else: else:
st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.") st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.")
if parse_err:
st.caption(f"Reason: {parse_err}")
with tab_builder: with tab_builder:
parsed = st.session_state.get("_parsed_resume", {}) parsed = st.session_state.get("_parsed_resume", {})

View file

@ -12,7 +12,9 @@ import io
import json import json
import logging import logging
import re import re
import zipfile
from pathlib import Path from pathlib import Path
from xml.etree import ElementTree as ET
import pdfplumber import pdfplumber
from docx import Document from docx import Document
@ -22,11 +24,11 @@ log = logging.getLogger(__name__)
# ── Section header detection ────────────────────────────────────────────────── # ── Section header detection ──────────────────────────────────────────────────
_SECTION_NAMES = { _SECTION_NAMES = {
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I), "summary": re.compile(r"^(summary|objective|profile|about me|professional summary|career summary|career objective|personal statement)\s*:?\s*$", re.I),
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I), "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience|career history|relevant experience|professional history|employment history|positions? held)\s*:?\s*$", re.I),
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I), "education": re.compile(r"^(education|academic|qualifications|degrees?|educational background|academic background)\s*:?\s*$", re.I),
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I), "skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise|areas? of expertise|key skills?|proficiencies|tools? & technologies)\s*:?\s*$", re.I),
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I), "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?|publications?|volunteer)\s*:?\s*$", re.I),
} }
# Degrees — used to detect education lines # Degrees — used to detect education lines
@ -66,9 +68,54 @@ _LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ─────────────────────────────────────────────────────────── # ── Text extraction ───────────────────────────────────────────────────────────
def _find_column_split(page) -> float | None:
"""Return the x-coordinate of the gutter between two columns, or None if single-column.
Finds the largest horizontal gap between word x0 positions in the middle 40%
of the page width that gap is the column gutter.
"""
words = page.extract_words()
if len(words) < 10:
return None
lo, hi = page.width * 0.25, page.width * 0.75
# Collect unique left-edge positions of words that start in the middle band
xs = sorted({int(w["x0"]) for w in words if lo <= w["x0"] <= hi})
if len(xs) < 2:
return None
# Find the biggest consecutive gap
best_gap, split_x = 0.0, None
for i in range(len(xs) - 1):
gap = xs[i + 1] - xs[i]
if gap > best_gap:
best_gap, split_x = gap, (xs[i] + xs[i + 1]) / 2
# Only treat as two-column if the gap is substantial (> 3% of page width)
return split_x if split_x and best_gap > page.width * 0.03 else None
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF, handling two-column layouts via gutter detection.

    For two-column pages the full-width header (name, contact line) is
    extracted separately from the columnar body so a centered header is not
    clipped by the column split.  Falls back to plain full-page extraction
    when no gutter is detected or either column yields too little text.

    Args:
        file_bytes: raw bytes of the PDF file.

    Returns:
        The extracted text, pages joined with newlines.
    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        pages: list[str] = []
        for page in pdf.pages:
            w, h = page.width, page.height
            split_x = _find_column_split(page)
            if split_x:
                # Find the y-coordinate where right-column content starts;
                # everything above it belongs to the full-width header.
                words = page.extract_words()
                right_words = [wd for wd in words if wd["x0"] >= split_x]
                col_start_y = min(wd["top"] for wd in right_words) if right_words else 0
                # Guard against a degenerate zero-height header bbox when the
                # right column starts at the very top of the page.
                if col_start_y > 0:
                    header_text = page.within_bbox((0, 0, w, col_start_y)).extract_text() or ""
                else:
                    header_text = ""
                left_text = page.within_bbox((0, col_start_y, split_x, h)).extract_text() or ""
                right_text = page.within_bbox((split_x, col_start_y, w, h)).extract_text() or ""
                # Only trust the split when both columns carry real content;
                # otherwise fall through to plain full-page extraction.
                if len(left_text.strip()) > 60 and len(right_text.strip()) > 60:
                    pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
                    continue
            pages.append(page.extract_text() or "")
    return "\n".join(pages)
@ -77,6 +124,24 @@ def extract_text_from_docx(file_bytes: bytes) -> str:
return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
def extract_text_from_odt(file_bytes: bytes) -> str:
    """Extract plain text from an ODT file (ZIP + XML, no external deps required).

    An ODT document is a ZIP archive whose ``content.xml`` member holds the
    body.  Paragraph (``text:p``) and heading (``text:h``) elements are
    collected in document order, one output line per element; nested inline
    markup (spans, links) is flattened via ``itertext()``.

    Args:
        file_bytes: raw bytes of the ODT file.

    Returns:
        The extracted text, one paragraph/heading per line.

    Raises:
        zipfile.BadZipFile: if *file_bytes* is not a valid ZIP archive.
        KeyError: if the archive has no ``content.xml`` member.
    """
    _NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    # Match fully-qualified tags: restricting to the ODF text namespace
    # avoids collecting unrelated <p>/<h> elements from other namespaces.
    para_tags = {f"{{{_NS}}}p", f"{{{_NS}}}h"}
    lines: list[str] = []
    with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
        with zf.open("content.xml") as f:
            tree = ET.parse(f)
    # Walk all paragraph/heading elements in document order.
    for elem in tree.iter():
        if elem.tag in para_tags:
            text = "".join(elem.itertext()).strip()
            if text:
                lines.append(text)
    return "\n".join(lines)
# ── Section splitter ────────────────────────────────────────────────────────── # ── Section splitter ──────────────────────────────────────────────────────────
def _split_sections(text: str) -> dict[str, list[str]]: def _split_sections(text: str) -> dict[str, list[str]]:
@ -108,17 +173,36 @@ def _parse_header(lines: list[str]) -> dict:
email_m = _EMAIL_RE.search(full_text) email_m = _EMAIL_RE.search(full_text)
phone_m = _PHONE_RE.search(full_text) phone_m = _PHONE_RE.search(full_text)
# Name heuristic: first non-empty line that has no @ and no digits-only tokens # Name heuristic: first non-empty line that looks like a person's name.
# Handle two common layouts:
# (A) Name on its own line
# (B) "email@example.com Firstname Lastname" on one line
name = "" name = ""
for line in lines[:5]: for line in lines[:8]:
if "@" in line or re.match(r"^\d", line.strip()): stripped = line.strip()
if not stripped:
continue continue
# Skip lines that look like city/state/zip # Layout B: line contains email — extract the part after the email as name
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line): if "@" in stripped:
email_m = _EMAIL_RE.search(stripped)
if email_m:
after = stripped[email_m.end():].strip(" |•,")
after_clean = re.sub(r"\s{2,}", " ", after)
alpha_check = re.sub(r"[.\-'\u2019]", "", after_clean.replace(" ", ""))
if 2 <= len(after_clean.split()) <= 5 and alpha_check.isalpha():
name = after_clean
break
continue continue
candidate = re.sub(r"[|•·,]+", " ", line).strip() # Skip phone/URL/city lines
if re.match(r"^\d", stripped):
continue
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", stripped) or re.search(r"https?://|linkedin|github", stripped, re.I):
continue
# Layout A: plain name line
candidate = re.sub(r"[|•·,]+", " ", stripped).strip()
candidate = re.sub(r"\s{2,}", " ", candidate) candidate = re.sub(r"\s{2,}", " ", candidate)
if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha(): alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha():
name = candidate name = candidate
break break
@ -148,13 +232,27 @@ def _parse_experience(lines: list[str]) -> list[dict]:
if date_match: if date_match:
if current: if current:
entries.append(current) entries.append(current)
# Title/company may be on this line (layout B) or the previous line (layout A) # Title/company extraction — three layouts:
same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•") # (A) Title on prev_line, "Company | Location | Dates" on date line
header = same_line if same_line.strip() else prev_line # (B) "Title | Company" on prev_line, dates on date line (same_line empty)
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1) # (C) "Title | Company | Dates" all on one line
same_line = _DATE_RANGE_RE.sub("", line)
# Remove residual punctuation-only fragments like "()" left after date removal
same_line = re.sub(r"[()[\]{}\s]+$", "", same_line).strip(" –—|-•")
if prev_line and same_line.strip():
# Layout A: title = prev_line, company = first segment of same_line
title = prev_line.strip()
co_part = re.split(r"\s{2,}|[|,]\s*", same_line.strip(), maxsplit=1)[0]
company = co_part.strip()
else:
# Layout B/C: title | company are together (prev_line or same_line)
header = same_line if same_line.strip() else prev_line
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
title = parts[0].strip() if parts else ""
company = parts[1].strip() if len(parts) > 1 else ""
current = { current = {
"title": parts[0].strip() if parts else "", "title": title,
"company": parts[1].strip() if len(parts) > 1 else "", "company": company,
"start_date": date_match.group(1), "start_date": date_match.group(1),
"end_date": date_match.group(2), "end_date": date_match.group(2),
"bullets": [], "bullets": [],