feat: ODT support, two-column PDF column-split extraction, title/company layout detection hardening

This commit is contained in:
pyr0ball 2026-02-26 10:33:28 -08:00
parent 5af2b20d82
commit 07bdac6302
2 changed files with 119 additions and 21 deletions

View file

@@ -305,17 +305,20 @@ elif step == 4:
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"]) tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
with tab_upload: with tab_upload:
uploaded = st.file_uploader("Upload PDF or DOCX", type=["pdf", "docx"]) uploaded = st.file_uploader("Upload PDF, DOCX, or ODT", type=["pdf", "docx", "odt"])
if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"): if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"):
from scripts.resume_parser import ( from scripts.resume_parser import (
extract_text_from_pdf, extract_text_from_docx, structure_resume, extract_text_from_pdf, extract_text_from_docx,
extract_text_from_odt, structure_resume,
) )
file_bytes = uploaded.read() file_bytes = uploaded.read()
ext = uploaded.name.rsplit(".", 1)[-1].lower() ext = uploaded.name.rsplit(".", 1)[-1].lower()
raw_text = ( if ext == "pdf":
extract_text_from_pdf(file_bytes) if ext == "pdf" raw_text = extract_text_from_pdf(file_bytes)
else extract_text_from_docx(file_bytes) elif ext == "odt":
) raw_text = extract_text_from_odt(file_bytes)
else:
raw_text = extract_text_from_docx(file_bytes)
with st.spinner("Parsing\u2026"): with st.spinner("Parsing\u2026"):
parsed, parse_err = structure_resume(raw_text) parsed, parse_err = structure_resume(raw_text)

View file

@@ -12,7 +12,9 @@ import io
import json import json
import logging import logging
import re import re
import zipfile
from pathlib import Path from pathlib import Path
from xml.etree import ElementTree as ET
import pdfplumber import pdfplumber
from docx import Document from docx import Document
@ -66,9 +68,54 @@ _LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ─────────────────────────────────────────────────────────── # ── Text extraction ───────────────────────────────────────────────────────────
def _find_column_split(page) -> float | None:
"""Return the x-coordinate of the gutter between two columns, or None if single-column.
Finds the largest horizontal gap between word x0 positions in the middle 40%
of the page width that gap is the column gutter.
"""
words = page.extract_words()
if len(words) < 10:
return None
lo, hi = page.width * 0.25, page.width * 0.75
# Collect unique left-edge positions of words that start in the middle band
xs = sorted({int(w["x0"]) for w in words if lo <= w["x0"] <= hi})
if len(xs) < 2:
return None
# Find the biggest consecutive gap
best_gap, split_x = 0.0, None
for i in range(len(xs) - 1):
gap = xs[i + 1] - xs[i]
if gap > best_gap:
best_gap, split_x = gap, (xs[i] + xs[i + 1]) / 2
# Only treat as two-column if the gap is substantial (> 3% of page width)
return split_x if split_x and best_gap > page.width * 0.03 else None
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF, handling two-column layouts via gutter detection.

    For two-column pages the full-width header (name, contact line) is
    extracted separately from the columnar body so a centered header is not
    clipped at the gutter.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Newline-joined text of all pages.
    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        pages: list[str] = []
        for page in pdf.pages:
            w, h = page.width, page.height
            split_x = _find_column_split(page)
            if split_x:
                # Find the y-coordinate where right-column content starts;
                # everything above it belongs to the full-width header.
                words = page.extract_words()
                right_words = [wd for wd in words if wd["x0"] >= split_x]
                col_start_y = min(wd["top"] for wd in right_words) if right_words else 0
                header_text = page.within_bbox((0, 0, w, col_start_y)).extract_text() or ""
                left_text = page.within_bbox((0, col_start_y, split_x, h)).extract_text() or ""
                right_text = page.within_bbox((split_x, col_start_y, w, h)).extract_text() or ""
                # Only trust the split when both columns carry real content;
                # otherwise fall through to plain single-column extraction.
                if len(left_text.strip()) > 60 and len(right_text.strip()) > 60:
                    pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
                    continue
            pages.append(page.extract_text() or "")
    return "\n".join(pages)
@@ -77,6 +124,24 @@ def extract_text_from_docx(file_bytes: bytes) -> str:
return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
def extract_text_from_odt(file_bytes: bytes) -> str:
    """Extract plain text from an ODT file (ZIP + XML, no external deps required).

    An ODT document is a ZIP archive whose ``content.xml`` member holds the
    body; each ``text:p`` (paragraph) and ``text:h`` (heading) element becomes
    one output line, in document order.

    Args:
        file_bytes: Raw bytes of the ODT file.

    Returns:
        Newline-joined paragraph/heading text; empty elements are skipped.

    Raises:
        zipfile.BadZipFile: If *file_bytes* is not a ZIP archive.
        KeyError: If the archive has no ``content.xml`` member.
    """
    # NOTE: removed the unused `_NS` constant — tags are matched by local
    # name below, so the namespace URI was never referenced.
    lines: list[str] = []
    with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
        with zf.open("content.xml") as f:
            tree = ET.parse(f)
    # Walk all text:p and text:h elements, stripping the namespace prefix
    # from each tag before comparing.
    for elem in tree.iter():
        tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
        if tag in ("p", "h"):
            text = "".join(elem.itertext()).strip()
            if text:
                lines.append(text)
    return "\n".join(lines)
# ── Section splitter ────────────────────────────────────────────────────────── # ── Section splitter ──────────────────────────────────────────────────────────
def _split_sections(text: str) -> dict[str, list[str]]: def _split_sections(text: str) -> dict[str, list[str]]:
@@ -108,18 +173,34 @@ def _parse_header(lines: list[str]) -> dict:
email_m = _EMAIL_RE.search(full_text) email_m = _EMAIL_RE.search(full_text)
phone_m = _PHONE_RE.search(full_text) phone_m = _PHONE_RE.search(full_text)
# Name heuristic: first non-empty line that looks like a person's name # Name heuristic: first non-empty line that looks like a person's name.
# Handle two common layouts:
# (A) Name on its own line
# (B) "email@example.com Firstname Lastname" on one line
name = "" name = ""
for line in lines[:5]: for line in lines[:8]:
if "@" in line or re.match(r"^\d", line.strip()): stripped = line.strip()
if not stripped:
continue continue
# Skip lines that look like city/state/zip or URLs # Layout B: line contains email — extract the part after the email as name
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line) or re.search(r"https?://|linkedin|github", line, re.I): if "@" in stripped:
email_m = _EMAIL_RE.search(stripped)
if email_m:
after = stripped[email_m.end():].strip(" |•,")
after_clean = re.sub(r"\s{2,}", " ", after)
alpha_check = re.sub(r"[.\-'\u2019]", "", after_clean.replace(" ", ""))
if 2 <= len(after_clean.split()) <= 5 and alpha_check.isalpha():
name = after_clean
break
continue continue
# Strip separators and credential suffixes (MBA, PhD, etc.) for the alpha check # Skip phone/URL/city lines
candidate = re.sub(r"[|•·,]+", " ", line).strip() if re.match(r"^\d", stripped):
continue
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", stripped) or re.search(r"https?://|linkedin|github", stripped, re.I):
continue
# Layout A: plain name line
candidate = re.sub(r"[|•·,]+", " ", stripped).strip()
candidate = re.sub(r"\s{2,}", " ", candidate) candidate = re.sub(r"\s{2,}", " ", candidate)
# Normalise: remove periods, hyphens for the alpha-only check
alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", "")) alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha(): if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha():
name = candidate name = candidate
@@ -151,13 +232,27 @@ def _parse_experience(lines: list[str]) -> list[dict]:
if date_match: if date_match:
if current: if current:
entries.append(current) entries.append(current)
# Title/company may be on this line (layout B) or the previous line (layout A) # Title/company extraction — three layouts:
same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•") # (A) Title on prev_line, "Company | Location | Dates" on date line
header = same_line if same_line.strip() else prev_line # (B) "Title | Company" on prev_line, dates on date line (same_line empty)
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1) # (C) "Title | Company | Dates" all on one line
same_line = _DATE_RANGE_RE.sub("", line)
# Remove residual punctuation-only fragments like "()" left after date removal
same_line = re.sub(r"[()[\]{}\s]+$", "", same_line).strip(" –—|-•")
if prev_line and same_line.strip():
# Layout A: title = prev_line, company = first segment of same_line
title = prev_line.strip()
co_part = re.split(r"\s{2,}|[|,]\s*", same_line.strip(), maxsplit=1)[0]
company = co_part.strip()
else:
# Layout B/C: title | company are together (prev_line or same_line)
header = same_line if same_line.strip() else prev_line
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
title = parts[0].strip() if parts else ""
company = parts[1].strip() if len(parts) > 1 else ""
current = { current = {
"title": parts[0].strip() if parts else "", "title": title,
"company": parts[1].strip() if len(parts) > 1 else "", "company": company,
"start_date": date_match.group(1), "start_date": date_match.group(1),
"end_date": date_match.group(2), "end_date": date_match.group(2),
"bullets": [], "bullets": [],