Compare commits
No commits in common. "48e7748b43b1ddf297a0e6f0e78c8b29b55564d1" and "ab6d7f2c874a1716537b62b60ce32082cb0c6960" have entirely different histories.
48e7748b43
...
ab6d7f2c87
2 changed files with 27 additions and 147 deletions
|
|
@ -305,48 +305,26 @@ elif step == 4:
|
||||||
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
|
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
|
||||||
|
|
||||||
with tab_upload:
|
with tab_upload:
|
||||||
uploaded = st.file_uploader("Upload PDF, DOCX, or ODT", type=["pdf", "docx", "odt"])
|
uploaded = st.file_uploader("Upload PDF or DOCX", type=["pdf", "docx"])
|
||||||
if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"):
|
if uploaded and st.button("Parse Resume", type="primary", key="parse_resume"):
|
||||||
from scripts.resume_parser import (
|
from scripts.resume_parser import (
|
||||||
extract_text_from_pdf, extract_text_from_docx,
|
extract_text_from_pdf, extract_text_from_docx, structure_resume,
|
||||||
extract_text_from_odt, structure_resume,
|
|
||||||
)
|
)
|
||||||
file_bytes = uploaded.read()
|
file_bytes = uploaded.read()
|
||||||
ext = uploaded.name.rsplit(".", 1)[-1].lower()
|
ext = uploaded.name.rsplit(".", 1)[-1].lower()
|
||||||
if ext == "pdf":
|
raw_text = (
|
||||||
raw_text = extract_text_from_pdf(file_bytes)
|
extract_text_from_pdf(file_bytes) if ext == "pdf"
|
||||||
elif ext == "odt":
|
else extract_text_from_docx(file_bytes)
|
||||||
raw_text = extract_text_from_odt(file_bytes)
|
)
|
||||||
else:
|
|
||||||
raw_text = extract_text_from_docx(file_bytes)
|
|
||||||
with st.spinner("Parsing\u2026"):
|
with st.spinner("Parsing\u2026"):
|
||||||
parsed, parse_err = structure_resume(raw_text)
|
parsed = structure_resume(raw_text)
|
||||||
|
if parsed:
|
||||||
# Diagnostic: show raw extraction + detected fields regardless of outcome
|
|
||||||
with st.expander("🔍 Parse diagnostics", expanded=not bool(parsed and any(
|
|
||||||
parsed.get(k) for k in ("name", "experience", "skills")
|
|
||||||
))):
|
|
||||||
st.caption("**Raw extracted text (first 800 chars)**")
|
|
||||||
st.code(raw_text[:800] if raw_text else "(empty)", language="text")
|
|
||||||
if parsed:
|
|
||||||
st.caption("**Detected fields**")
|
|
||||||
st.json({k: (v[:3] if isinstance(v, list) else v) for k, v in parsed.items()})
|
|
||||||
|
|
||||||
if parsed and any(parsed.get(k) for k in ("name", "experience", "skills")):
|
|
||||||
st.session_state["_parsed_resume"] = parsed
|
st.session_state["_parsed_resume"] = parsed
|
||||||
st.session_state["_raw_resume_text"] = raw_text
|
st.session_state["_raw_resume_text"] = raw_text
|
||||||
_save_yaml({"_raw_resume_text": raw_text[:8000]})
|
_save_yaml({"_raw_resume_text": raw_text[:8000]})
|
||||||
st.success("Parsed! Review the builder tab to edit entries.")
|
st.success("Parsed! Review the builder tab to edit entries.")
|
||||||
elif parsed:
|
|
||||||
# Parsed but empty — show what we got and let them proceed or build manually
|
|
||||||
st.session_state["_parsed_resume"] = parsed
|
|
||||||
st.warning("Resume text was extracted but no fields were recognised. "
|
|
||||||
"Check the diagnostics above — the section headers may use unusual labels. "
|
|
||||||
"You can still fill in the Build tab manually.")
|
|
||||||
else:
|
else:
|
||||||
st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.")
|
st.warning("Auto-parse failed \u2014 switch to the Build tab and add entries manually.")
|
||||||
if parse_err:
|
|
||||||
st.caption(f"Reason: {parse_err}")
|
|
||||||
|
|
||||||
with tab_builder:
|
with tab_builder:
|
||||||
parsed = st.session_state.get("_parsed_resume", {})
|
parsed = st.session_state.get("_parsed_resume", {})
|
||||||
|
|
|
||||||
|
|
@ -12,9 +12,7 @@ import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import zipfile
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from xml.etree import ElementTree as ET
|
|
||||||
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
@ -24,11 +22,11 @@ log = logging.getLogger(__name__)
|
||||||
# ── Section header detection ──────────────────────────────────────────────────
|
# ── Section header detection ──────────────────────────────────────────────────
|
||||||
|
|
||||||
_SECTION_NAMES = {
|
_SECTION_NAMES = {
|
||||||
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary|career summary|career objective|personal statement)\s*:?\s*$", re.I),
|
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
|
||||||
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience|career history|relevant experience|professional history|employment history|positions? held)\s*:?\s*$", re.I),
|
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
|
||||||
"education": re.compile(r"^(education|academic|qualifications|degrees?|educational background|academic background)\s*:?\s*$", re.I),
|
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
|
||||||
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise|areas? of expertise|key skills?|proficiencies|tools? & technologies)\s*:?\s*$", re.I),
|
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
|
||||||
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?|publications?|volunteer)\s*:?\s*$", re.I),
|
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Degrees — used to detect education lines
|
# Degrees — used to detect education lines
|
||||||
|
|
@ -68,54 +66,9 @@ _LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
|
||||||
|
|
||||||
# ── Text extraction ───────────────────────────────────────────────────────────
|
# ── Text extraction ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _find_column_split(page) -> float | None:
|
|
||||||
"""Return the x-coordinate of the gutter between two columns, or None if single-column.
|
|
||||||
|
|
||||||
Finds the largest horizontal gap between word x0 positions in the middle 40%
|
|
||||||
of the page width — that gap is the column gutter.
|
|
||||||
"""
|
|
||||||
words = page.extract_words()
|
|
||||||
if len(words) < 10:
|
|
||||||
return None
|
|
||||||
lo, hi = page.width * 0.25, page.width * 0.75
|
|
||||||
# Collect unique left-edge positions of words that start in the middle band
|
|
||||||
xs = sorted({int(w["x0"]) for w in words if lo <= w["x0"] <= hi})
|
|
||||||
if len(xs) < 2:
|
|
||||||
return None
|
|
||||||
# Find the biggest consecutive gap
|
|
||||||
best_gap, split_x = 0.0, None
|
|
||||||
for i in range(len(xs) - 1):
|
|
||||||
gap = xs[i + 1] - xs[i]
|
|
||||||
if gap > best_gap:
|
|
||||||
best_gap, split_x = gap, (xs[i] + xs[i + 1]) / 2
|
|
||||||
# Only treat as two-column if the gap is substantial (> 3% of page width)
|
|
||||||
return split_x if split_x and best_gap > page.width * 0.03 else None
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
||||||
"""Extract text from PDF, handling two-column layouts via gutter detection.
|
|
||||||
|
|
||||||
For two-column pages, the full-width header (name, contact) is extracted
|
|
||||||
separately from the columnar body to avoid the centered header being clipped.
|
|
||||||
"""
|
|
||||||
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
||||||
pages: list[str] = []
|
pages = [page.extract_text() or "" for page in pdf.pages]
|
||||||
for page in pdf.pages:
|
|
||||||
w, h = page.width, page.height
|
|
||||||
split_x = _find_column_split(page)
|
|
||||||
if split_x:
|
|
||||||
# Find y-coordinate where right-column content starts.
|
|
||||||
# Everything above that belongs to the full-width header.
|
|
||||||
words = page.extract_words()
|
|
||||||
right_words = [wd for wd in words if wd["x0"] >= split_x]
|
|
||||||
col_start_y = min(wd["top"] for wd in right_words) if right_words else 0
|
|
||||||
header_text = page.within_bbox((0, 0, w, col_start_y)).extract_text() or ""
|
|
||||||
left_text = page.within_bbox((0, col_start_y, split_x, h)).extract_text() or ""
|
|
||||||
right_text = page.within_bbox((split_x, col_start_y, w, h)).extract_text() or ""
|
|
||||||
if len(left_text.strip()) > 60 and len(right_text.strip()) > 60:
|
|
||||||
pages.append("\n".join(filter(None, [header_text, left_text, right_text])))
|
|
||||||
continue
|
|
||||||
pages.append(page.extract_text() or "")
|
|
||||||
return "\n".join(pages)
|
return "\n".join(pages)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -124,24 +77,6 @@ def extract_text_from_docx(file_bytes: bytes) -> str:
|
||||||
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
|
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_odt(file_bytes: bytes) -> str:
|
|
||||||
"""Extract plain text from an ODT file (ZIP + XML, no external deps required)."""
|
|
||||||
# ODT is a ZIP archive; content.xml holds the document body
|
|
||||||
_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
|
|
||||||
lines: list[str] = []
|
|
||||||
with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf:
|
|
||||||
with zf.open("content.xml") as f:
|
|
||||||
tree = ET.parse(f)
|
|
||||||
# Walk all text:p and text:h elements in document order
|
|
||||||
for elem in tree.iter():
|
|
||||||
tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
|
|
||||||
if tag in ("p", "h"):
|
|
||||||
text = "".join(elem.itertext()).strip()
|
|
||||||
if text:
|
|
||||||
lines.append(text)
|
|
||||||
return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
# ── Section splitter ──────────────────────────────────────────────────────────
|
# ── Section splitter ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _split_sections(text: str) -> dict[str, list[str]]:
|
def _split_sections(text: str) -> dict[str, list[str]]:
|
||||||
|
|
@ -173,36 +108,17 @@ def _parse_header(lines: list[str]) -> dict:
|
||||||
email_m = _EMAIL_RE.search(full_text)
|
email_m = _EMAIL_RE.search(full_text)
|
||||||
phone_m = _PHONE_RE.search(full_text)
|
phone_m = _PHONE_RE.search(full_text)
|
||||||
|
|
||||||
# Name heuristic: first non-empty line that looks like a person's name.
|
# Name heuristic: first non-empty line that has no @ and no digits-only tokens
|
||||||
# Handle two common layouts:
|
|
||||||
# (A) Name on its own line
|
|
||||||
# (B) "email@example.com Firstname Lastname" on one line
|
|
||||||
name = ""
|
name = ""
|
||||||
for line in lines[:8]:
|
for line in lines[:5]:
|
||||||
stripped = line.strip()
|
if "@" in line or re.match(r"^\d", line.strip()):
|
||||||
if not stripped:
|
|
||||||
continue
|
continue
|
||||||
# Layout B: line contains email — extract the part after the email as name
|
# Skip lines that look like city/state/zip
|
||||||
if "@" in stripped:
|
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
|
||||||
email_m = _EMAIL_RE.search(stripped)
|
|
||||||
if email_m:
|
|
||||||
after = stripped[email_m.end():].strip(" |•,")
|
|
||||||
after_clean = re.sub(r"\s{2,}", " ", after)
|
|
||||||
alpha_check = re.sub(r"[.\-'\u2019]", "", after_clean.replace(" ", ""))
|
|
||||||
if 2 <= len(after_clean.split()) <= 5 and alpha_check.isalpha():
|
|
||||||
name = after_clean
|
|
||||||
break
|
|
||||||
continue
|
continue
|
||||||
# Skip phone/URL/city lines
|
candidate = re.sub(r"[|•·,]+", " ", line).strip()
|
||||||
if re.match(r"^\d", stripped):
|
|
||||||
continue
|
|
||||||
if re.search(r"\b[A-Z]{2}\b\s*\d{5}", stripped) or re.search(r"https?://|linkedin|github", stripped, re.I):
|
|
||||||
continue
|
|
||||||
# Layout A: plain name line
|
|
||||||
candidate = re.sub(r"[|•·,]+", " ", stripped).strip()
|
|
||||||
candidate = re.sub(r"\s{2,}", " ", candidate)
|
candidate = re.sub(r"\s{2,}", " ", candidate)
|
||||||
alpha_check = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
|
if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha():
|
||||||
if 2 <= len(candidate.split()) <= 5 and alpha_check.isalpha():
|
|
||||||
name = candidate
|
name = candidate
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -232,27 +148,13 @@ def _parse_experience(lines: list[str]) -> list[dict]:
|
||||||
if date_match:
|
if date_match:
|
||||||
if current:
|
if current:
|
||||||
entries.append(current)
|
entries.append(current)
|
||||||
# Title/company extraction — three layouts:
|
# Title/company may be on this line (layout B) or the previous line (layout A)
|
||||||
# (A) Title on prev_line, "Company | Location | Dates" on date line
|
same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•")
|
||||||
# (B) "Title | Company" on prev_line, dates on date line (same_line empty)
|
header = same_line if same_line.strip() else prev_line
|
||||||
# (C) "Title | Company | Dates" all on one line
|
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
|
||||||
same_line = _DATE_RANGE_RE.sub("", line)
|
|
||||||
# Remove residual punctuation-only fragments like "()" left after date removal
|
|
||||||
same_line = re.sub(r"[()[\]{}\s]+$", "", same_line).strip(" –—|-•")
|
|
||||||
if prev_line and same_line.strip():
|
|
||||||
# Layout A: title = prev_line, company = first segment of same_line
|
|
||||||
title = prev_line.strip()
|
|
||||||
co_part = re.split(r"\s{2,}|[|,]\s*", same_line.strip(), maxsplit=1)[0]
|
|
||||||
company = co_part.strip()
|
|
||||||
else:
|
|
||||||
# Layout B/C: title | company are together (prev_line or same_line)
|
|
||||||
header = same_line if same_line.strip() else prev_line
|
|
||||||
parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
|
|
||||||
title = parts[0].strip() if parts else ""
|
|
||||||
company = parts[1].strip() if len(parts) > 1 else ""
|
|
||||||
current = {
|
current = {
|
||||||
"title": title,
|
"title": parts[0].strip() if parts else "",
|
||||||
"company": company,
|
"company": parts[1].strip() if len(parts) > 1 else "",
|
||||||
"start_date": date_match.group(1),
|
"start_date": date_match.group(1),
|
||||||
"end_date": date_match.group(2),
|
"end_date": date_match.group(2),
|
||||||
"bullets": [],
|
"bullets": [],
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue