peregrine/scripts/resume_parser.py

"""
Resume parser — extract text from PDF/DOCX/ODT and structure via section parsing.

Primary path: regex + section detection (no LLM, no token limits).
Optional enhancement: LLM-generated career_summary if a capable backend is configured.

Falls back to empty dict on unrecoverable errors — caller shows the form builder.
"""
from __future__ import annotations

import io
import logging
import re
import zipfile
from xml.etree import ElementTree as ET

import pdfplumber
from docx import Document

log = logging.getLogger(__name__)

# Browser print artifact patterns — lines injected when a PDF is printed from a browser
# (print header "MM/DD/YY, H:MM AM/PM <title>" and print footer "file:///... N/N").
# Both alternatives are anchored at the start of the line and are matched
# case-insensitively; _split_sections() tests every stripped line against this
# pattern and discards the ones that match.
_BROWSER_ARTIFACT_RE = re.compile(
    r"^file:///"                                                      # file:// URL footer
    r"|^\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{2}\s+[AP]M\b",       # MM/DD/YY, H:MM AM/PM header
    re.I,
)

# ── Section header detection ──────────────────────────────────────────────────

# Maps a section key to the regex that recognises its heading line.
# Every pattern is anchored with ^...$ so the heading must be the ENTIRE line
# (optionally followed by a colon and whitespace); matching is case-insensitive.
# _split_sections() walks this dict in insertion order and takes the first hit,
# so the order below is also the match priority.
_SECTION_NAMES = {
    "summary":    re.compile(r"^(summary|objective|profile|about me|professional summary|career summary|career objective|personal statement)\s*:?\s*$", re.I),
    "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience|career history|relevant experience|professional history|employment history|positions? held)\s*:?\s*$", re.I),
    "education":  re.compile(r"^(education|academic|qualifications|degrees?|educational background|academic background)\s*:?\s*$", re.I),
    "skills":     re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise|areas? of expertise|key skills?|proficiencies|tools? & technologies)\s*:?\s*$", re.I),
    "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?|publications?|volunteer)\s*:?\s*$", re.I),
    "projects":     re.compile(r"^(projects?|independent development|independent projects?|side projects?|personal projects?|open.?source|portfolio)\s*:?\s*$", re.I),
    "references": re.compile(r"^references?\s*:?\s*$", re.I),
}

# Degrees — used to detect education lines.
# Abbreviations accept optional dots ("BS" and "B.S." both match); the whole
# alternation is wrapped in \b word boundaries and matched case-insensitively.
_DEGREE_RE = re.compile(
    r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
    re.I,
)

# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
# (a range is matched as two separate dates, one per search hit).
# The month name is optional, so any standalone 4-digit number counts as a date.
# The parsers below use this both to test whether a line mentions a date and to
# strip date tokens out of a line.
_DATE_RE = re.compile(
    r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
    r"july|august|september|october|november|december)?\s*\d{4}\b"
    r"|\b\d{1,2}/\d{4}\b",
    re.I,
)
# Date range such as "Jan 2020 - Present", "01/2019–03/2021" or "2019 to 2022".
# group(1) is the start date; group(2) is the end date or one of
# present/current/now. Matching is case-insensitive.
# The separator is either a run of dash characters (-, –, —) or the literal word
# "to". It is written as an alternation rather than a character class: a class
# like [-to] would accept any mix of the letters "t"/"o" and dashes as a separator.
_DATE_RANGE_RE = re.compile(
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r")"
    r"\s*(?:[-–—]+|to)\s*"
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r"|present|current|now"
    r")",
    re.I,
)

# Contact info
# _EMAIL_RE:    "local@domain.tld" — word characters plus . + - before the @,
#               and a domain that must contain at least one dot.
# _PHONE_RE:    ten digits grouped 3-3-4 with optional space / dot / hyphen
#               separators, optional parentheses around the first group and an
#               optional leading "+1".
# _LINKEDIN_RE: "linkedin.com/in/<slug>" (case-insensitive, scheme not required).
_EMAIL_RE    = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
_PHONE_RE    = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)


# ── Text extraction ───────────────────────────────────────────────────────────

def _find_column_split(page) -> float | None:
    """Locate the gutter of a two-column page.

    Looks at the left edges (``x0``, truncated to int) of all words that start
    between 25% and 75% of the page width and finds the widest gap between two
    neighbouring edges.

    Returns the x-coordinate halfway across that gap, or ``None`` when the page
    has fewer than 10 words, fewer than two distinct edges in the band, or a
    widest gap of no more than 3% of the page width (treated as single-column).
    """
    words = page.extract_words()
    if len(words) < 10:
        return None

    band_start = page.width * 0.25
    band_end = page.width * 0.75
    edges = sorted({int(word["x0"]) for word in words if band_start <= word["x0"] <= band_end})
    if len(edges) < 2:
        return None

    # max() keeps the first pair on ties, i.e. the left-most widest gap.
    left, right = max(zip(edges, edges[1:]), key=lambda pair: pair[1] - pair[0])
    widest_gap = right - left
    midpoint = (left + right) / 2

    # A gutter must be wider than 3% of the page to count as a column break.
    if midpoint and widest_gap > page.width * 0.03:
        return midpoint
    return None


_CID_BULLETS = {127, 149, 183}  # common bullet CIDs across ATS-reembedded fonts


def _clean_cid(text: str) -> str:
    """Remove ``(cid:NNN)`` glyph placeholders from extracted text.

    pdfplumber emits these when a PDF font has no ToUnicode map. A placeholder
    whose number is in ``_CID_BULLETS`` becomes '•'; every other placeholder is
    deleted so the section parsers see clean text.
    """
    return re.sub(
        r"\(cid:(\d+)\)",
        lambda match: "•" if int(match.group(1)) in _CID_BULLETS else "",
        text,
    )


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of a PDF, reading two-column pages column by column.

    For each page a column gutter is looked up with ``_find_column_split``.
    When one is found, the page is cut into three regions: a full-width header
    above the first right-column word, then the left and right columns below
    it. The regions are emitted in that order. The split is only used when both
    columns hold more than 60 non-blank characters; otherwise the page falls
    back to plain ``extract_text()``.

    Page texts are joined with newlines and passed through ``_clean_cid``.
    """
    page_texts: list[str] = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            width, height = page.width, page.height
            columnar = None
            gutter_x = _find_column_split(page)
            if gutter_x:
                # The top of the highest right-column word marks where the
                # columnar body begins; everything above it is the header.
                right_tops = [wd["top"] for wd in page.extract_words() if wd["x0"] >= gutter_x]
                body_top = min(right_tops) if right_tops else 0
                header = page.within_bbox((0, 0, width, body_top)).extract_text() or ""
                left = page.within_bbox((0, body_top, gutter_x, height)).extract_text() or ""
                right = page.within_bbox((gutter_x, body_top, width, height)).extract_text() or ""
                if len(left.strip()) > 60 and len(right.strip()) > 60:
                    columnar = "\n".join(part for part in (header, left, right) if part)
            if columnar is None:
                columnar = page.extract_text() or ""
            page_texts.append(columnar)
    return _clean_cid("\n".join(page_texts))


def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Only ``Document.paragraphs`` is read. The joined text is passed through
    ``_clean_cid`` before being returned.
    """
    document = Document(io.BytesIO(file_bytes))
    non_blank = [para.text for para in document.paragraphs if para.text.strip()]
    return _clean_cid("\n".join(non_blank))


def extract_text_from_odt(file_bytes: bytes) -> str:
    """Extract plain text from an ODT file (ZIP + XML, no external deps required).

    An ODT is a ZIP archive whose ``content.xml`` holds the document body.
    Every ``text:p`` and ``text:h`` element is emitted as one line, in document
    order. Within a paragraph the ODF whitespace elements are expanded instead
    of dropped: ``text:tab`` becomes a tab, ``text:line-break`` a newline and
    ``text:s`` the number of spaces given by its ``text:c`` attribute
    (default 1). The downstream parsers rely on tabs, runs of spaces and line
    breaks to separate titles, companies and dates.

    Args:
        file_bytes: Raw bytes of the ``.odt`` file.

    Returns:
        The paragraphs joined with newlines, passed through ``_clean_cid``.

    Raises:
        zipfile.BadZipFile: if the bytes are not a ZIP archive.
        KeyError: if the archive has no ``content.xml``.
        xml.etree.ElementTree.ParseError: if ``content.xml`` is not well-formed.
    """
    text_ns = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    block_tags = {f"{{{text_ns}}}p", f"{{{text_ns}}}h"}
    tab_tag = f"{{{text_ns}}}tab"
    break_tag = f"{{{text_ns}}}line-break"
    space_tag = f"{{{text_ns}}}s"
    count_attr = f"{{{text_ns}}}c"

    def _flatten(elem) -> str:
        """Concatenate the text under *elem*, expanding ODF whitespace elements."""
        parts = [elem.text or ""]
        for child in elem:
            if child.tag == tab_tag:
                parts.append("\t")
            elif child.tag == break_tag:
                parts.append("\n")
            elif child.tag == space_tag:
                count = child.get(count_attr, "1")
                # A malformed repeat count falls back to a single space.
                parts.append(" " * (int(count) if count.isdecimal() else 1))
            else:
                parts.append(_flatten(child))
            parts.append(child.tail or "")
        return "".join(parts)

    with zipfile.ZipFile(io.BytesIO(file_bytes)) as zf, zf.open("content.xml") as f:
        tree = ET.parse(f)

    lines: list[str] = []
    for elem in tree.iter():
        # Only paragraphs/headings in the ODF text namespace are body text.
        if elem.tag in block_tags:
            text = _flatten(elem).strip()
            if text:
                lines.append(text)
    return _clean_cid("\n".join(lines))


# ── Section splitter ──────────────────────────────────────────────────────────

def _split_sections(text: str) -> dict[str, list[str]]:
    """Bucket resume lines under the section heading they follow.

    Lines are stripped; blank lines and browser print artifacts are skipped.
    A line of at most five words that matches one of the ``_SECTION_NAMES``
    patterns switches the active section and is not stored itself. Every other
    line is appended to the active section, which starts as ``'header'``
    (assumed to be the name/contact block).

    A section key appears in the result only once a content line has been
    stored under it; ``'header'`` is always present.
    """
    buckets: dict[str, list[str]] = {"header": []}
    active = "header"
    for raw in text.splitlines():
        entry = raw.strip()
        if not entry or _BROWSER_ARTIFACT_RE.match(entry):
            continue

        heading = None
        # Headings are short; longer lines are never treated as one.
        if len(entry.split()) <= 5:
            heading = next(
                (key for key, rx in _SECTION_NAMES.items() if rx.match(entry)),
                None,
            )

        if heading is None:
            buckets.setdefault(active, []).append(entry)
        else:
            active = heading
    return buckets


# ── Contact info ──────────────────────────────────────────────────────────────

def _is_plausible_name(candidate: str) -> bool:
    """Return True if *candidate* is 2–5 words made only of letters plus . - ' ’."""
    letters = re.sub(r"[.\-'\u2019]", "", candidate.replace(" ", ""))
    return 2 <= len(candidate.split()) <= 5 and letters.isalpha()


def _parse_header(lines: list[str]) -> dict:
    """Extract name, email, phone from the top-of-resume block.

    Email and phone are the first matches of ``_EMAIL_RE`` / ``_PHONE_RE``
    anywhere in *lines*. The name is looked for in the first eight lines only
    and supports two layouts:

      (A) the name on a line of its own
      (B) "email@example.com Firstname Lastname" on one line

    Args:
        lines: Stripped lines of the header section.

    Returns:
        A dict with keys ``name``, ``email`` and ``phone``; a value is ``""``
        when nothing was found.
    """
    full_text = "\n".join(lines)
    email_m = _EMAIL_RE.search(full_text)
    phone_m = _PHONE_RE.search(full_text)

    name = ""
    for line in lines[:8]:
        stripped = line.strip()
        if not stripped:
            continue
        # Layout B: line contains an email — the text after it may be the name.
        if "@" in stripped:
            # Use a separate variable: reusing email_m here would overwrite the
            # address found above with None whenever a line has an "@" but no
            # valid email (e.g. a social-media handle).
            line_email_m = _EMAIL_RE.search(stripped)
            if line_email_m:
                after = stripped[line_email_m.end():].strip(" |•,")
                after_clean = re.sub(r"\s{2,}", " ", after)
                if _is_plausible_name(after_clean):
                    name = after_clean
                    break
            continue
        # Skip lines starting with a digit (phone numbers, street addresses).
        if re.match(r"^\d", stripped):
            continue
        # Skip "ST 12345"-style city/zip lines and URL / profile lines.
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", stripped) or re.search(r"https?://|linkedin|github", stripped, re.I):
            continue
        # Layout A: plain name line; separators are treated as spaces.
        candidate = re.sub(r"[|•·,]+", " ", stripped).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        if _is_plausible_name(candidate):
            name = candidate
            break

    return {
        "name":  name,
        "email": email_m.group(0) if email_m else "",
        "phone": phone_m.group(0) if phone_m else "",
    }


# ── Experience ────────────────────────────────────────────────────────────────

def _parse_experience(lines: list[str]) -> list[dict]:
    """Parse work experience entries from section lines.

    A new entry starts at every line that contains a date range
    (``_DATE_RANGE_RE``). Handles three common layouts:
      (A) Title | Company          (B) Title | Company | Dates
          Dates                        • bullet
          • bullet
      (C) Title\tDates             (tab-separated, common in DOCX exports)
          Company | Location
          • bullet

    Lines that follow a date line are collected into the entry's ``bullets``
    (with any leading bullet marker removed) unless they are recognised as the
    entry's company line or as the header of the next entry.

    Returns:
        One dict per entry, in document order, with keys ``title``,
        ``company``, ``start_date``, ``end_date`` (strings) and ``bullets``
        (list of strings).
    """
    entries: list[dict] = []
    current: dict | None = None
    # Last non-bullet line that may turn out to be the title/company header of
    # the entry whose date line has not been reached yet.
    prev_line = ""
    seen_bullets = False  # True once we've appended the first bullet to current

    for line in lines:
        date_match = _DATE_RANGE_RE.search(line)
        if date_match:
            # A date range closes the entry in progress and opens a new one.
            if current:
                entries.append(current)
            # Title/company extraction — depends on what surrounds the dates:
            #  - text on prev_line AND text next to the dates:
            #        title = prev_line, company = first segment next to the dates
            #  - text in only one of the two places:
            #        that text is "Title <sep> Company" and is split once
            same_line = _DATE_RANGE_RE.sub("", line)
            # Remove residual punctuation-only fragments like "()" left after date removal
            same_line = re.sub(r"[()[\]{}\s]+$", "", same_line).strip(" –—|-•")
            # prev_line is only non-empty here if it was not bullet text
            # (it is cleared whenever a bullet is appended).
            if prev_line and same_line.strip():
                # title = prev_line, company = first segment of same_line
                # (segments are separated by 2+ spaces, "|" or ",")
                title   = prev_line.strip()
                co_part = re.split(r"\s{2,}|[|,]\s*", same_line.strip(), maxsplit=1)[0]
                company = co_part.strip()
            else:
                # title and company are together (on same_line, else on prev_line)
                header = same_line if same_line.strip() else prev_line
                parts  = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
                title   = parts[0].strip() if parts else ""
                company = parts[1].strip() if len(parts) > 1 else ""
            current = {
                "title":      title,
                "company":    company,
                "start_date": date_match.group(1),
                "end_date":   date_match.group(2),
                "bullets":    [],
            }
            prev_line = ""
            seen_bullets = False
        elif current is not None:
            is_bullet = bool(re.match(r"^[•\-–—*◦▪▸►]\s*", line))

            # Layout C: company/location on the line immediately after the date line,
            # before any bullets. Short non-date line = company, not a next-job header.
            if (not is_bullet and not seen_bullets and not current["company"]
                    and not _DATE_RE.search(line) and len(line.strip()) < 80):
                co_part = re.split(r"\s{2,}|[|,]\s*", line.strip(), maxsplit=1)[0]
                current["company"] = co_part.strip()
                prev_line = ""
                continue

            # An unbulleted, date-free line containing " | " is assumed to be
            # "Title | Company" for the NEXT entry rather than bullet text.
            looks_like_header = (
                not is_bullet
                and " | " in line
                and not _DATE_RE.search(line)
            )
            if looks_like_header:
                # Likely the title/company of the next entry — hold it as prev_line
                prev_line = line
            else:
                # Everything else (bulleted or not) is stored as a bullet.
                clean = re.sub(r"^[•\-–—*◦▪▸►]\s*", "", line).strip()
                if clean:
                    current["bullets"].append(clean)
                    seen_bullets = True
                # Clear prev_line after non-header content so the next date match
                # doesn't mistake a bullet as a job title (Layout A false-positive).
                prev_line = ""
        else:
            # No entry started yet: remember the line as a possible title line.
            prev_line = line

    if current:
        entries.append(current)

    return entries


# ── Education ─────────────────────────────────────────────────────────────────

# Keywords that mark a line as naming an educational institution (whole word,
# case-insensitive). Used by the no-degree fallback in _parse_education().
_INSTITUTION_RE = re.compile(r"\b(university|college|institute|school|academy)\b", re.I)


def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries.

    Primary path: degree keyword detected (B.S., Master, etc.)
    Fallback path: year range detected without a degree keyword — handles resumes
    with courses, programmes, or non-degree study (e.g. "San Jose State University  2005-2006").

    A new entry starts at each line that triggers one of the two paths; the
    lines that follow fill in whichever of ``institution`` / ``field`` is still
    empty, in that order.

    Returns:
        One dict per entry, in document order, with string keys
        ``institution``, ``degree``, ``field`` and ``graduation_year``
        (``""`` when not found). ``graduation_year`` is the first 19xx/20xx
        year on the line that started the entry.
    """
    entries: list[dict] = []
    current: dict | None = None
    # Most recent line not yet consumed by an entry; candidate institution
    # name for the entry that starts on the following line.
    prev_line = ""

    for line in lines:
        has_degree = bool(_DEGREE_RE.search(line))
        date_range = _DATE_RANGE_RE.search(line)
        has_year   = bool(re.search(r"\b(19|20)\d{2}\b", line))

        if has_degree or (has_year and date_range):
            # This line opens a new entry; flush the one in progress.
            if current:
                entries.append(current)
            current = {"institution": "", "degree": "", "field": "", "graduation_year": ""}

            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            if year_m:
                current["graduation_year"] = year_m.group(0)

            if has_degree:
                degree_m = _DEGREE_RE.search(line)
                if degree_m:
                    current["degree"] = degree_m.group(0).upper()
                # Whatever is left once dates, years and the degree keyword are
                # removed is taken as the field of study.
                remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
                remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
                current["field"] = remainder.strip(" ,–—|•.")
                # The preceding line is the institution unless it is itself a
                # degree or date line.
                if prev_line and not _DEGREE_RE.search(prev_line) and not _DATE_RE.search(prev_line):
                    current["institution"] = prev_line.strip(" ,–—|•")
            else:
                # Fallback: year-range line without a degree keyword.
                # Two layouts:
                #   (A) PDF: "Graphic Design, 2005–2006" with institution on prev_line
                #   (B) DOCX: "San Jose State University\t2005-2006" — institution on same line
                same = _DATE_RANGE_RE.sub("", line)
                same = re.sub(r"\b(19|20)\d{2}\b", "", same).strip(" ,–—|•\t")
                prev_clean = prev_line.strip(" ,–—|•") if prev_line else ""

                if same and _INSTITUTION_RE.search(prev_clean):
                    # Layout A: institution on prev_line (e.g. "San Jose State University")
                    current["institution"] = prev_clean
                    current["field"] = same
                elif same:
                    # Layout B: institution embedded on same line as year
                    current["institution"] = same
                elif prev_clean:
                    # Line held only the dates: fall back to the previous line.
                    current["institution"] = prev_clean

            prev_line = ""  # consumed; prevent leaking into the next entry

        elif current is not None:
            # Continuation line: fill the first empty slot of the open entry.
            clean = line.strip(" ,–—|•\t")
            if clean:
                if not current["institution"]:
                    current["institution"] = clean
                elif not current["field"]:
                    current["field"] = clean
                    prev_line = ""  # field consumed — don't seed the next entry
                    continue
            prev_line = line.strip()

        else:
            # Before the first entry: remember the line as a possible institution.
            prev_line = line.strip()

    if current:
        entries.append(current)

    return entries


# ── Skills ────────────────────────────────────────────────────────────────────

def _split_skill_tokens(line: str) -> list[str]:
    """Split a skills line on delimiters, but not on commas inside parentheses.

    Splits on |, •, ·, tab first (always separators), then on comma only when
    paren depth is zero — so "CRM Ticketing (Jira, Salesforce)" stays intact.

    An unmatched ")" (e.g. the list marker in "1) Python, Java") never pushes
    the depth below zero, so the commas after it still split.

    Args:
        line: One raw line from the skills section.

    Returns:
        The tokens in order, not stripped; empty strings are kept, so the
        caller is expected to trim and filter.
    """
    tokens: list[str] = []
    for part in re.split(r"[|•·\t]+", line):
        depth = 0
        start = 0  # index in `part` where the current token begins
        for idx, ch in enumerate(part):
            if ch == "(":
                depth += 1
            elif ch == ")":
                # Clamp at zero so a stray ")" cannot disable comma splitting.
                depth = max(depth - 1, 0)
            elif ch == "," and depth == 0:
                tokens.append(part[start:idx])
                start = idx + 1
        tokens.append(part[start:])
    return tokens


def _parse_skills(lines: list[str]) -> list[str]:
    """Turn the skills-section lines into a flat list of skill strings.

    Each line is tokenised with ``_split_skill_tokens``. Tokens are trimmed of
    surrounding whitespace and bullet/dash markers (parentheses are kept, since
    many skills contain them, e.g. "C++ (Arduino / Embedded)") and retained
    only when 2 to 60 characters long. Order is preserved; duplicates are kept.
    """
    trimmed = (
        token.strip(" -–—*◦▪▸►")
        for line in lines
        for token in _split_skill_tokens(line)
    )
    return [skill for skill in trimmed if 1 < len(skill) <= 60]


# ── Main parser ───────────────────────────────────────────────────────────────

def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Build a structured resume dict from raw text using section detection + regex.

    Returns:
        ``(result, error)``. On success ``error`` is ``""`` and ``result`` holds
        the contact fields from the header plus ``career_summary``,
        ``experience``, ``education``, ``skills`` and ``achievements``. When the
        text is blank, or any parser raises, ``result`` is ``{}`` and ``error``
        describes the problem (exceptions are logged with their traceback).
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."

    try:
        sections = _split_sections(raw_text)
        parsed = dict(_parse_header(sections.get("header", [])))
        parsed["career_summary"] = " ".join(sections.get("summary", []))
        parsed["experience"] = _parse_experience(sections.get("experience", []))
        parsed["education"] = _parse_education(sections.get("education", []))
        parsed["skills"] = _parse_skills(sections.get("skills", []))
        parsed["achievements"] = sections.get("achievements", [])
    except Exception as exc:
        import traceback
        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(exc)
    return parsed, ""


# ── LLM enhancement (career summary only, optional) ──────────────────────────

def _llm_career_summary(raw_text: str) -> str:
    """Ask the LLM router for a short career summary of the resume.

    Only the first 1500 characters of *raw_text* are sent. Any exception —
    including a failed import of the router — results in ``""``.
    """
    try:
        from scripts.llm_router import LLMRouter

        excerpt = raw_text[:1500]
        prompt = "".join([
            "Write a 2-3 sentence professional career summary for this candidate ",
            "based on their resume. Return only the summary text, no labels.\n\n",
            f"Resume:\n{excerpt}",
        ])
        router = LLMRouter()
        return router.complete(prompt)
    except Exception:
        return ""


# ── Public entry point ────────────────────────────────────────────────────────

def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Parse a resume and, if it has no summary section, try an LLM-written one.

    Returns:
        ``(result, error)``. When ``parse_resume`` yields an empty result its
        return value is passed through unchanged. Otherwise ``error`` is ``""``
        and ``result["career_summary"]`` is filled from ``_llm_career_summary``
        when the document did not provide one and the LLM returned text.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error

    # The document already has a summary: nothing to enhance.
    if parsed.get("career_summary"):
        return parsed, ""

    try:
        generated = _llm_career_summary(raw_text)
    except Exception:
        generated = ""
    if generated:
        parsed["career_summary"] = generated.strip()

    return parsed, ""