peregrine/scripts/linkedin_utils.py

# scripts/linkedin_utils.py
"""
LinkedIn profile HTML parser.

Extracts structured profile data from a raw LinkedIn public profile page.
No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.

** LinkedIn public profile limitations (2025) **
Unauthenticated requests receive a degraded page where experience titles, past
roles, education detail, and skills are replaced with blur placeholders or omitted
entirely.  Only the following are reliably available without login:
  - Name + headline (top card)
  - About/summary (truncated; login prompt injected after "see more")
  - Current employer name only (no title, dates, or description)
  - Certifications/licenses (if publicly listed)
  - Volunteer experience, publications, projects (if public)
For full profile data use the LinkedIn data export zip path instead.

Selectors target the 2025 LinkedIn public profile DOM.
When LinkedIn changes their markup, update the selector lists here only.
Each section uses ordered fallbacks — first matching selector wins.
"""
from __future__ import annotations
import re
from bs4 import BeautifulSoup

# Noise phrases injected by LinkedIn's login wall — stripped from summary text
_LOGIN_NOISE = re.compile(
    r"see more.*$|welcome back.*$|sign in.*$|by clicking.*$|new to linkedin.*$",
    re.I | re.S,
)

# ── Selector fallback lists ────────────────────────────────────────────────────

_NAME_SELECTORS = [
    "h1.top-card-layout__title",
    "h1[class*='title']",
    ".pv-top-card--list h1",
    "h1",
]

# 2025 DOM: data-section="summary" (not "about")
_SUMMARY_SECTION_SELECTOR = "section[data-section='summary'] .core-section-container__content"
_SUMMARY_SELECTORS = [
    "section[data-section='summary'] .core-section-container__content",
    "section[data-section='about'] .core-section-container__content",
    "section[data-section='about'] .show-more-less-text__text--less",
    "section[data-section='about'] p",
    ".pv-about-section p",
]

# 2025 DOM: experience lives in .visible-list inside .experience-education section.
# Only the current employer h3 is unblurred; past roles use aria-hidden blurred-list.
_EXPERIENCE_ITEM_SELECTORS = [
    "section.experience-education .visible-list li.profile-section-card",
    "section[data-section='experience'] li.experience-item",
    "section[data-section='experience'] li",
    "#experience-section li",
]

_EXP_TITLE_SELECTORS   = ["span.experience-item__title", "span[class*='title']"]
_EXP_COMPANY_SELECTORS = ["h3", "span.experience-item__subtitle", "span[class*='subtitle']"]
_EXP_DATE_SELECTORS    = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
_EXP_DESC_SELECTORS    = [".show-more-less-text__text--less", "p[class*='description']"]

# 2025 DOM: education is also blurred; top-card shows most recent school only
_EDUCATION_ITEM_SELECTORS = [
    "section[data-section='education'] li.education__list-item",
    "section[data-section='education'] li",
    "#education ~ * li",
]

_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"]
_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
_EDU_DATES_SELECTORS  = ["span.education__item--duration", "span[class*='duration']", "time"]

# Skills are not present on the 2025 unauthenticated public profile page
_SKILLS_SELECTORS = [
    "section[data-section='skills'] span.mr1",
    "section[data-section='skills'] li span[class*='bold']",
    "section[data-section='skills'] li span",
    "#skills ~ * li span",
]

# 2025 DOM: certifications use li.profile-section-card with h3 for name
_CERT_ITEM_SELECTORS = [
    "section[data-section='certifications'] li.profile-section-card",
    "section[data-section='certifications'] li",
    "#certifications ~ * li",
    "#licenses_and_certifications ~ * li",
]
_CERT_NAME_SELECTORS = ["h3", "h3.certifications__name", "h3[class*='name']", "span[class*='title']"]


# ── Helpers ───────────────────────────────────────────────────────────────────

def _select_first(soup, selectors):
    for sel in selectors:
        try:
            el = soup.select_one(sel)
            if el and el.get_text(strip=True):
                return el.get_text(strip=True)
        except Exception:
            continue
    return ""


def _select_all(soup, selectors):
    for sel in selectors:
        try:
            els = soup.select(sel)
            if els:
                return els
        except Exception:
            continue
    return []


def _split_bullets(text):
    parts = re.split(r"[•·]\s*|(?<=\s)–\s+|\n+", text)
    return [p.strip() for p in parts if p.strip() and len(p.strip()) > 3]


def _date_range_text(item):
    for sel in _EXP_DATE_SELECTORS:
        try:
            el = item.select_one(sel)
            if el:
                times = [t.get_text(strip=True) for t in el.find_all("time")]
                if times:
                    return " – ".join(times)
                text = el.get_text(strip=True)
                if text:
                    return text
        except Exception:
            continue
    return ""


# ── Public API ────────────────────────────────────────────────────────────────

def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Returns a dict with keys: name, email, phone, linkedin, career_summary,
    experience[], education[], skills[], achievements[]

    Never raises — returns empty values for sections that cannot be parsed.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    name = _select_first(soup, _NAME_SELECTORS)

    # Summary: strip login-wall noise injected after "see more"
    career_summary = ""
    for sel in _SUMMARY_SELECTORS:
        try:
            el = soup.select_one(sel)
            if el:
                raw_text = el.get_text(" ", strip=True)
                career_summary = _LOGIN_NOISE.sub("", raw_text).strip()
                if career_summary:
                    break
        except Exception:
            continue

    experience = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        # Skip blurred items (aria-hidden list shown as decorative background)
        if item.get("aria-hidden") == "true":
            continue
        title   = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        # Skip entries where the title text is pure asterisks (blurred placeholder)
        if title and re.fullmatch(r"[\*\s]+", title):
            title = ""
        dates   = _date_range_text(item)
        desc_el = None
        for sel in _EXP_DESC_SELECTORS:
            try:
                desc_el = item.select_one(sel)
                if desc_el:
                    break
            except Exception:
                continue
        bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else []
        if title or company:
            experience.append({
                "company":    company,
                "title":      title,
                "date_range": dates,
                "bullets":    bullets,
            })

    education = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        dates  = ""
        for sel in _EDU_DATES_SELECTORS:
            try:
                el = item.select_one(sel)
                if el:
                    dates = el.get_text(strip=True)
                    break
            except Exception:
                continue
        if school or degree:
            education.append({
                "school": school,
                "degree": degree,
                "field":  "",
                "dates":  dates,
            })

    skills = [el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS)
              if el.get_text(strip=True)]
    skills = list(dict.fromkeys(skills))

    achievements = []
    for item in _select_all(soup, _CERT_ITEM_SELECTORS):
        label = _select_first(item, _CERT_NAME_SELECTORS)
        if label:
            achievements.append(label)

    return {
        "name":           name,
        "email":          "",
        "phone":          "",
        "linkedin":       "",
        "career_summary": career_summary,
        "experience":     experience,
        "education":      education,
        "skills":         skills,
        "achievements":   achievements,
    }