feat(linkedin): add HTML parser utils with fixture tests

2026-03-13 01:01:05 -07:00 · 2026-03-13 01:01:05 -07:00 · a43e29e50d
commit a43e29e50d
parent d703bebb5e
3 changed files with 377 additions and 0 deletions
--- a/scripts/linkedin_utils.py
+++ b/scripts/linkedin_utils.py
@ -0,0 +1,194 @@
+# scripts/linkedin_utils.py
+"""
+LinkedIn profile HTML parser.
+
+Extracts structured profile data from a raw LinkedIn public profile page.
+No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.
+
+Selectors target the 2024-2025 LinkedIn public profile DOM.
+When LinkedIn changes their markup, update the selector lists here only.
+Each section uses ordered fallbacks — first matching selector wins.
+"""
+from __future__ import annotations
+import re
+from bs4 import BeautifulSoup
+
+
+# ── Selector fallback lists ────────────────────────────────────────────────────
+
+# Profile name heading. Ordered most specific first; the bare ``h1`` is the
+# last resort when no class-based selector yields text.
+_NAME_SELECTORS = [
+    "h1.top-card-layout__title",
+    "h1[class*='title']",
+    ".pv-top-card--list h1",
+    "h1",
+]
+
+# "About" text; parse_html returns it under ``career_summary``.
+_SUMMARY_SELECTORS = [
+    "section[data-section='about'] .show-more-less-text__text--less",
+    "section[data-section='about'] p",
+    "#about ~ * p.show-more-less-text__text--less",
+    ".pv-about-section p",
+]
+
+# One element per experience entry. The later, broader fallbacks match any
+# <li> in the section, so parse_html drops items with neither title nor company.
+_EXPERIENCE_ITEM_SELECTORS = [
+    "section[data-section='experience'] li.experience-item",
+    "section[data-section='experience'] li",
+    "#experience-section li",
+    "#experience ~ * li",
+]
+
+# Field selectors applied to a single experience item, not the whole page.
+_EXP_TITLE_SELECTORS   = ["span.experience-item__title", "span[class*='title']", "h3"]
+_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"]
+# Consumed by _date_range_text, which prefers nested <time> elements.
+_EXP_DATE_SELECTORS    = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
+# The matched element's text is split into bullets by _split_bullets.
+_EXP_DESC_SELECTORS    = [".show-more-less-text__text--less", "p[class*='description']", "p"]
+
+# One element per education entry.
+_EDUCATION_ITEM_SELECTORS = [
+    "section[data-section='education'] li.education__list-item",
+    "section[data-section='education'] li",
+    "#education ~ * li",
+]
+
+# Field selectors applied to a single education item.
+_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"]
+_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
+_EDU_DATES_SELECTORS  = ["span.education__item--duration", "span[class*='duration']", "time"]
+
+# One element per skill name; parse_html de-duplicates the resulting texts
+# while keeping first-seen order.
+_SKILLS_SELECTORS = [
+    "section[data-section='skills'] span.mr1",
+    "section[data-section='skills'] li span[class*='bold']",
+    "section[data-section='skills'] li span",
+    "#skills ~ * li span",
+]
+
+# One element per certification; parse_html returns their names under
+# ``achievements``.
+_CERT_ITEM_SELECTORS = [
+    "section[data-section='certifications'] li",
+    "#certifications ~ * li",
+    "#licenses_and_certifications ~ * li",
+]
+# Name selectors applied to a single certification item.
+_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"]
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _select_first(soup, selectors):
+    """Return the stripped text of the first selector whose match has text.
+
+    Selectors are tried in order against ``soup`` (any object exposing
+    ``select_one``). A selector that raises, matches nothing, or matches
+    an element with no text is skipped. Returns "" when none qualifies.
+    """
+    for selector in selectors:
+        try:
+            match = soup.select_one(selector)
+            text = match.get_text(strip=True) if match else ""
+        except Exception:
+            # Best-effort parsing: a bad selector must not abort the fallbacks.
+            continue
+        if text:
+            return text
+    return ""
+
+
+def _select_all(soup, selectors):
+    """Return the matches of the first selector that matches anything.
+
+    Selectors are tried in order via ``soup.select``; one that raises or
+    matches nothing is skipped. Returns [] when every selector comes up empty.
+    """
+    for selector in selectors:
+        try:
+            matches = soup.select(selector)
+        except Exception:
+            # Best-effort parsing: move on to the next fallback.
+            continue
+        if matches:
+            return matches
+    return []
+
+
+def _split_bullets(text):
+    """Split a description into bullet strings.
+
+    Breaks on bullet/middle-dot characters, on an en dash that follows
+    whitespace, and on newlines. Fragments of three characters or fewer
+    (after stripping) are discarded.
+    """
+    bullets = []
+    for fragment in re.split(r"[•·]\s*|(?<=\s)–\s+|\n+", text):
+        cleaned = fragment.strip()
+        if len(cleaned) > 3:
+            bullets.append(cleaned)
+    return bullets
+
+
+def _date_range_text(item):
+    """Return the date range text of one experience item, or "" if none.
+
+    For the first date selector that matches: when the element contains
+    <time> children their texts are joined with " – "; otherwise the
+    element's own text is used. An element with neither falls through to
+    the next selector.
+    """
+    for selector in _EXP_DATE_SELECTORS:
+        try:
+            container = item.select_one(selector)
+            if not container:
+                continue
+            stamps = [node.get_text(strip=True) for node in container.find_all("time")]
+            if stamps:
+                return " – ".join(stamps)
+            plain = container.get_text(strip=True)
+            if plain:
+                return plain
+        except Exception:
+            # Best-effort parsing: try the next fallback selector.
+            continue
+    return ""
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+def parse_html(raw_html: str) -> dict:
+    """
+    Extract structured profile data from a raw LinkedIn public profile HTML page.
+
+    Args:
+        raw_html: Markup of the profile page. ``None`` or "" is treated as an
+            empty document.
+
+    Returns:
+        A dict with keys: name, email, phone, linkedin, career_summary,
+        experience[], education[], skills[], achievements[].
+        ``email``, ``phone`` and ``linkedin`` are always "" and every education
+        entry's ``field`` is always "" — this parser does not extract them.
+
+    Sections that cannot be parsed yield empty values instead of raising.
+    """
+    # Function-scope import: only needed for the parser fallback below.
+    from bs4 import FeatureNotFound
+
+    markup = raw_html or ""
+    try:
+        soup = BeautifulSoup(markup, "lxml")
+    except FeatureNotFound:
+        # lxml is a separate install; without it BeautifulSoup raises, which
+        # would break the "empty values, not errors" contract. The stdlib
+        # parser is always available.
+        soup = BeautifulSoup(markup, "html.parser")
+
+    name = _select_first(soup, _NAME_SELECTORS)
+    career_summary = _select_first(soup, _SUMMARY_SELECTORS)
+
+    experience = []
+    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
+        title = _select_first(item, _EXP_TITLE_SELECTORS)
+        company = _select_first(item, _EXP_COMPANY_SELECTORS)
+        if not (title or company):
+            # The broader item selectors match any <li>; skip non-role items.
+            continue
+        # Keep the element rather than its text so child nodes can be joined
+        # with a space before bullet splitting.
+        desc_el = None
+        for sel in _EXP_DESC_SELECTORS:
+            try:
+                desc_el = item.select_one(sel)
+            except Exception:
+                continue
+            if desc_el:
+                break
+        bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else []
+        experience.append({
+            "company":    company,
+            "title":      title,
+            "date_range": _date_range_text(item),
+            "bullets":    bullets,
+        })
+
+    education = []
+    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
+        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
+        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
+        if school or degree:
+            education.append({
+                "school": school,
+                "degree": degree,
+                "field":  "",
+                # _select_first skips matches with no text, so a blank
+                # duration span does not mask a later fallback selector.
+                "dates":  _select_first(item, _EDU_DATES_SELECTORS),
+            })
+
+    # Extract each skill's text once, drop blanks, then de-duplicate while
+    # preserving first-seen order.
+    skill_texts = [el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS)]
+    skills = list(dict.fromkeys(text for text in skill_texts if text))
+
+    achievements = []
+    for item in _select_all(soup, _CERT_ITEM_SELECTORS):
+        label = _select_first(item, _CERT_NAME_SELECTORS)
+        if label:
+            achievements.append(label)
+
+    return {
+        "name":           name,
+        "email":          "",
+        "phone":          "",
+        "linkedin":       "",
+        "career_summary": career_summary,
+        "experience":     experience,
+        "education":      education,
+        "skills":         skills,
+        "achievements":   achievements,
+    }
--- a/tests/fixtures/linkedin_profile.html
+++ b/tests/fixtures/linkedin_profile.html
@ -0,0 +1,110 @@
+<!-- tests/fixtures/linkedin_profile.html -->
+<!DOCTYPE html>
+<html>
+<head><title>Alan Weinstock | LinkedIn</title></head>
+<body>
+  <!-- Name and headline -->
+  <div class="top-card-layout__entity-info">
+    <h1 class="top-card-layout__title">Alan Weinstock</h1>
+    <h2 class="top-card-layout__headline">Staff Engineer · Open to Work</h2>
+  </div>
+
+  <!-- About / Summary -->
+  <section data-section="about">
+    <div class="core-section-container__content">
+      <p class="show-more-less-text__text--less">
+        Experienced engineer with 10 years in embedded systems and DevOps.
+        Passionate about open-source and accessibility tooling.
+      </p>
+    </div>
+  </section>
+
+  <!-- Experience -->
+  <section data-section="experience">
+    <ul>
+      <li class="experience-item">
+        <div class="experience-item__info">
+          <span class="experience-item__title">Staff Engineer</span>
+          <span class="experience-item__subtitle">Acme Corp</span>
+          <span class="experience-item__duration">
+            <span class="date-range">
+              <time>Jan 2022</time>
+              <time>Present</time>
+            </span>
+          </span>
+        </div>
+        <div class="experience-item__description">
+          <p class="show-more-less-text__text--less">
+            Led migration of monolith to microservices. &bull;
+            Reduced p99 latency by 40%. &bull;
+            Mentored three junior engineers.
+          </p>
+        </div>
+      </li>
+      <li class="experience-item">
+        <div class="experience-item__info">
+          <span class="experience-item__title">Senior Engineer</span>
+          <span class="experience-item__subtitle">Beta Industries</span>
+          <span class="experience-item__duration">
+            <span class="date-range">
+              <time>Mar 2019</time>
+              <time>Dec 2021</time>
+            </span>
+          </span>
+        </div>
+        <div class="experience-item__description">
+          <p class="show-more-less-text__text--less">
+            Designed CI/CD pipeline. &bull; Maintained Kubernetes clusters.
+          </p>
+        </div>
+      </li>
+    </ul>
+  </section>
+
+  <!-- Education -->
+  <section data-section="education">
+    <ul>
+      <li class="education__list-item">
+        <div class="education__item--degree-info">
+          <h3 class="education__school-name">State University</h3>
+          <span class="education__item--degree-name">B.S. Computer Science</span>
+          <span class="education__item--duration">2010 – 2014</span>
+        </div>
+      </li>
+    </ul>
+  </section>
+
+  <!-- Skills -->
+  <section data-section="skills">
+    <ul>
+      <li class="skills-section__list-item">
+        <div class="skills-section__skill">
+          <span class="mr1 t-bold">Python</span>
+        </div>
+      </li>
+      <li class="skills-section__list-item">
+        <div class="skills-section__skill">
+          <span class="mr1 t-bold">Kubernetes</span>
+        </div>
+      </li>
+      <li class="skills-section__list-item">
+        <div class="skills-section__skill">
+          <span class="mr1 t-bold">PostgreSQL</span>
+        </div>
+      </li>
+    </ul>
+  </section>
+
+  <!-- Certifications -->
+  <section data-section="certifications">
+    <ul>
+      <li class="certifications__list-item">
+        <h3 class="certifications__name">AWS Solutions Architect – Associate</h3>
+      </li>
+      <li class="certifications__list-item">
+        <h3 class="certifications__name">CKA: Certified Kubernetes Administrator</h3>
+      </li>
+    </ul>
+  </section>
+</body>
+</html>
--- a/tests/test_linkedin_utils.py
+++ b/tests/test_linkedin_utils.py
@ -0,0 +1,73 @@
+# tests/test_linkedin_utils.py
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# The fixture contains non-ASCII characters (·, –); decode explicitly so the
+# tests do not depend on the platform's locale encoding.
+FIXTURE = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text(encoding="utf-8")
+
+
+def test_parse_html_name():
+    """The fixture's top-card heading is returned as the profile name."""
+    from scripts.linkedin_utils import parse_html
+    profile = parse_html(FIXTURE)
+    parsed_name = profile["name"]
+    assert parsed_name == "Alan Weinstock"
+
+
+def test_parse_html_summary():
+    """The About paragraph ends up in ``career_summary``."""
+    from scripts.linkedin_utils import parse_html
+    summary = parse_html(FIXTURE)["career_summary"]
+    assert "embedded systems" in summary
+
+
+def test_parse_html_experience_count():
+    """Both experience items in the fixture are extracted."""
+    from scripts.linkedin_utils import parse_html
+    roles = parse_html(FIXTURE)["experience"]
+    assert len(roles) == 2
+
+
+def test_parse_html_experience_fields():
+    """The first experience entry carries company, title, dates and bullets."""
+    from scripts.linkedin_utils import parse_html
+    first_role = parse_html(FIXTURE)["experience"][0]
+    assert first_role["company"] == "Acme Corp"
+    assert first_role["title"] == "Staff Engineer"
+    assert "Jan 2022" in first_role["date_range"]
+    bullets = first_role["bullets"]
+    assert len(bullets) >= 2
+    assert any("latency" in bullet for bullet in bullets)
+
+
+def test_parse_html_education():
+    """The single education entry exposes its school and degree."""
+    from scripts.linkedin_utils import parse_html
+    schools = parse_html(FIXTURE)["education"]
+    assert len(schools) == 1
+    entry = schools[0]
+    assert entry["school"] == "State University"
+    assert "Computer Science" in entry["degree"]
+
+
+def test_parse_html_skills():
+    """Skill names from the fixture's skills section are extracted."""
+    from scripts.linkedin_utils import parse_html
+    skills = parse_html(FIXTURE)["skills"]
+    assert "Python" in skills
+    assert "Kubernetes" in skills
+
+
+def test_parse_html_achievements():
+    """Certification names are reported under ``achievements``."""
+    from scripts.linkedin_utils import parse_html
+    certs = parse_html(FIXTURE)["achievements"]
+    assert any("AWS" in cert for cert in certs)
+
+
+def test_parse_html_missing_section_returns_empty():
+    """A profile with no skills section returns empty skills list, not an error."""
+    from scripts.linkedin_utils import parse_html
+    # Renaming the section attribute makes every skills selector miss.
+    without_skills = FIXTURE.replace('data-section="skills"', 'data-section="hidden"')
+    profile = parse_html(without_skills)
+    assert profile["skills"] == []
+
+
+def test_parse_html_returns_all_keys():
+    """Every documented top-level key is present in the result dict."""
+    from scripts.linkedin_utils import parse_html
+    expected_keys = (
+        "name", "email", "phone", "linkedin", "career_summary",
+        "experience", "education", "skills", "achievements",
+    )
+    profile = parse_html(FIXTURE)
+    for key in expected_keys:
+        assert key in profile, f"Missing key: {key}"