feat(linkedin): add HTML parser utils with fixture tests

This commit is contained in:
pyr0ball 2026-03-13 01:01:05 -07:00
parent db26b9aaf9
commit 530f4346d1
3 changed files with 377 additions and 0 deletions

194
scripts/linkedin_utils.py Normal file
View file

@ -0,0 +1,194 @@
# scripts/linkedin_utils.py
"""
LinkedIn profile HTML parser.
Extracts structured profile data from a raw LinkedIn public profile page.
No Playwright dependency, so it is importable by both linkedin_scraper and linkedin_parser.
Selectors target the 2024-2025 LinkedIn public profile DOM.
When LinkedIn changes their markup, update the selector lists here only.
Each section uses ordered fallbacks; the first matching selector wins.
"""
from __future__ import annotations
import re
from bs4 import BeautifulSoup
# ── Selector fallback lists ────────────────────────────────────────────────────
# Every list below holds CSS selectors in priority order. The lookup helpers
# walk a list front to back and stop at the first selector that produces a
# usable match, so specific selectors come first and generic ones last.

# Profile name; parse_html passes this list to _select_first on the whole page.
_NAME_SELECTORS = [
"h1.top-card-layout__title",
"h1[class*='title']",
".pv-top-card--list h1",
"h1",
]
# "About" text; becomes the "career_summary" value returned by parse_html.
_SUMMARY_SELECTORS = [
"section[data-section='about'] .show-more-less-text__text--less",
"section[data-section='about'] p",
"#about ~ * p.show-more-less-text__text--less",
".pv-about-section p",
]
# One element per experience entry; parse_html iterates the matched elements.
_EXPERIENCE_ITEM_SELECTORS = [
"section[data-section='experience'] li.experience-item",
"section[data-section='experience'] li",
"#experience-section li",
"#experience ~ * li",
]
# Field selectors evaluated relative to a single experience item.
_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"]
_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"]
_EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"]
# One element per education entry.
_EDUCATION_ITEM_SELECTORS = [
"section[data-section='education'] li.education__list-item",
"section[data-section='education'] li",
"#education ~ * li",
]
# Field selectors evaluated relative to a single education item.
_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"]
_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
_EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"]
# Elements whose text is a skill name; parse_html de-duplicates the results.
_SKILLS_SELECTORS = [
"section[data-section='skills'] span.mr1",
"section[data-section='skills'] li span[class*='bold']",
"section[data-section='skills'] li span",
"#skills ~ * li span",
]
# One element per certification; their names fill the "achievements" list.
_CERT_ITEM_SELECTORS = [
"section[data-section='certifications'] li",
"#certifications ~ * li",
"#licenses_and_certifications ~ * li",
]
# Certification name, evaluated relative to a single certification item.
_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"]
# ── Helpers ───────────────────────────────────────────────────────────────────
def _select_first(soup, selectors):
for sel in selectors:
try:
el = soup.select_one(sel)
if el and el.get_text(strip=True):
return el.get_text(strip=True)
except Exception:
continue
return ""
def _select_all(soup, selectors):
for sel in selectors:
try:
els = soup.select(sel)
if els:
return els
except Exception:
continue
return []
def _split_bullets(text):
parts = re.split(r"[•·]\s*|(?<=\s)\s+|\n+", text)
return [p.strip() for p in parts if p.strip() and len(p.strip()) > 3]
def _date_range_text(item):
    """Return the date-range text of one experience item, or "" if none is found.

    Each selector in _EXP_DATE_SELECTORS is tried in order. When the matched
    element contains <time> children, their stripped texts are joined with a
    single space and returned. Otherwise the element's own stripped text is
    returned if non-empty. Selectors that match nothing, yield no text, or
    raise are skipped.
    """
    for css in _EXP_DATE_SELECTORS:
        try:
            node = item.select_one(css)
            if not node:
                continue
            stamps = [t.get_text(strip=True) for t in node.find_all("time")]
            if stamps:
                return " ".join(stamps)
            fallback = node.get_text(strip=True)
        except Exception:
            # Best-effort lookup: try the next fallback selector.
            continue
        if fallback:
            return fallback
    return ""
# ── Public API ────────────────────────────────────────────────────────────────
def _make_soup(raw_html):
    """Build the parse tree, preferring lxml and falling back to html.parser."""
    # Imported locally so this block does not rely on the module's import lines.
    from bs4 import FeatureNotFound

    markup = raw_html or ""  # tolerate None / empty input
    try:
        return BeautifulSoup(markup, "lxml")
    except FeatureNotFound:
        # Raised by BeautifulSoup when the requested parser is not installed;
        # the standard-library parser keeps parse_html usable without lxml.
        return BeautifulSoup(markup, "html.parser")


def _first_element(item, selectors):
    """Return the first element matched by any selector (in order), else None."""
    for css in selectors:
        try:
            node = item.select_one(css)
        except Exception:
            continue
        if node:
            return node
    return None


def _parse_experience(soup):
    """Collect experience entries; items with neither title nor company are dropped."""
    entries = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        if not (title or company):
            continue
        desc_el = _first_element(item, _EXP_DESC_SELECTORS)
        # A space separator keeps text from adjacent child nodes from fusing.
        bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else []
        entries.append({
            "company": company,
            "title": title,
            "date_range": _date_range_text(item),
            "bullets": bullets,
        })
    return entries


def _parse_education(soup):
    """Collect education entries; items with neither school nor degree are dropped."""
    entries = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        if not (school or degree):
            continue
        entries.append({
            "school": school,
            "degree": degree,
            "field": "",  # not extracted; key kept so the record shape is stable
            # Shared helper: an empty match falls through to the next selector.
            "dates": _select_first(item, _EDU_DATES_SELECTORS),
        })
    return entries


def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Args:
        raw_html: Page markup. None or an empty string is treated as an
            empty document.

    Returns:
        A dict with keys: name, email, phone, linkedin, career_summary,
        experience[], education[], skills[], achievements[].
        "email", "phone" and "linkedin" are always empty strings here.
        Each experience entry has company, title, date_range and bullets.
        Each education entry has school, degree, field (always "") and dates.

    Best-effort: selectors that fail are skipped and sections that cannot be
    parsed yield empty values. If the lxml parser is not installed, the
    standard-library "html.parser" is used instead.
    """
    soup = _make_soup(raw_html)

    skills = [
        text
        for text in (el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS))
        if text
    ]

    achievements = []
    for item in _select_all(soup, _CERT_ITEM_SELECTORS):
        label = _select_first(item, _CERT_NAME_SELECTORS)
        if label:
            achievements.append(label)

    return {
        "name": _select_first(soup, _NAME_SELECTORS),
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": _select_first(soup, _SUMMARY_SELECTORS),
        "experience": _parse_experience(soup),
        "education": _parse_education(soup),
        # dict.fromkeys removes duplicates while preserving first-seen order.
        "skills": list(dict.fromkeys(skills)),
        "achievements": achievements,
    }

110
tests/fixtures/linkedin_profile.html vendored Normal file
View file

@ -0,0 +1,110 @@
<!-- tests/fixtures/linkedin_profile.html -->
<!DOCTYPE html>
<html>
<head><title>Alan Weinstock | LinkedIn</title></head>
<body>
<!-- Name and headline -->
<div class="top-card-layout__entity-info">
<h1 class="top-card-layout__title">Alan Weinstock</h1>
<h2 class="top-card-layout__headline">Staff Engineer · Open to Work</h2>
</div>
<!-- About / Summary -->
<section data-section="about">
<div class="core-section-container__content">
<p class="show-more-less-text__text--less">
Experienced engineer with 10 years in embedded systems and DevOps.
Passionate about open-source and accessibility tooling.
</p>
</div>
</section>
<!-- Experience -->
<section data-section="experience">
<ul>
<li class="experience-item">
<div class="experience-item__info">
<span class="experience-item__title">Staff Engineer</span>
<span class="experience-item__subtitle">Acme Corp</span>
<span class="experience-item__duration">
<span class="date-range">
<time>Jan 2022</time>
<time>Present</time>
</span>
</span>
</div>
<div class="experience-item__description">
<p class="show-more-less-text__text--less">
Led migration of monolith to microservices. &bull;
Reduced p99 latency by 40%. &bull;
Mentored three junior engineers.
</p>
</div>
</li>
<li class="experience-item">
<div class="experience-item__info">
<span class="experience-item__title">Senior Engineer</span>
<span class="experience-item__subtitle">Beta Industries</span>
<span class="experience-item__duration">
<span class="date-range">
<time>Mar 2019</time>
<time>Dec 2021</time>
</span>
</span>
</div>
<div class="experience-item__description">
<p class="show-more-less-text__text--less">
Designed CI/CD pipeline. &bull; Maintained Kubernetes clusters.
</p>
</div>
</li>
</ul>
</section>
<!-- Education -->
<section data-section="education">
<ul>
<li class="education__list-item">
<div class="education__item--degree-info">
<h3 class="education__school-name">State University</h3>
<span class="education__item--degree-name">B.S. Computer Science</span>
<span class="education__item--duration">2010 2014</span>
</div>
</li>
</ul>
</section>
<!-- Skills -->
<section data-section="skills">
<ul>
<li class="skills-section__list-item">
<div class="skills-section__skill">
<span class="mr1 t-bold">Python</span>
</div>
</li>
<li class="skills-section__list-item">
<div class="skills-section__skill">
<span class="mr1 t-bold">Kubernetes</span>
</div>
</li>
<li class="skills-section__list-item">
<div class="skills-section__skill">
<span class="mr1 t-bold">PostgreSQL</span>
</div>
</li>
</ul>
</section>
<!-- Certifications -->
<section data-section="certifications">
<ul>
<li class="certifications__list-item">
<h3 class="certifications__name">AWS Solutions Architect Associate</h3>
</li>
<li class="certifications__list-item">
<h3 class="certifications__name">CKA: Certified Kubernetes Administrator</h3>
</li>
</ul>
</section>
</body>
</html>

View file

@ -0,0 +1,73 @@
# tests/test_linkedin_utils.py
import sys
from pathlib import Path
# Put the parent of the tests directory on sys.path so that
# `scripts.linkedin_utils` can be imported from inside the tests.
sys.path.insert(0, str(Path(__file__).parent.parent))
# Read the fixture explicitly as UTF-8: it contains non-ASCII characters, and
# the platform default encoding is not UTF-8 everywhere.
FIXTURE = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text(encoding="utf-8")
def test_parse_html_name():
    """parse_html returns the fixture's profile name."""
    from scripts.linkedin_utils import parse_html

    assert parse_html(FIXTURE)["name"] == "Alan Weinstock"
def test_parse_html_summary():
    """The career summary contains text from the fixture's about section."""
    from scripts.linkedin_utils import parse_html

    summary = parse_html(FIXTURE)["career_summary"]
    assert "embedded systems" in summary
def test_parse_html_experience_count():
    """Both experience items in the fixture are extracted."""
    from scripts.linkedin_utils import parse_html

    jobs = parse_html(FIXTURE)["experience"]
    assert len(jobs) == 2
def test_parse_html_experience_fields():
    """The first experience entry carries company, title, dates and bullets."""
    from scripts.linkedin_utils import parse_html

    first_job = parse_html(FIXTURE)["experience"][0]
    assert first_job["company"] == "Acme Corp"
    assert first_job["title"] == "Staff Engineer"
    assert "Jan 2022" in first_job["date_range"]

    bullets = first_job["bullets"]
    assert len(bullets) >= 2
    assert any("latency" in line for line in bullets)
def test_parse_html_education():
    """The single education item yields its school and degree."""
    from scripts.linkedin_utils import parse_html

    schools = parse_html(FIXTURE)["education"]
    assert len(schools) == 1

    entry = schools[0]
    assert entry["school"] == "State University"
    assert "Computer Science" in entry["degree"]
def test_parse_html_skills():
    """Skill names from the fixture appear in the skills list."""
    from scripts.linkedin_utils import parse_html

    skills = parse_html(FIXTURE)["skills"]
    for expected in ("Python", "Kubernetes"):
        assert expected in skills
def test_parse_html_achievements():
    """At least one certification mentioning AWS lands in achievements."""
    from scripts.linkedin_utils import parse_html

    certs = parse_html(FIXTURE)["achievements"]
    assert any("AWS" in cert for cert in certs)
def test_parse_html_missing_section_returns_empty():
    """With the skills section renamed away, skills is an empty list, not an error."""
    from scripts.linkedin_utils import parse_html

    # Relabel the section so none of the skills selectors can match it.
    without_skills = FIXTURE.replace('data-section="skills"', 'data-section="hidden"')
    assert parse_html(without_skills)["skills"] == []
def test_parse_html_returns_all_keys():
    """Every documented top-level key is present in the result."""
    from scripts.linkedin_utils import parse_html

    expected_keys = (
        "name", "email", "phone", "linkedin", "career_summary",
        "experience", "education", "skills", "achievements",
    )
    result = parse_html(FIXTURE)
    for key in expected_keys:
        assert key in result, f"Missing key: {key}"