feat(linkedin): add HTML parser utils with fixture tests
This commit is contained in:
parent
d703bebb5e
commit
a43e29e50d
3 changed files with 377 additions and 0 deletions
194
scripts/linkedin_utils.py
Normal file
194
scripts/linkedin_utils.py
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
# scripts/linkedin_utils.py
|
||||
"""
|
||||
LinkedIn profile HTML parser.
|
||||
|
||||
Extracts structured profile data from a raw LinkedIn public profile page.
|
||||
No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.
|
||||
|
||||
Selectors target the 2024-2025 LinkedIn public profile DOM.
|
||||
When LinkedIn changes their markup, update the selector lists here only.
|
||||
Each section uses ordered fallbacks — first matching selector wins.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# ── Selector fallback lists ────────────────────────────────────────────────────
|
||||
|
||||
# Every list below is an ordered fallback chain: the helpers walk it front to
# back and stop at the first selector that produces a usable match.

# Profile name heading.
_NAME_SELECTORS = [
    "h1.top-card-layout__title",
    "h1[class*='title']",
    ".pv-top-card--list h1",
    "h1",
]

# "About" / summary paragraph.
_SUMMARY_SELECTORS = [
    "section[data-section='about'] .show-more-less-text__text--less",
    "section[data-section='about'] p",
    "#about ~ * p.show-more-less-text__text--less",
    ".pv-about-section p",
]

# One element per experience entry.
_EXPERIENCE_ITEM_SELECTORS = [
    "section[data-section='experience'] li.experience-item",
    "section[data-section='experience'] li",
    "#experience-section li",
    "#experience ~ * li",
]

# Fields looked up inside a single experience entry.
_EXP_TITLE_SELECTORS = [
    "span.experience-item__title",
    "span[class*='title']",
    "h3",
]
_EXP_COMPANY_SELECTORS = [
    "span.experience-item__subtitle",
    "span[class*='subtitle']",
    "p[class*='company']",
]
_EXP_DATE_SELECTORS = [
    "span.date-range",
    "[class*='date-range']",
    "span[class*='duration']",
]
_EXP_DESC_SELECTORS = [
    ".show-more-less-text__text--less",
    "p[class*='description']",
    "p",
]

# One element per education entry.
_EDUCATION_ITEM_SELECTORS = [
    "section[data-section='education'] li.education__list-item",
    "section[data-section='education'] li",
    "#education ~ * li",
]

# Fields looked up inside a single education entry.
_EDU_SCHOOL_SELECTORS = [
    "h3.education__school-name",
    "h3[class*='school']",
    "h3",
]
_EDU_DEGREE_SELECTORS = [
    "span.education__item--degree-name",
    "span[class*='degree']",
    "p[class*='degree']",
]
_EDU_DATES_SELECTORS = [
    "span.education__item--duration",
    "span[class*='duration']",
    "time",
]

# Skill name elements.
_SKILLS_SELECTORS = [
    "section[data-section='skills'] span.mr1",
    "section[data-section='skills'] li span[class*='bold']",
    "section[data-section='skills'] li span",
    "#skills ~ * li span",
]

# One element per certification entry, and the name inside each entry.
_CERT_ITEM_SELECTORS = [
    "section[data-section='certifications'] li",
    "#certifications ~ * li",
    "#licenses_and_certifications ~ * li",
]
_CERT_NAME_SELECTORS = [
    "h3.certifications__name",
    "h3[class*='name']",
    "h3",
    "span[class*='title']",
]
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _select_first(soup, selectors):
    """Return the stripped text of the first selector with a non-empty match.

    Selectors are tried in the order given. A selector that matches nothing,
    matches an element whose stripped text is empty, or raises while being
    evaluated is skipped. Returns "" when no selector yields text.
    """
    for selector in selectors:
        try:
            match = soup.select_one(selector)
            text = match.get_text(strip=True) if match else ""
        except Exception:
            # Best-effort: a failing selector must not abort the whole parse.
            continue
        if text:
            return text
    return ""
|
||||
|
||||
|
||||
def _select_all(soup, selectors):
    """Return every element matched by the first selector that matches anything.

    Selectors are tried in the order given; one that raises is skipped.
    Returns an empty list when none of them match.
    """
    for selector in selectors:
        try:
            found = soup.select(selector)
        except Exception:
            # Best-effort: fall through to the next selector in the chain.
            continue
        if found:
            return found
    return []
|
||||
|
||||
|
||||
# Bullet boundaries: a "•" or "·" glyph (plus trailing whitespace), an en dash
# that has whitespace on both sides, or one or more line breaks.
_BULLET_SPLIT_RE = re.compile(r"[•·]\s*|(?<=\s)–\s+|\n+")


def _split_bullets(text):
    """Split a free-text description into individual bullet strings.

    Args:
        text: Description text; ``None`` or "" is treated as "no bullets".

    Returns:
        The stripped fragments in their original order. Fragments of three
        characters or fewer are dropped.
    """
    if not text:
        return []
    fragments = (part.strip() for part in _BULLET_SPLIT_RE.split(text))
    # len(...) > 3 already excludes the empty string, so no separate check.
    return [fragment for fragment in fragments if len(fragment) > 3]
|
||||
|
||||
|
||||
def _date_range_text(item):
    """Return the date range text for one experience entry.

    Walks ``_EXP_DATE_SELECTORS`` in order. For the first matching element
    that contains ``<time>`` children, their stripped texts are joined with
    " – " and returned. Otherwise the element's own stripped text is returned
    if it is non-empty. Selectors that raise are skipped. Returns "" when
    nothing usable is found.
    """
    for selector in _EXP_DATE_SELECTORS:
        try:
            node = item.select_one(selector)
            if not node:
                continue
            stamps = [stamp.get_text(strip=True) for stamp in node.find_all("time")]
            if stamps:
                return " – ".join(stamps)
            fallback = node.get_text(strip=True)
        except Exception:
            # Best-effort: try the next selector instead of failing the parse.
            continue
        if fallback:
            return fallback
    return ""
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _make_soup(raw_html):
    """Build the parse tree, preferring lxml but falling back to html.parser."""
    from bs4 import FeatureNotFound  # local import keeps this block self-contained

    markup = raw_html or ""  # None / empty input parses as an empty document
    try:
        return BeautifulSoup(markup, "lxml")
    except FeatureNotFound:
        # lxml is not installed; the stdlib-backed parser is always available.
        return BeautifulSoup(markup, "html.parser")


def _description_bullets(item):
    """Return the bullets from the first non-empty description element of an entry."""
    for selector in _EXP_DESC_SELECTORS:
        try:
            node = item.select_one(selector)
            # A " " separator keeps words from adjacent child nodes apart.
            text = node.get_text(" ", strip=True) if node else ""
        except Exception:
            continue
        if text:
            return _split_bullets(text)
    return []


def _parse_experience(soup):
    """Return one dict per experience entry that has a title or a company."""
    entries = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        if not (title or company):
            continue
        entries.append({
            "company": company,
            "title": title,
            "date_range": _date_range_text(item),
            "bullets": _description_bullets(item),
        })
    return entries


def _parse_education(soup):
    """Return one dict per education entry that has a school or a degree."""
    entries = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        if not (school or degree):
            continue
        entries.append({
            "school": school,
            "degree": degree,
            "field": "",
            "dates": _select_first(item, _EDU_DATES_SELECTORS),
        })
    return entries


def _parse_skills(soup):
    """Return the non-empty skill names, de-duplicated in first-seen order."""
    names = (el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS))
    return list(dict.fromkeys(name for name in names if name))


def _parse_achievements(soup):
    """Return the non-empty certification names."""
    labels = (
        _select_first(item, _CERT_NAME_SELECTORS)
        for item in _select_all(soup, _CERT_ITEM_SELECTORS)
    )
    return [label for label in labels if label]


def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Args:
        raw_html: Page markup. ``None`` or "" is parsed as an empty document.

    Returns:
        A dict with keys: name, email, phone, linkedin, career_summary,
        experience[], education[], skills[], achievements[].
        ``email``, ``phone`` and ``linkedin`` are always "" here, as is the
        ``field`` of each education entry; this parser does not extract them.

    Sections that cannot be parsed come back as empty values rather than
    raising. If the lxml parser is unavailable, html.parser is used instead.
    """
    soup = _make_soup(raw_html)
    return {
        "name": _select_first(soup, _NAME_SELECTORS),
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": _select_first(soup, _SUMMARY_SELECTORS),
        "experience": _parse_experience(soup),
        "education": _parse_education(soup),
        "skills": _parse_skills(soup),
        "achievements": _parse_achievements(soup),
    }
|
||||
110
tests/fixtures/linkedin_profile.html
vendored
Normal file
110
tests/fixtures/linkedin_profile.html
vendored
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
<!-- tests/fixtures/linkedin_profile.html -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Alan Weinstock | LinkedIn</title></head>
|
||||
<body>
|
||||
<!-- Name and headline -->
|
||||
<div class="top-card-layout__entity-info">
|
||||
<h1 class="top-card-layout__title">Alan Weinstock</h1>
|
||||
<h2 class="top-card-layout__headline">Staff Engineer · Open to Work</h2>
|
||||
</div>
|
||||
|
||||
<!-- About / Summary -->
|
||||
<section data-section="about">
|
||||
<div class="core-section-container__content">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Experienced engineer with 10 years in embedded systems and DevOps.
|
||||
Passionate about open-source and accessibility tooling.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Experience -->
|
||||
<section data-section="experience">
|
||||
<ul>
|
||||
<li class="experience-item">
|
||||
<div class="experience-item__info">
|
||||
<span class="experience-item__title">Staff Engineer</span>
|
||||
<span class="experience-item__subtitle">Acme Corp</span>
|
||||
<span class="experience-item__duration">
|
||||
<span class="date-range">
|
||||
<time>Jan 2022</time>
|
||||
<time>Present</time>
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class="experience-item__description">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Led migration of monolith to microservices. •
|
||||
Reduced p99 latency by 40%. •
|
||||
Mentored three junior engineers.
|
||||
</p>
|
||||
</div>
|
||||
</li>
|
||||
<li class="experience-item">
|
||||
<div class="experience-item__info">
|
||||
<span class="experience-item__title">Senior Engineer</span>
|
||||
<span class="experience-item__subtitle">Beta Industries</span>
|
||||
<span class="experience-item__duration">
|
||||
<span class="date-range">
|
||||
<time>Mar 2019</time>
|
||||
<time>Dec 2021</time>
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class="experience-item__description">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Designed CI/CD pipeline. • Maintained Kubernetes clusters.
|
||||
</p>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Education -->
|
||||
<section data-section="education">
|
||||
<ul>
|
||||
<li class="education__list-item">
|
||||
<div class="education__item--degree-info">
|
||||
<h3 class="education__school-name">State University</h3>
|
||||
<span class="education__item--degree-name">B.S. Computer Science</span>
|
||||
<span class="education__item--duration">2010 – 2014</span>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Skills -->
|
||||
<section data-section="skills">
|
||||
<ul>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">Python</span>
|
||||
</div>
|
||||
</li>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">Kubernetes</span>
|
||||
</div>
|
||||
</li>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">PostgreSQL</span>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Certifications -->
|
||||
<section data-section="certifications">
|
||||
<ul>
|
||||
<li class="certifications__list-item">
|
||||
<h3 class="certifications__name">AWS Solutions Architect – Associate</h3>
|
||||
</li>
|
||||
<li class="certifications__list-item">
|
||||
<h3 class="certifications__name">CKA: Certified Kubernetes Administrator</h3>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
||||
73
tests/test_linkedin_utils.py
Normal file
73
tests/test_linkedin_utils.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
# tests/test_linkedin_utils.py
|
||||
import sys
from pathlib import Path

# Put the directory above tests/ on sys.path so `scripts.linkedin_utils` imports.
sys.path.insert(0, str(Path(__file__).parent.parent))

# The fixture contains non-ASCII characters (•, ·, –). Decode it explicitly as
# UTF-8 so the tests do not depend on the platform's default locale encoding.
FIXTURE = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_parse_html_name():
    """The fixture's top-card heading is returned as the profile name."""
    from scripts.linkedin_utils import parse_html

    profile = parse_html(FIXTURE)
    assert profile["name"] == "Alan Weinstock"
|
||||
|
||||
|
||||
def test_parse_html_summary():
    """The about-section paragraph ends up in ``career_summary``."""
    from scripts.linkedin_utils import parse_html

    summary = parse_html(FIXTURE)["career_summary"]
    assert "embedded systems" in summary
|
||||
|
||||
|
||||
def test_parse_html_experience_count():
    """Both experience entries in the fixture are extracted."""
    from scripts.linkedin_utils import parse_html

    entries = parse_html(FIXTURE)["experience"]
    assert len(entries) == 2
|
||||
|
||||
|
||||
def test_parse_html_experience_fields():
    """The first experience entry carries company, title, dates and bullets."""
    from scripts.linkedin_utils import parse_html

    entry = parse_html(FIXTURE)["experience"][0]
    assert entry["company"] == "Acme Corp"
    assert entry["title"] == "Staff Engineer"
    assert "Jan 2022" in entry["date_range"]

    bullets = entry["bullets"]
    assert len(bullets) >= 2
    assert any("latency" in bullet for bullet in bullets)
|
||||
|
||||
|
||||
def test_parse_html_education():
    """The single education entry exposes its school and degree."""
    from scripts.linkedin_utils import parse_html

    entries = parse_html(FIXTURE)["education"]
    assert len(entries) == 1

    entry = entries[0]
    assert entry["school"] == "State University"
    assert "Computer Science" in entry["degree"]
|
||||
|
||||
|
||||
def test_parse_html_skills():
    """Skill names from the fixture appear in the ``skills`` list."""
    from scripts.linkedin_utils import parse_html

    skills = parse_html(FIXTURE)["skills"]
    assert "Python" in skills
    assert "Kubernetes" in skills
|
||||
|
||||
|
||||
def test_parse_html_achievements():
    """Certification names are reported under ``achievements``."""
    from scripts.linkedin_utils import parse_html

    achievements = parse_html(FIXTURE)["achievements"]
    assert any("AWS" in label for label in achievements)
|
||||
|
||||
|
||||
def test_parse_html_missing_section_returns_empty():
    """With the skills section renamed away, ``skills`` is an empty list and nothing raises."""
    from scripts.linkedin_utils import parse_html

    # Relabel the section so no skills selector can find it.
    without_skills = FIXTURE.replace('data-section="skills"', 'data-section="hidden"')
    profile = parse_html(without_skills)
    assert profile["skills"] == []
|
||||
|
||||
|
||||
def test_parse_html_returns_all_keys():
    """Every documented top-level key is present in the result."""
    from scripts.linkedin_utils import parse_html

    expected_keys = (
        "name",
        "email",
        "phone",
        "linkedin",
        "career_summary",
        "experience",
        "education",
        "skills",
        "achievements",
    )
    profile = parse_html(FIXTURE)
    for key in expected_keys:
        assert key in profile, f"Missing key: {key}"
|
||||
Loading…
Reference in a new issue