From 6c612902186f92ec37289a7a2221935336ff97af Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Fri, 13 Mar 2026 01:01:05 -0700 Subject: [PATCH] feat(linkedin): add HTML parser utils with fixture tests --- scripts/linkedin_utils.py | 194 +++++++++++++++++++++++++++ tests/fixtures/linkedin_profile.html | 110 +++++++++++++++ tests/test_linkedin_utils.py | 73 ++++++++++ 3 files changed, 377 insertions(+) create mode 100644 scripts/linkedin_utils.py create mode 100644 tests/fixtures/linkedin_profile.html create mode 100644 tests/test_linkedin_utils.py diff --git a/scripts/linkedin_utils.py b/scripts/linkedin_utils.py new file mode 100644 index 0000000..5eb4f52 --- /dev/null +++ b/scripts/linkedin_utils.py @@ -0,0 +1,194 @@ +# scripts/linkedin_utils.py +""" +LinkedIn profile HTML parser. + +Extracts structured profile data from a raw LinkedIn public profile page. +No Playwright dependency — importable by both linkedin_scraper and linkedin_parser. + +Selectors target the 2024-2025 LinkedIn public profile DOM. +When LinkedIn changes their markup, update the selector lists here only. +Each section uses ordered fallbacks — first matching selector wins. 
+""" +from __future__ import annotations +import re +from bs4 import BeautifulSoup + + +# ── Selector fallback lists ──────────────────────────────────────────────────── + +_NAME_SELECTORS = [ + "h1.top-card-layout__title", + "h1[class*='title']", + ".pv-top-card--list h1", + "h1", +] + +_SUMMARY_SELECTORS = [ + "section[data-section='about'] .show-more-less-text__text--less", + "section[data-section='about'] p", + "#about ~ * p.show-more-less-text__text--less", + ".pv-about-section p", +] + +_EXPERIENCE_ITEM_SELECTORS = [ + "section[data-section='experience'] li.experience-item", + "section[data-section='experience'] li", + "#experience-section li", + "#experience ~ * li", +] + +_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"] +_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"] +_EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"] +_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"] + +_EDUCATION_ITEM_SELECTORS = [ + "section[data-section='education'] li.education__list-item", + "section[data-section='education'] li", + "#education ~ * li", +] + +_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"] +_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"] +_EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"] + +_SKILLS_SELECTORS = [ + "section[data-section='skills'] span.mr1", + "section[data-section='skills'] li span[class*='bold']", + "section[data-section='skills'] li span", + "#skills ~ * li span", +] + +_CERT_ITEM_SELECTORS = [ + "section[data-section='certifications'] li", + "#certifications ~ * li", + "#licenses_and_certifications ~ * li", +] +_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"] + + +# ── Helpers 
def _select_first(soup, selectors, separator=""):
    """Return the text of the first selector that yields non-empty text.

    Args:
        soup: A BeautifulSoup document or tag (anything with ``select_one``).
        selectors: CSS selectors, tried in order; the first one whose element
            has non-empty stripped text wins.
        separator: String inserted between text nodes when the element is
            flattened (forwarded to ``get_text``). Defaults to no separator.

    Returns:
        The stripped text, or ``""`` when no selector produces any text.
    """
    for sel in selectors:
        try:
            el = soup.select_one(sel)
            text = el.get_text(separator, strip=True) if el else ""
        except Exception:
            # Best-effort parsing: a selector that fails is skipped, not fatal.
            continue
        if text:
            return text
    return ""


def _select_all(soup, selectors):
    """Return the elements matched by the first selector that matches anything.

    Args:
        soup: A BeautifulSoup document or tag (anything with ``select``).
        selectors: CSS selectors, tried in order.

    Returns:
        The non-empty result of the first matching selector, or ``[]``.
    """
    for sel in selectors:
        try:
            els = soup.select(sel)
        except Exception:
            # Best-effort parsing: a selector that fails is skipped, not fatal.
            continue
        if els:
            return els
    return []


# Bullet separators: a bullet/middle-dot glyph, an en dash that follows
# whitespace, or one or more newlines. Compiled once at import time.
_BULLET_SPLIT_RE = re.compile(r"[•·]\s*|(?<=\s)–\s+|\n+")


def _split_bullets(text):
    """Split a description blob into individual bullet strings.

    Fragments of three characters or fewer are discarded.

    Args:
        text: Description text; ``None`` or ``""`` yields an empty list.

    Returns:
        A list of stripped fragments, in their original order.
    """
    if not text:
        return []
    fragments = (part.strip() for part in _BULLET_SPLIT_RE.split(text))
    return [frag for frag in fragments if len(frag) > 3]


def _date_range_text(item, selectors=None):
    """Return a date-range string for a profile list item.

    When the matched element contains ``<time>`` children, their non-empty
    texts are joined with " – "; otherwise the element's own stripped text is
    used. A selector whose element yields no text falls through to the next.

    Args:
        item: A BeautifulSoup tag for one experience/education entry.
        selectors: CSS selectors to try in order. Defaults to
            ``_EXP_DATE_SELECTORS`` when omitted.

    Returns:
        The date text, or ``""`` when nothing matches.
    """
    if selectors is None:
        selectors = _EXP_DATE_SELECTORS
    for sel in selectors:
        try:
            el = item.select_one(sel)
            if not el:
                continue
            stamps = (t.get_text(strip=True) for t in el.find_all("time"))
            times = [stamp for stamp in stamps if stamp]
            text = " – ".join(times) if times else el.get_text(strip=True)
        except Exception:
            # Best-effort parsing: a selector that fails is skipped, not fatal.
            continue
        if text:
            return text
    return ""


# ── Public API ────────────────────────────────────────────────────────────────

def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Returns a dict with keys: name, email, phone, linkedin, career_summary,
    experience[], education[], skills[], achievements[]

    ``email``, ``phone`` and ``linkedin`` are always returned as empty strings
    by this function. Sections that cannot be parsed yield empty values rather
    than exceptions, and ``None``/empty input produces an all-empty result.
    """
    from bs4 import FeatureNotFound

    markup = raw_html or ""
    try:
        soup = BeautifulSoup(markup, "lxml")
    except FeatureNotFound:
        # lxml is an optional dependency; fall back to the parser that ships
        # with the standard library instead of failing the whole parse.
        soup = BeautifulSoup(markup, "html.parser")

    name = _select_first(soup, _NAME_SELECTORS)
    career_summary = _select_first(soup, _SUMMARY_SELECTORS)

    experience = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        if not (title or company):
            continue
        # A space separator keeps adjacent text nodes from running together
        # before the description is split into bullets.
        description = _select_first(item, _EXP_DESC_SELECTORS, separator=" ")
        experience.append({
            "company": company,
            "title": title,
            "date_range": _date_range_text(item),
            "bullets": _split_bullets(description),
        })

    education = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        if not (school or degree):
            continue
        education.append({
            "school": school,
            "degree": degree,
            "field": "",
            "dates": _date_range_text(item, _EDU_DATES_SELECTORS),
        })

    skill_texts = (
        el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS)
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    skills = list(dict.fromkeys(text for text in skill_texts if text))

    cert_labels = (
        _select_first(item, _CERT_NAME_SELECTORS)
        for item in _select_all(soup, _CERT_ITEM_SELECTORS)
    )
    achievements = [label for label in cert_labels if label]

    return {
        "name": name,
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": career_summary,
        "experience": experience,
        "education": education,
        "skills": skills,
        "achievements": achievements,
    }


# ── Patch boundary (header of the next file in the original diff) ─────────────
# diff --git a/tests/fixtures/linkedin_profile.html b/tests/fixtures/linkedin_profile.html
# new file mode 100644
# index 0000000..916aa0f
# --- /dev/null
# +++ b/tests/fixtures/linkedin_profile.html
# @@ -0,0 +1,110 @@
# Alan Weinstock | LinkedIn
+ +
+

Alan Weinstock

+

Staff Engineer · Open to Work

+
+ + +
+
+

+ Experienced engineer with 10 years in embedded systems and DevOps. + Passionate about open-source and accessibility tooling. +

+
+
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
# ── Patch boundary (header of the next file in the original diff) ─────────────
# diff --git a/tests/test_linkedin_utils.py b/tests/test_linkedin_utils.py
# new file mode 100644
# index 0000000..ae29dae
# --- /dev/null
# +++ b/tests/test_linkedin_utils.py
# @@ -0,0 +1,73 @@
# tests/test_linkedin_utils.py
"""Fixture-driven tests for ``scripts.linkedin_utils.parse_html``."""
import sys
from pathlib import Path

# Make the repository root importable so ``scripts.linkedin_utils`` resolves.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Decode explicitly: the fixture contains non-ASCII characters (e.g. "·"), and
# the platform default encoding is not guaranteed to be UTF-8.
FIXTURE = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text(
    encoding="utf-8"
)


def test_parse_html_name():
    """The profile name is read from the top-card heading."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert result["name"] == "Alan Weinstock"


def test_parse_html_summary():
    """The about section is returned as the career summary."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert "embedded systems" in result["career_summary"]


def test_parse_html_experience_count():
    """Every experience entry in the fixture is extracted."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert len(result["experience"]) == 2


def test_parse_html_experience_fields():
    """The first experience entry carries company, title, dates and bullets."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    first = result["experience"][0]
    assert first["company"] == "Acme Corp"
    assert first["title"] == "Staff Engineer"
    assert "Jan 2022" in first["date_range"]
    assert len(first["bullets"]) >= 2
    assert any("latency" in b for b in first["bullets"])


def test_parse_html_education():
    """The single education entry exposes school and degree."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert len(result["education"]) == 1
    edu = result["education"][0]
    assert edu["school"] == "State University"
    assert "Computer Science" in edu["degree"]


def test_parse_html_skills():
    """Known skills from the fixture appear in the skills list."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert "Python" in result["skills"]
    assert "Kubernetes" in result["skills"]


def test_parse_html_achievements():
    """Certifications are surfaced as achievements."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert any("AWS" in a for a in result["achievements"])


def test_parse_html_missing_section_returns_empty():
    """A profile with no skills section returns empty skills list, not an error."""
    from scripts.linkedin_utils import parse_html
    html_no_skills = FIXTURE.replace('data-section="skills"', 'data-section="hidden"')
    result = parse_html(html_no_skills)
    assert result["skills"] == []


def test_parse_html_returns_all_keys():
    """The result always contains the full set of documented keys."""
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    for key in ("name", "email", "phone", "linkedin", "career_summary",
                "experience", "education", "skills", "achievements"):
        assert key in result, f"Missing key: {key}"