# scripts/linkedin_utils.py
"""
LinkedIn profile HTML parser.

Extracts structured profile data from a raw LinkedIn public profile page.
No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.

Selectors target the 2024-2025 LinkedIn public profile DOM.
When LinkedIn changes their markup, update the selector lists here only.
Each section uses ordered fallbacks — first matching selector wins.
"""
from __future__ import annotations

import logging
import re

from bs4 import BeautifulSoup, FeatureNotFound, Tag

_log = logging.getLogger(__name__)


# ── Selector fallback lists ────────────────────────────────────────────────────

_NAME_SELECTORS = [
    "h1.top-card-layout__title",
    "h1[class*='title']",
    ".pv-top-card--list h1",
    "h1",
]

_SUMMARY_SELECTORS = [
    "section[data-section='about'] .show-more-less-text__text--less",
    "section[data-section='about'] p",
    "#about ~ * p.show-more-less-text__text--less",
    ".pv-about-section p",
]

_EXPERIENCE_ITEM_SELECTORS = [
    "section[data-section='experience'] li.experience-item",
    "section[data-section='experience'] li",
    "#experience-section li",
    "#experience ~ * li",
]

_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"]
_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"]
_EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"]

_EDUCATION_ITEM_SELECTORS = [
    "section[data-section='education'] li.education__list-item",
    "section[data-section='education'] li",
    "#education ~ * li",
]

_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"]
_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
_EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"]

_SKILLS_SELECTORS = [
    "section[data-section='skills'] span.mr1",
    "section[data-section='skills'] li span[class*='bold']",
    "section[data-section='skills'] li span",
    "#skills ~ * li span",
]

_CERT_ITEM_SELECTORS = [
    "section[data-section='certifications'] li",
    "#certifications ~ * li",
    "#licenses_and_certifications ~ * li",
]
_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"]

# Bullet separators inside a description: "•" / "·", a spaced en dash, or newlines.
_BULLET_SPLIT_RE = re.compile(r"[•·]\s*|(?<=\s)–\s+|\n+")


# ── Helpers ───────────────────────────────────────────────────────────────────

def _first_element(node: Tag, selectors, require_text: bool = False):
    """Return the first element matched by *selectors*, tried in order.

    With ``require_text=True`` an element whose stripped text is empty is
    skipped and the next selector is tried. Returns ``None`` when nothing
    matches. A selector that fails to evaluate is logged at debug level and
    skipped, so one bad selector never aborts the fallback chain.
    """
    for sel in selectors:
        try:
            el = node.select_one(sel)
        except Exception as exc:  # deliberate best-effort: try the next fallback
            _log.debug("selector %r failed: %s", sel, exc)
            continue
        if el is None:
            continue
        if require_text and not el.get_text(strip=True):
            continue
        return el
    return None


def _select_first(soup: Tag, selectors) -> str:
    """Return the stripped text of the first selector match that has text, else ``""``."""
    el = _first_element(soup, selectors, require_text=True)
    return el.get_text(strip=True) if el is not None else ""


def _select_all(soup: Tag, selectors) -> list:
    """Return every element matched by the first selector that matches anything.

    Returns an empty list when no selector matches. Failing selectors are
    logged at debug level and skipped.
    """
    for sel in selectors:
        try:
            els = soup.select(sel)
        except Exception as exc:  # deliberate best-effort: try the next fallback
            _log.debug("selector %r failed: %s", sel, exc)
            continue
        if els:
            return els
    return []


def _split_bullets(text: str | None) -> list[str]:
    """Split a description into bullet strings.

    Splits on "•"/"·", on an en dash surrounded by whitespace, and on
    newlines. Fragments of three characters or fewer (after stripping) are
    dropped. ``None`` or empty input yields an empty list.
    """
    if not text:
        return []
    stripped = (part.strip() for part in _BULLET_SPLIT_RE.split(text))
    return [part for part in stripped if len(part) > 3]


def _date_range_text(item: Tag, selectors=_EXP_DATE_SELECTORS) -> str:
    """Return the date range text for a list item, or ``""`` if none is found.

    For the first selector that yields usable content: if the matched element
    contains ``<time>`` children, their non-empty texts are joined with " – ";
    otherwise the element's own stripped text is used. A matched element with
    no usable content falls through to the next selector.
    """
    for sel in selectors:
        try:
            el = item.select_one(sel)
        except Exception as exc:  # deliberate best-effort: try the next fallback
            _log.debug("selector %r failed: %s", sel, exc)
            continue
        if el is None:
            continue
        time_texts = (t.get_text(strip=True) for t in el.find_all("time"))
        times = [t for t in time_texts if t]  # ignore empty <time> tags
        if times:
            return " – ".join(times)
        text = el.get_text(strip=True)
        if text:
            return text
    return ""


def _make_soup(raw_html) -> BeautifulSoup:
    """Build a soup from *raw_html*, preferring lxml.

    Falls back to the standard-library ``html.parser`` when lxml is not
    installed. ``None`` is treated as an empty document.
    """
    markup = "" if raw_html is None else raw_html
    try:
        return BeautifulSoup(markup, "lxml")
    except FeatureNotFound:
        _log.warning("lxml parser unavailable; falling back to html.parser")
        return BeautifulSoup(markup, "html.parser")


def _parse_experience(soup: Tag) -> list[dict]:
    """Collect experience entries; items with neither title nor company are skipped."""
    entries = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        if not (title or company):
            continue
        desc_el = _first_element(item, _EXP_DESC_SELECTORS)
        # A " " separator keeps words from adjacent child nodes apart.
        description = desc_el.get_text(" ", strip=True) if desc_el is not None else ""
        entries.append({
            "company": company,
            "title": title,
            "date_range": _date_range_text(item),
            "bullets": _split_bullets(description),
        })
    return entries


def _parse_education(soup: Tag) -> list[dict]:
    """Collect education entries; items with neither school nor degree are skipped."""
    entries = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        if not (school or degree):
            continue
        entries.append({
            "school": school,
            "degree": degree,
            "field": "",
            # Same date formatting as experience entries.
            "dates": _date_range_text(item, _EDU_DATES_SELECTORS),
        })
    return entries


def _parse_skills(soup: Tag) -> list[str]:
    """Collect non-empty skill labels, de-duplicated in first-seen order."""
    texts = (el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS))
    return list(dict.fromkeys(text for text in texts if text))


def _parse_achievements(soup: Tag) -> list[str]:
    """Collect non-empty certification names."""
    labels = (
        _select_first(item, _CERT_NAME_SELECTORS)
        for item in _select_all(soup, _CERT_ITEM_SELECTORS)
    )
    return [label for label in labels if label]


# ── Public API ────────────────────────────────────────────────────────────────

def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Returns a dict with keys: name, email, phone, linkedin, career_summary,
    experience[], education[], skills[], achievements[]

    ``email``, ``phone`` and ``linkedin`` are always returned as empty strings
    by this function. Each experience entry has ``company``, ``title``,
    ``date_range`` and ``bullets``; each education entry has ``school``,
    ``degree``, ``field`` (always empty here) and ``dates``.

    Never raises — returns empty values for sections that cannot be parsed.
    If lxml is not installed the standard-library HTML parser is used instead,
    and ``None`` input is treated as an empty document.
    """
    soup = _make_soup(raw_html)

    return {
        "name": _select_first(soup, _NAME_SELECTORS),
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": _select_first(soup, _SUMMARY_SELECTORS),
        "experience": _parse_experience(soup),
        "education": _parse_education(soup),
        "skills": _parse_skills(soup),
        "achievements": _parse_achievements(soup),
    }


# ── End of scripts/linkedin_utils.py ──────────────────────────────────────────
# The text that follows belongs to the next file in this diff:
# tests/fixtures/linkedin_profile.html (new file, index 0000000..916aa0f, @@ -0,0 +1,110 @@)
+ Experienced engineer with 10 years in embedded systems and DevOps. + Passionate about open-source and accessibility tooling. +
++ Led migration of monolith to microservices. • + Reduced p99 latency by 40%. • + Mentored three junior engineers. +
++ Designed CI/CD pipeline. • Maintained Kubernetes clusters. +
+