# scripts/linkedin_utils.py """ LinkedIn profile HTML parser. Extracts structured profile data from a raw LinkedIn public profile page. No Playwright dependency — importable by both linkedin_scraper and linkedin_parser. Selectors target the 2024-2025 LinkedIn public profile DOM. When LinkedIn changes their markup, update the selector lists here only. Each section uses ordered fallbacks — first matching selector wins. """ from __future__ import annotations import re from bs4 import BeautifulSoup # ── Selector fallback lists ──────────────────────────────────────────────────── _NAME_SELECTORS = [ "h1.top-card-layout__title", "h1[class*='title']", ".pv-top-card--list h1", "h1", ] _SUMMARY_SELECTORS = [ "section[data-section='about'] .show-more-less-text__text--less", "section[data-section='about'] p", "#about ~ * p.show-more-less-text__text--less", ".pv-about-section p", ] _EXPERIENCE_ITEM_SELECTORS = [ "section[data-section='experience'] li.experience-item", "section[data-section='experience'] li", "#experience-section li", "#experience ~ * li", ] _EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"] _EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"] _EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"] _EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"] _EDUCATION_ITEM_SELECTORS = [ "section[data-section='education'] li.education__list-item", "section[data-section='education'] li", "#education ~ * li", ] _EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"] _EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"] _EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"] _SKILLS_SELECTORS = [ "section[data-section='skills'] span.mr1", "section[data-section='skills'] li span[class*='bold']", "section[data-section='skills'] li span", "#skills ~ * li span", ] _CERT_ITEM_SELECTORS = [ "section[data-section='certifications'] li", "#certifications ~ * li", "#licenses_and_certifications ~ * li", ] _CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"] # ── Helpers ─────────────────────────────────────────────────────────────────── def _select_first(soup, selectors): for sel in selectors: try: el = soup.select_one(sel) if el and el.get_text(strip=True): return el.get_text(strip=True) except Exception: continue return "" def _select_all(soup, selectors): for sel in selectors: try: els = soup.select(sel) if els: return els except Exception: continue return [] def _split_bullets(text): parts = re.split(r"[•·]\s*|(?<=\s)–\s+|\n+", text) return [p.strip() for p in parts if p.strip() and len(p.strip()) > 3] def _date_range_text(item): for sel in _EXP_DATE_SELECTORS: try: el = item.select_one(sel) if el: times = [t.get_text(strip=True) for t in el.find_all("time")] if times: return " – ".join(times) text = el.get_text(strip=True) if text: return text except Exception: continue return "" # ── Public API ──────────────────────────────────────────────────────────────── def parse_html(raw_html: str) -> dict: """ Extract structured profile data from a raw LinkedIn public profile HTML page. Returns a dict with keys: name, email, phone, linkedin, career_summary, experience[], education[], skills[], achievements[] Never raises — returns empty values for sections that cannot be parsed. """ soup = BeautifulSoup(raw_html, "lxml") name = _select_first(soup, _NAME_SELECTORS) career_summary = _select_first(soup, _SUMMARY_SELECTORS) experience = [] for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS): title = _select_first(item, _EXP_TITLE_SELECTORS) company = _select_first(item, _EXP_COMPANY_SELECTORS) dates = _date_range_text(item) desc_el = None for sel in _EXP_DESC_SELECTORS: try: desc_el = item.select_one(sel) if desc_el: break except Exception: continue bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else [] if title or company: experience.append({ "company": company, "title": title, "date_range": dates, "bullets": bullets, }) education = [] for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS): school = _select_first(item, _EDU_SCHOOL_SELECTORS) degree = _select_first(item, _EDU_DEGREE_SELECTORS) dates = "" for sel in _EDU_DATES_SELECTORS: try: el = item.select_one(sel) if el: dates = el.get_text(strip=True) break except Exception: continue if school or degree: education.append({ "school": school, "degree": degree, "field": "", "dates": dates, }) skills = [el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS) if el.get_text(strip=True)] skills = list(dict.fromkeys(skills)) achievements = [] for item in _select_all(soup, _CERT_ITEM_SELECTORS): label = _select_first(item, _CERT_NAME_SELECTORS) if label: achievements.append(label) return { "name": name, "email": "", "phone": "", "linkedin": "", "career_summary": career_summary, "experience": experience, "education": education, "skills": skills, "achievements": achievements, }