peregrine/scripts/linkedin_utils.py

194 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# scripts/linkedin_utils.py
"""
LinkedIn profile HTML parser.
Extracts structured profile data from a raw LinkedIn public profile page.
No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.
Selectors target the 2024-2025 LinkedIn public profile DOM.
When LinkedIn changes their markup, update the selector lists here only.
Each section uses ordered fallbacks — first matching selector wins.
"""
from __future__ import annotations
import re
from bs4 import BeautifulSoup
# ── Selector fallback lists ────────────────────────────────────────────────────
# Every list below is ordered most-specific first; the helpers try the
# entries in order and stop at the first selector that yields a result.

# Profile name heading.
_NAME_SELECTORS = [
    "h1.top-card-layout__title",
    "h1[class*='title']",
    ".pv-top-card--list h1",
    "h1",
]

# "About" / summary paragraph.
_SUMMARY_SELECTORS = [
    "section[data-section='about'] .show-more-less-text__text--less",
    "section[data-section='about'] p",
    "#about ~ * p.show-more-less-text__text--less",
    ".pv-about-section p",
]

# Experience: one list to find the entries, then per-entry field selectors
# that are evaluated relative to each entry element.
_EXPERIENCE_ITEM_SELECTORS = [
    "section[data-section='experience'] li.experience-item",
    "section[data-section='experience'] li",
    "#experience-section li",
    "#experience ~ * li",
]
_EXP_TITLE_SELECTORS = [
    "span.experience-item__title",
    "span[class*='title']",
    "h3",
]
_EXP_COMPANY_SELECTORS = [
    "span.experience-item__subtitle",
    "span[class*='subtitle']",
    "p[class*='company']",
]
_EXP_DATE_SELECTORS = [
    "span.date-range",
    "[class*='date-range']",
    "span[class*='duration']",
]
_EXP_DESC_SELECTORS = [
    ".show-more-less-text__text--less",
    "p[class*='description']",
    "p",
]

# Education: entry selectors followed by per-entry field selectors.
_EDUCATION_ITEM_SELECTORS = [
    "section[data-section='education'] li.education__list-item",
    "section[data-section='education'] li",
    "#education ~ * li",
]
_EDU_SCHOOL_SELECTORS = [
    "h3.education__school-name",
    "h3[class*='school']",
    "h3",
]
_EDU_DEGREE_SELECTORS = [
    "span.education__item--degree-name",
    "span[class*='degree']",
    "p[class*='degree']",
]
_EDU_DATES_SELECTORS = [
    "span.education__item--duration",
    "span[class*='duration']",
    "time",
]

# Skills: each matched element contributes one skill label.
_SKILLS_SELECTORS = [
    "section[data-section='skills'] span.mr1",
    "section[data-section='skills'] li span[class*='bold']",
    "section[data-section='skills'] li span",
    "#skills ~ * li span",
]

# Certifications: entry selectors, then the name inside each entry.
_CERT_ITEM_SELECTORS = [
    "section[data-section='certifications'] li",
    "#certifications ~ * li",
    "#licenses_and_certifications ~ * li",
]
_CERT_NAME_SELECTORS = [
    "h3.certifications__name",
    "h3[class*='name']",
    "h3",
    "span[class*='title']",
]
# ── Helpers ───────────────────────────────────────────────────────────────────
def _select_first(soup, selectors) -> str:
    """Return the stripped text of the first selector that yields non-empty text.

    Args:
        soup: Any object exposing ``select_one(css_selector)`` (the parsed
            document or a single element inside it).
        selectors: CSS selectors to try, in priority order.

    Returns:
        The text of the first matched element whose stripped text is
        non-empty, or ``""`` when no selector produces any text.

    A selector that raises (for example one the selector engine rejects) is
    skipped rather than propagated, so one bad entry cannot break parsing.
    """
    for selector in selectors:
        try:
            el = soup.select_one(selector)
            # Extract the text once; it is both the test and the result.
            text = el.get_text(strip=True) if el else ""
        except Exception:
            # Best-effort lookup: fall through to the next selector.
            continue
        if text:
            return text
    return ""
def _select_all(soup, selectors):
    """Return every element matched by the first selector that matches anything.

    Args:
        soup: Any object exposing ``select(css_selector)``.
        selectors: CSS selectors to try, in priority order.

    Returns:
        The non-empty result of the first successful ``select`` call, or an
        empty list when no selector matches. Results from different
        selectors are never merged.

    A selector that raises is skipped rather than propagated.
    """
    for selector in selectors:
        try:
            matches = soup.select(selector)
        except Exception:
            # Best-effort lookup: fall through to the next selector.
            continue
        if matches:
            return matches
    return []
# Split points: a bullet glyph (plus trailing whitespace), any whitespace that
# directly follows another whitespace character, or a run of newlines.
_BULLET_SPLIT_RE = re.compile(r"[•·]\s*|(?<=\s)\s+|\n+")


def _split_bullets(text) -> list[str]:
    """Split a description blob into individual bullet strings.

    Args:
        text: Free-form description text; ``None`` or ``""`` is accepted.

    Returns:
        The stripped fragments, in order, keeping only those longer than
        three characters (shorter fragments are treated as noise).
    """
    if not text:
        return []
    bullets = []
    for fragment in _BULLET_SPLIT_RE.split(text):
        fragment = fragment.strip()
        if len(fragment) > 3:
            bullets.append(fragment)
    return bullets
def _date_range_text(item, selectors=None) -> str:
    """Return the date-range text for one profile entry.

    Args:
        item: Element exposing ``select_one(css_selector)``.
        selectors: CSS selectors to try, in priority order. Defaults to
            ``_EXP_DATE_SELECTORS`` when omitted.

    Returns:
        For the first selector that yields usable text: the texts of the
        matched element's ``<time>`` descendants joined by single spaces
        when any are non-empty (text outside the ``<time>`` tags is then
        dropped), otherwise the element's own stripped text. ``""`` when
        nothing usable is found.

    A selector that raises is skipped rather than propagated.
    """
    if selectors is None:
        selectors = _EXP_DATE_SELECTORS
    for selector in selectors:
        try:
            el = item.select_one(selector)
            if not el:
                continue
            stamps = [t.get_text(strip=True) for t in el.find_all("time")]
            # Ignore empty <time> tags: otherwise they would short-circuit the
            # fallbacks below and yield "" or a whitespace-only string.
            stamps = [s for s in stamps if s]
            if stamps:
                return " ".join(stamps)
            text = el.get_text(strip=True)
            if text:
                return text
        except Exception:
            # Best-effort lookup: fall through to the next selector.
            continue
    return ""
# ── Public API ────────────────────────────────────────────────────────────────
def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Args:
        raw_html: Markup of the profile page. ``None`` or an empty string is
            treated as an empty document.

    Returns:
        A dict with keys: name, email, phone, linkedin, career_summary,
        experience[], education[], skills[], achievements[].
        ``email``, ``phone`` and ``linkedin`` are always empty strings here;
        this parser does not extract them.

    Intended never to raise: sections that cannot be parsed come back empty,
    and if the markup cannot be parsed at all the whole profile is empty.
    """
    profile = {
        "name": "",
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": "",
        "experience": [],
        "education": [],
        "skills": [],
        "achievements": [],
    }
    soup = _make_soup(raw_html)
    if soup is None:
        return profile
    profile["name"] = _select_first(soup, _NAME_SELECTORS)
    profile["career_summary"] = _select_first(soup, _SUMMARY_SELECTORS)
    profile["experience"] = _parse_experience(soup)
    profile["education"] = _parse_education(soup)
    profile["skills"] = _parse_skills(soup)
    profile["achievements"] = _parse_achievements(soup)
    return profile


# Private helpers for parse_html, one per profile section.
def _make_soup(raw_html):
    """Parse the markup with lxml, falling back to the stdlib parser; None if both fail."""
    markup = raw_html or ""
    for parser in ("lxml", "html.parser"):
        try:
            return BeautifulSoup(markup, parser)
        except Exception:
            # Typically the parser is not installed; try the next one.
            continue
    return None


def _first_element(item, selectors):
    """Return the first element matched by any selector (even if its text is empty), else None."""
    for selector in selectors:
        try:
            el = item.select_one(selector)
        except Exception:
            continue
        if el:
            return el
    return None


def _parse_experience(soup) -> list:
    """Build the experience entries; items with neither title nor company are skipped."""
    entries = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        if not (title or company):
            continue
        desc_el = _first_element(item, _EXP_DESC_SELECTORS)
        # Join text nodes with a space so adjacent inline nodes do not fuse.
        bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else []
        entries.append({
            "company": company,
            "title": title,
            "date_range": _date_range_text(item),
            "bullets": bullets,
        })
    return entries


def _parse_education(soup) -> list:
    """Build the education entries; items with neither school nor degree are skipped."""
    entries = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        if not (school or degree):
            continue
        dates_el = _first_element(item, _EDU_DATES_SELECTORS)
        entries.append({
            "school": school,
            "degree": degree,
            "field": "",  # not extracted by this parser
            "dates": dates_el.get_text(strip=True) if dates_el else "",
        })
    return entries


def _parse_skills(soup) -> list:
    """Return the non-empty skill labels, de-duplicated in first-seen order."""
    labels = (el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS))
    return list(dict.fromkeys(label for label in labels if label))


def _parse_achievements(soup) -> list:
    """Return the names of the certification entries that have one."""
    names = (
        _select_first(item, _CERT_NAME_SELECTORS)
        for item in _select_all(soup, _CERT_ITEM_SELECTORS)
    )
    return [name for name in names if name]