From 2c61d4038f67d6f8a18963eda71bc2fd0c82aec7 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Fri, 13 Mar 2026 19:47:21 -0700 Subject: [PATCH] fix(linkedin): update selectors for 2025 public DOM; surface login-wall limitation in UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LinkedIn's unauthenticated public profile only exposes name, summary (truncated), current employer name, and certifications. Past roles, education, and skills are blurred server-side behind a login wall — not a scraper limitation. - Update selectors: data-section='summary' (was 'about'), .profile-section-card for certs, .visible-list for current experience entry - Strip login-wall noise injected into summary text after 'see more' - Skip aria-hidden blurred placeholder experience items - Add info callout in UI directing users to data export zip for full history --- app/components/linkedin_import.py | 7 ++++ scripts/linkedin_utils.py | 59 ++++++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/app/components/linkedin_import.py b/app/components/linkedin_import.py index 3674ae5..93e7875 100644 --- a/app/components/linkedin_import.py +++ b/app/components/linkedin_import.py @@ -117,6 +117,13 @@ def render_linkedin_tab(config_dir: Path, tier: str) -> None: "Imports from your public LinkedIn profile. No login or credentials required. " "Scraping typically takes 10–20 seconds." ) + st.info( + "**LinkedIn limits public profile data.** Without logging in, LinkedIn only " + "exposes your name, About summary, current employer, and certifications — " + "past roles, education, and skills are hidden behind their login wall. 
" + "For your full career history use the **data export zip** option below.", + icon="ℹ️", + ) # ── Section preview + use button ───────────────────────────────────────── if stage: diff --git a/scripts/linkedin_utils.py b/scripts/linkedin_utils.py index 5eb4f52..657c662 100644 --- a/scripts/linkedin_utils.py +++ b/scripts/linkedin_utils.py @@ -5,7 +5,18 @@ LinkedIn profile HTML parser. Extracts structured profile data from a raw LinkedIn public profile page. No Playwright dependency — importable by both linkedin_scraper and linkedin_parser. -Selectors target the 2024-2025 LinkedIn public profile DOM. +** LinkedIn public profile limitations (2025) ** +Unauthenticated requests receive a degraded page where experience titles, past +roles, education detail, and skills are replaced with blur placeholders or omitted +entirely. Only the following are reliably available without login: + - Name + headline (top card) + - About/summary (truncated; login prompt injected after "see more") + - Current employer name only (no title, dates, or description) + - Certifications/licenses (if publicly listed) + - Volunteer experience, publications, projects (if public) +For full profile data use the LinkedIn data export zip path instead. + +Selectors target the 2025 LinkedIn public profile DOM. When LinkedIn changes their markup, update the selector lists here only. Each section uses ordered fallbacks — first matching selector wins. 
""" @@ -13,6 +24,11 @@ from __future__ import annotations import re from bs4 import BeautifulSoup +# Noise phrases injected by LinkedIn's login wall — stripped from summary text +_LOGIN_NOISE = re.compile( + r"see more.*$|welcome back.*$|sign in.*$|by clicking.*$|new to linkedin.*$", + re.I | re.S, +) # ── Selector fallback lists ──────────────────────────────────────────────────── @@ -23,25 +39,31 @@ _NAME_SELECTORS = [ "h1", ] +# 2025 DOM: data-section="summary" (not "about") +_SUMMARY_SECTION_SELECTOR = "section[data-section='summary'] .core-section-container__content" _SUMMARY_SELECTORS = [ + "section[data-section='summary'] .core-section-container__content", + "section[data-section='about'] .core-section-container__content", "section[data-section='about'] .show-more-less-text__text--less", "section[data-section='about'] p", - "#about ~ * p.show-more-less-text__text--less", ".pv-about-section p", ] +# 2025 DOM: experience lives in .visible-list inside .experience-education section. +# Only the current employer h3 is unblurred; past roles use aria-hidden blurred-list. 
_EXPERIENCE_ITEM_SELECTORS = [ + "section.experience-education .visible-list li.profile-section-card", "section[data-section='experience'] li.experience-item", "section[data-section='experience'] li", "#experience-section li", - "#experience ~ * li", ] -_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"] -_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"] +_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']"] +_EXP_COMPANY_SELECTORS = ["h3", "span.experience-item__subtitle", "span[class*='subtitle']"] _EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"] -_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"] +_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']"] +# 2025 DOM: education is also blurred; top-card shows most recent school only _EDUCATION_ITEM_SELECTORS = [ "section[data-section='education'] li.education__list-item", "section[data-section='education'] li", @@ -52,6 +74,7 @@ _EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3 _EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"] _EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"] +# Skills are not present on the 2025 unauthenticated public profile page _SKILLS_SELECTORS = [ "section[data-section='skills'] span.mr1", "section[data-section='skills'] li span[class*='bold']", @@ -59,12 +82,14 @@ _SKILLS_SELECTORS = [ "#skills ~ * li span", ] +# 2025 DOM: certifications use li.profile-section-card with h3 for name _CERT_ITEM_SELECTORS = [ + "section[data-section='certifications'] li.profile-section-card", "section[data-section='certifications'] li", "#certifications ~ * li", "#licenses_and_certifications ~ * li", ] -_CERT_NAME_SELECTORS = 
["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"] +_CERT_NAME_SELECTORS = ["h3", "h3.certifications__name", "h3[class*='name']", "span[class*='title']"] # ── Helpers ─────────────────────────────────────────────────────────────────── @@ -126,12 +151,30 @@ def parse_html(raw_html: str) -> dict: soup = BeautifulSoup(raw_html, "lxml") name = _select_first(soup, _NAME_SELECTORS) - career_summary = _select_first(soup, _SUMMARY_SELECTORS) + + # Summary: strip login-wall noise injected after "see more" + career_summary = "" + for sel in _SUMMARY_SELECTORS: + try: + el = soup.select_one(sel) + if el: + raw_text = el.get_text(" ", strip=True) + career_summary = _LOGIN_NOISE.sub("", raw_text).strip() + if career_summary: + break + except Exception: + continue experience = [] for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS): + # Skip blurred items (aria-hidden list shown as decorative background) + if item.get("aria-hidden") == "true": + continue title = _select_first(item, _EXP_TITLE_SELECTORS) company = _select_first(item, _EXP_COMPANY_SELECTORS) + # Clear the title when it is pure asterisks (blurred placeholder); the entry itself is not skipped here + if title and re.fullmatch(r"[\*\s]+", title): + title = "" dates = _date_range_text(item) desc_el = None for sel in _EXP_DESC_SELECTORS: