From 2c61d4038f67d6f8a18963eda71bc2fd0c82aec7 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Fri, 13 Mar 2026 19:47:21 -0700
Subject: [PATCH] fix(linkedin): update selectors for 2025 public DOM; surface
 login-wall limitation in UI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LinkedIn's unauthenticated public profile only exposes name, summary (truncated),
current employer name, and certifications. Past roles, education, and skills are
blurred server-side behind a login wall — not a scraper limitation.

- Update selectors: data-section='summary' (was 'about'), .profile-section-card
  for certs, .visible-list for current experience entry
- Strip login-wall noise injected into summary text after 'see more'
- Skip aria-hidden blurred placeholder experience items
- Add info callout in UI directing users to data export zip for full history
---
 app/components/linkedin_import.py |  7 ++++
 scripts/linkedin_utils.py         | 59 ++++++++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/app/components/linkedin_import.py b/app/components/linkedin_import.py
index 3674ae5..93e7875 100644
--- a/app/components/linkedin_import.py
+++ b/app/components/linkedin_import.py
@@ -117,6 +117,13 @@ def render_linkedin_tab(config_dir: Path, tier: str) -> None:
         "Imports from your public LinkedIn profile. No login or credentials required. "
         "Scraping typically takes 10–20 seconds."
     )
+    st.info(
+        "**LinkedIn limits public profile data.** Without logging in, LinkedIn only "
+        "exposes your name, About summary, current employer, and certifications — "
+        "past roles, education, and skills are hidden behind their login wall. "
+        "For your full career history use the **data export zip** option below.",
+        icon="ℹ️",
+    )
 
     # ── Section preview + use button ─────────────────────────────────────────
     if stage:
diff --git a/scripts/linkedin_utils.py b/scripts/linkedin_utils.py
index 5eb4f52..657c662 100644
--- a/scripts/linkedin_utils.py
+++ b/scripts/linkedin_utils.py
@@ -5,7 +5,18 @@ LinkedIn profile HTML parser.
 Extracts structured profile data from a raw LinkedIn public profile page.
 No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.
 
-Selectors target the 2024-2025 LinkedIn public profile DOM.
+** LinkedIn public profile limitations (2025) **
+Unauthenticated requests receive a degraded page where experience titles, past
+roles, education detail, and skills are replaced with blur placeholders or omitted
+entirely.  Only the following are reliably available without login:
+  - Name + headline (top card)
+  - About/summary (truncated; login prompt injected after "see more")
+  - Current employer name only (no title, dates, or description)
+  - Certifications/licenses (if publicly listed)
+  - Volunteer experience, publications, projects (if public)
+For full profile data use the LinkedIn data export zip path instead.
+
+Selectors target the 2025 LinkedIn public profile DOM.
 When LinkedIn changes their markup, update the selector lists here only.
 Each section uses ordered fallbacks — first matching selector wins.
 """
@@ -13,6 +24,11 @@ from __future__ import annotations
 import re
 from bs4 import BeautifulSoup
 
+# Noise phrases injected by LinkedIn's login wall — stripped from summary text
+_LOGIN_NOISE = re.compile(
+    r"see more.*$|welcome back.*$|sign in.*$|by clicking.*$|new to linkedin.*$",
+    re.I | re.S,
+)
 
 # ── Selector fallback lists ────────────────────────────────────────────────────
 
@@ -23,25 +39,31 @@ _NAME_SELECTORS = [
     "h1",
 ]
 
+# 2025 DOM: data-section="summary" (not "about")
+_SUMMARY_SECTION_SELECTOR = "section[data-section='summary'] .core-section-container__content"
 _SUMMARY_SELECTORS = [
+    "section[data-section='summary'] .core-section-container__content",
+    "section[data-section='about'] .core-section-container__content",
     "section[data-section='about'] .show-more-less-text__text--less",
     "section[data-section='about'] p",
-    "#about ~ * p.show-more-less-text__text--less",
     ".pv-about-section p",
 ]
 
+# 2025 DOM: experience lives in .visible-list inside .experience-education section.
+# Only the current employer h3 is unblurred; past roles use aria-hidden blurred-list.
 _EXPERIENCE_ITEM_SELECTORS = [
+    "section.experience-education .visible-list li.profile-section-card",
     "section[data-section='experience'] li.experience-item",
     "section[data-section='experience'] li",
     "#experience-section li",
-    "#experience ~ * li",
 ]
 
-_EXP_TITLE_SELECTORS   = ["span.experience-item__title", "span[class*='title']", "h3"]
-_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"]
+_EXP_TITLE_SELECTORS   = ["span.experience-item__title", "span[class*='title']"]
+_EXP_COMPANY_SELECTORS = ["h3", "span.experience-item__subtitle", "span[class*='subtitle']"]
 _EXP_DATE_SELECTORS    = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
-_EXP_DESC_SELECTORS    = [".show-more-less-text__text--less", "p[class*='description']", "p"]
+_EXP_DESC_SELECTORS    = [".show-more-less-text__text--less", "p[class*='description']"]
 
+# 2025 DOM: education is also blurred; top-card shows most recent school only
 _EDUCATION_ITEM_SELECTORS = [
     "section[data-section='education'] li.education__list-item",
     "section[data-section='education'] li",
@@ -52,6 +74,7 @@ _EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3
 _EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
 _EDU_DATES_SELECTORS  = ["span.education__item--duration", "span[class*='duration']", "time"]
 
+# Skills are not present on the 2025 unauthenticated public profile page
 _SKILLS_SELECTORS = [
     "section[data-section='skills'] span.mr1",
     "section[data-section='skills'] li span[class*='bold']",
@@ -59,12 +82,14 @@ _SKILLS_SELECTORS = [
     "#skills ~ * li span",
 ]
 
+# 2025 DOM: certifications use li.profile-section-card with h3 for name
 _CERT_ITEM_SELECTORS = [
+    "section[data-section='certifications'] li.profile-section-card",
     "section[data-section='certifications'] li",
     "#certifications ~ * li",
     "#licenses_and_certifications ~ * li",
 ]
-_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"]
+_CERT_NAME_SELECTORS = ["h3", "h3.certifications__name", "h3[class*='name']", "span[class*='title']"]
 
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
@@ -126,12 +151,30 @@ def parse_html(raw_html: str) -> dict:
     soup = BeautifulSoup(raw_html, "lxml")
 
     name = _select_first(soup, _NAME_SELECTORS)
-    career_summary = _select_first(soup, _SUMMARY_SELECTORS)
+
+    # Summary: strip login-wall noise injected after "see more"
+    career_summary = ""
+    for sel in _SUMMARY_SELECTORS:
+        try:
+            el = soup.select_one(sel)
+            if el:
+                raw_text = el.get_text(" ", strip=True)
+                career_summary = _LOGIN_NOISE.sub("", raw_text).strip()
+                if career_summary:
+                    break
+        except Exception:
+            continue
 
     experience = []
     for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
+        # Skip blurred items (aria-hidden list shown as decorative background)
+        if item.get("aria-hidden") == "true":
+            continue
         title   = _select_first(item, _EXP_TITLE_SELECTORS)
         company = _select_first(item, _EXP_COMPANY_SELECTORS)
+        # Skip entries where the title text is pure asterisks (blurred placeholder)
+        if title and re.fullmatch(r"[\*\s]+", title):
+            title = ""
         dates   = _date_range_text(item)
         desc_el = None
         for sel in _EXP_DESC_SELECTORS: