peregrine/app/components/linkedin_import.py
pyr0ball f403af4a31
Some checks failed
CI / test (push) Has been cancelled
fix(linkedin): update selectors for 2025 public DOM; surface login-wall limitation in UI
LinkedIn's unauthenticated public profile only exposes name, summary (truncated),
current employer name, and certifications. Past roles, education, and skills are
blurred server-side behind a login wall — not a scraper limitation.

- Update selectors: data-section='summary' (was 'about'), .profile-section-card
  for certs, .visible-list for current experience entry
- Strip login-wall noise injected into summary text after 'see more'
- Skip aria-hidden blurred placeholder experience items
- Add info callout in UI directing users to data export zip for full history
2026-03-13 19:47:21 -07:00

192 lines
7.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# app/components/linkedin_import.py
"""
Shared LinkedIn import widget.

Usage in a page:

    from app.components.linkedin_import import render_linkedin_tab

    # At top of page render — check for pending import:
    _li_data = st.session_state.pop("_linkedin_extracted", None)
    if _li_data:
        st.session_state["_parsed_resume"] = _li_data
        st.rerun()

    # Inside the LinkedIn tab:
    with tab_linkedin:
        render_linkedin_tab(config_dir=CONFIG_DIR, tier=tier)
"""
from __future__ import annotations
import json
import re
from datetime import datetime, timezone
from pathlib import Path
import streamlit as st
# Prefix pattern for a LinkedIn personal-profile URL: http or https scheme,
# optional "www.", then "linkedin.com/in/", compared case-insensitively.
# It does not constrain anything after "/in/".
_LINKEDIN_PROFILE_RE = re.compile(r"https?://(www\.)?linkedin\.com/in/", re.I)
def _stage_path(config_dir: Path) -> Path:
return config_dir / "linkedin_stage.json"
def _load_stage(config_dir: Path) -> dict | None:
    """Load the staged LinkedIn import from *config_dir*, if a usable one exists.

    Returns the parsed JSON object as a dict, or None when the stage file is
    missing, unreadable, not valid JSON, or holds JSON that is not an object.
    Loading is best-effort: failures are reported as "nothing staged" rather
    than raised.
    """
    path = _stage_path(config_dir)
    try:
        # EAFP: a missing file surfaces as FileNotFoundError (an OSError).
        # NOTE(review): read_text() uses the platform default encoding, kept
        # as-is because the code that writes the stage file is not visible here.
        data = json.loads(path.read_text())
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or
        # undecodable bytes. RecursionError: pathologically nested JSON.
        return None
    # Callers use dict methods such as .get() on the result, so valid JSON of
    # any other shape (list, string, number) is treated as "nothing staged".
    return data if isinstance(data, dict) else None
def _days_ago(iso_ts: str) -> str:
try:
dt = datetime.fromisoformat(iso_ts)
delta = datetime.now(timezone.utc) - dt
days = delta.days
if days == 0:
return "today"
if days == 1:
return "yesterday"
return f"{days} days ago"
except Exception:
return "unknown"
def _do_scrape(url: str, config_dir: Path) -> None:
    """Validate *url*, scrape the profile into the stage file, and report the outcome.

    Args:
        url: Candidate LinkedIn profile URL; must start with a
            ``linkedin.com/in/`` profile prefix (see ``_LINKEDIN_PROFILE_RE``).
        config_dir: Directory whose stage file receives the scrape result.

    On success a confirmation is shown and the script is rerun via
    ``st.rerun()``. Failures are reported in the UI rather than raised:
    ``ValueError`` as an error, ``RuntimeError`` as a warning, anything else
    as an "Unexpected error".
    """
    if not _LINKEDIN_PROFILE_RE.match(url):
        st.error("Please enter a LinkedIn profile URL (linkedin.com/in/…)")
        return

    with st.spinner("Fetching LinkedIn profile… (10–20 seconds)"):
        try:
            # Imported lazily, inside the try, so a failing import is shown
            # to the user as an error instead of breaking the page.
            from scripts.linkedin_scraper import scrape_profile

            scrape_profile(url, _stage_path(config_dir))
        except ValueError as e:
            st.error(str(e))
        except RuntimeError as e:
            st.warning(str(e))
        except Exception as e:
            st.error(f"Unexpected error: {e}")
        else:
            # Kept out of the try body so the rerun control flow can never be
            # mistaken for a scrape failure by the handlers above.
            st.success("Profile imported successfully.")
            st.rerun()
def render_linkedin_tab(config_dir: Path, tier: str) -> None:
    """
    Render the LinkedIn import UI.

    Sections, top to bottom:
      1. Status bar for previously staged data, with a Refresh button that
         re-scrapes the staged URL when one is recorded.
      2. Profile-URL import form plus a note about LinkedIn's public-profile
         limits.
      3. Preview of the staged data and a "Use this data" button.
      4. Collapsed expander for importing a LinkedIn data-export zip.

    When the user clicks "Use this data", writes the extracted dict to
    st.session_state["_linkedin_extracted"] and calls st.rerun().
    Caller reads: data = st.session_state.pop("_linkedin_extracted", None)

    Args:
        config_dir: Directory that holds the LinkedIn stage file.
        tier: Accepted for the caller's interface; not read by this function.
    """
    stage = _load_stage(config_dir)

    # ── Staged data status bar ────────────────────────────────────────────────
    if stage:
        scraped_at = stage.get("scraped_at", "")
        source_label = "LinkedIn export" if stage.get("source") == "export_zip" else "LinkedIn profile"
        col_info, col_refresh = st.columns([4, 1])
        col_info.caption(f"Last imported from {source_label}: {_days_ago(scraped_at)}")
        if col_refresh.button("🔄 Refresh", key="li_refresh"):
            url = stage.get("url")
            if url:
                _do_scrape(url, config_dir)
            else:
                # No URL is recorded in the stage, so there is nothing to re-scrape.
                st.info("Original URL not available — paste the URL below to re-import.")

    # ── URL import ────────────────────────────────────────────────────────────
    st.markdown("**Import from LinkedIn profile URL**")
    url_input = st.text_input(
        "LinkedIn profile URL",
        placeholder="https://linkedin.com/in/your-name",
        label_visibility="collapsed",
        key="li_url_input",
    )
    if st.button("🔗 Import from LinkedIn", key="li_import_btn", type="primary"):
        if not url_input.strip():
            st.warning("Please enter your LinkedIn profile URL.")
        else:
            _do_scrape(url_input.strip(), config_dir)
    st.caption(
        "Imports from your public LinkedIn profile. No login or credentials required. "
        "Scraping typically takes 10–20 seconds."
    )
    st.info(
        "**LinkedIn limits public profile data.** Without logging in, LinkedIn only "
        "exposes your name, About summary, current employer, and certifications — "
        "past roles, education, and skills are hidden behind their login wall. "
        "For your full career history use the **data export zip** option below.",
        icon="ℹ️",
    )

    # ── Section preview + use button ─────────────────────────────────────────
    if stage:
        from scripts.linkedin_parser import parse_stage

        extracted, err = parse_stage(_stage_path(config_dir))
        if err:
            st.warning(f"Could not read staged data: {err}")
        else:
            st.divider()
            st.markdown("**Preview**")
            col1, col2, col3 = st.columns(3)
            col1.metric("Experience entries", len(extracted.get("experience", [])))
            col2.metric("Skills", len(extracted.get("skills", [])))
            # The "Certifications" count is read from the "achievements" key.
            col3.metric("Certifications", len(extracted.get("achievements", [])))
            if extracted.get("career_summary"):
                with st.expander("Summary"):
                    st.write(extracted["career_summary"])
            if extracted.get("experience"):
                with st.expander(f"Experience ({len(extracted['experience'])} entries)"):
                    for exp in extracted["experience"]:
                        st.markdown(f"**{exp.get('title')}** @ {exp.get('company')} · {exp.get('date_range', '')}")
            if extracted.get("education"):
                with st.expander("Education"):
                    for edu in extracted["education"]:
                        st.markdown(f"**{edu.get('school')}** — {edu.get('degree')} {edu.get('field', '')}".strip())
            if extracted.get("skills"):
                with st.expander("Skills"):
                    st.write(", ".join(extracted["skills"]))
            st.divider()
            if st.button("✅ Use this data", key="li_use_btn", type="primary"):
                # Hand the data to the caller via session state, then rerun so
                # the page can pick it up.
                st.session_state["_linkedin_extracted"] = extracted
                st.rerun()

    # ── Advanced: data export ─────────────────────────────────────────────────
    with st.expander("⬇️ Import from LinkedIn data export (advanced)", expanded=False):
        st.caption(
            "Download your LinkedIn data: **Settings & Privacy → Data Privacy → "
            "Get a copy of your data → Request archive → Fast file**. "
            "The Fast file is available immediately and contains your profile, "
            "experience, education, and skills."
        )
        zip_file = st.file_uploader(
            "Upload LinkedIn export zip", type=["zip"], key="li_zip_upload"
        )
        if zip_file is not None:
            if st.button("📦 Parse export", key="li_parse_zip"):
                with st.spinner("Parsing export archive…"):
                    try:
                        from scripts.linkedin_scraper import parse_export_zip

                        extracted = parse_export_zip(
                            zip_file.read(), _stage_path(config_dir)
                        )
                        st.success(
                            f"Imported {len(extracted.get('experience', []))} experience entries, "
                            f"{len(extracted.get('skills', []))} skills. "
                            "Click 'Use this data' above to apply."
                        )
                        st.rerun()
                    except Exception as e:
                        st.error(f"Failed to parse export: {e}")