Compare commits
12 commits
95c5a12196
...
7d15980bdd
| Author | SHA1 | Date | |
|---|---|---|---|
| 7d15980bdd | |||
| 9603d591a3 | |||
| f3617abb6b | |||
| 6b59804d35 | |||
| 7b9e758861 | |||
| 070be6c2e9 | |||
| 083dff2ec8 | |||
| ac1db1ea7f | |||
| 260d186c86 | |||
| 04d0a66f21 | |||
| 32ed451933 | |||
| 6c61290218 |
18 changed files with 1312 additions and 78 deletions
|
|
@ -10,8 +10,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
# Install Python dependencies
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install Playwright browser (cached separately from Python deps so requirements
|
||||
# changes don't bust the ~600–900 MB Chromium layer and vice versa)
|
||||
RUN playwright install chromium && playwright install-deps chromium
|
||||
|
||||
# Bundle companyScraper (company research web scraper)
|
||||
COPY scrapers/ /app/scrapers/
|
||||
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ _SETUP_BANNERS = [
|
|||
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
||||
"link_label": "Settings → Fine-Tune"},
|
||||
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
||||
"link_label": "Settings → AIHawk"},
|
||||
"link_label": "Settings → Integrations"},
|
||||
{"key": "setup_searxng", "text": "Set up company research with SearXNG",
|
||||
"link_label": "Settings → Services"},
|
||||
{"key": "target_companies", "text": "Build a target company list for focused outreach",
|
||||
|
|
|
|||
|
|
@ -22,11 +22,11 @@ IS_DEMO = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
|||
import streamlit as st
|
||||
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
||||
from app.feedback import inject_feedback_button
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir
|
||||
import sqlite3
|
||||
|
||||
st.set_page_config(
|
||||
page_title="Job Seeker",
|
||||
page_title="Peregrine",
|
||||
page_icon="💼",
|
||||
layout="wide",
|
||||
)
|
||||
|
|
@ -80,7 +80,7 @@ except Exception:
|
|||
|
||||
# ── First-run wizard gate ───────────────────────────────────────────────────────
|
||||
from scripts.user_profile import UserProfile as _UserProfile
|
||||
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
|
||||
_USER_YAML = get_config_dir() / "user.yaml"
|
||||
|
||||
_show_wizard = not IS_DEMO and (
|
||||
not _UserProfile.exists(_USER_YAML)
|
||||
|
|
|
|||
|
|
@ -112,13 +112,19 @@ def resolve_session(app: str = "peregrine") -> None:
|
|||
cookie_header = st.context.headers.get("x-cf-session", "")
|
||||
session_jwt = _extract_session_token(cookie_header)
|
||||
if not session_jwt:
|
||||
st.error("Session token missing. Please log in at circuitforge.tech.")
|
||||
st.components.v1.html(
|
||||
'<script>window.top.location.href = "https://circuitforge.tech/login";</script>',
|
||||
height=0,
|
||||
)
|
||||
st.stop()
|
||||
|
||||
try:
|
||||
user_id = validate_session_jwt(session_jwt)
|
||||
except Exception as exc:
|
||||
st.error(f"Invalid session — please log in again. ({exc})")
|
||||
except Exception:
|
||||
st.components.v1.html(
|
||||
'<script>window.top.location.href = "https://circuitforge.tech/login";</script>',
|
||||
height=0,
|
||||
)
|
||||
st.stop()
|
||||
|
||||
user_path = _user_data_path(user_id, app)
|
||||
|
|
@ -141,6 +147,19 @@ def get_db_path() -> Path:
|
|||
return st.session_state.get("db_path", DEFAULT_DB)
|
||||
|
||||
|
||||
def get_config_dir() -> Path:
|
||||
"""
|
||||
Return the config directory for this session.
|
||||
Cloud: per-user path (<data_root>/<user_id>/peregrine/config/) so each
|
||||
user's YAML files (user.yaml, plain_text_resume.yaml, etc.) are
|
||||
isolated and never shared across tenants.
|
||||
Local: repo-level config/ directory.
|
||||
"""
|
||||
if CLOUD_MODE and st.session_state.get("db_path"):
|
||||
return Path(st.session_state["db_path"]).parent / "config"
|
||||
return Path(__file__).parent.parent.parent / "config"
|
||||
|
||||
|
||||
def get_cloud_tier() -> str:
|
||||
"""
|
||||
Return the current user's cloud tier.
|
||||
|
|
|
|||
1
app/components/__init__.py
Normal file
1
app/components/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# app/components/__init__.py
|
||||
185
app/components/linkedin_import.py
Normal file
185
app/components/linkedin_import.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
# app/components/linkedin_import.py
|
||||
"""
|
||||
Shared LinkedIn import widget.
|
||||
|
||||
Usage in a page:
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
|
||||
# At top of page render — check for pending import:
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
st.session_state["_parsed_resume"] = _li_data
|
||||
st.rerun()
|
||||
|
||||
# Inside the LinkedIn tab:
|
||||
with tab_linkedin:
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=tier)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
_LINKEDIN_PROFILE_RE = re.compile(r"https?://(www\.)?linkedin\.com/in/", re.I)
|
||||
|
||||
|
||||
def _stage_path(config_dir: Path) -> Path:
|
||||
return config_dir / "linkedin_stage.json"
|
||||
|
||||
|
||||
def _load_stage(config_dir: Path) -> dict | None:
|
||||
path = _stage_path(config_dir)
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
return json.loads(path.read_text())
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _days_ago(iso_ts: str) -> str:
|
||||
try:
|
||||
dt = datetime.fromisoformat(iso_ts)
|
||||
delta = datetime.now(timezone.utc) - dt
|
||||
days = delta.days
|
||||
if days == 0:
|
||||
return "today"
|
||||
if days == 1:
|
||||
return "yesterday"
|
||||
return f"{days} days ago"
|
||||
except Exception:
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _do_scrape(url: str, config_dir: Path) -> None:
|
||||
"""Validate URL, run scrape, update state."""
|
||||
if not _LINKEDIN_PROFILE_RE.match(url):
|
||||
st.error("Please enter a LinkedIn profile URL (linkedin.com/in/…)")
|
||||
return
|
||||
|
||||
with st.spinner("Fetching LinkedIn profile… (10–20 seconds)"):
|
||||
try:
|
||||
from scripts.linkedin_scraper import scrape_profile
|
||||
scrape_profile(url, _stage_path(config_dir))
|
||||
st.success("Profile imported successfully.")
|
||||
st.rerun()
|
||||
except ValueError as e:
|
||||
st.error(str(e))
|
||||
except RuntimeError as e:
|
||||
st.warning(str(e))
|
||||
except Exception as e:
|
||||
st.error(f"Unexpected error: {e}")
|
||||
|
||||
|
||||
def render_linkedin_tab(config_dir: Path, tier: str) -> None:
|
||||
"""
|
||||
Render the LinkedIn import UI.
|
||||
|
||||
When the user clicks "Use this data", writes the extracted dict to
|
||||
st.session_state["_linkedin_extracted"] and calls st.rerun().
|
||||
|
||||
Caller reads: data = st.session_state.pop("_linkedin_extracted", None)
|
||||
"""
|
||||
stage = _load_stage(config_dir)
|
||||
|
||||
# ── Staged data status bar ────────────────────────────────────────────────
|
||||
if stage:
|
||||
scraped_at = stage.get("scraped_at", "")
|
||||
source_label = "LinkedIn export" if stage.get("source") == "export_zip" else "LinkedIn profile"
|
||||
col_info, col_refresh = st.columns([4, 1])
|
||||
col_info.caption(f"Last imported from {source_label}: {_days_ago(scraped_at)}")
|
||||
if col_refresh.button("🔄 Refresh", key="li_refresh"):
|
||||
url = stage.get("url")
|
||||
if url:
|
||||
_do_scrape(url, config_dir)
|
||||
else:
|
||||
st.info("Original URL not available — paste the URL below to re-import.")
|
||||
|
||||
# ── URL import ────────────────────────────────────────────────────────────
|
||||
st.markdown("**Import from LinkedIn profile URL**")
|
||||
url_input = st.text_input(
|
||||
"LinkedIn profile URL",
|
||||
placeholder="https://linkedin.com/in/your-name",
|
||||
label_visibility="collapsed",
|
||||
key="li_url_input",
|
||||
)
|
||||
if st.button("🔗 Import from LinkedIn", key="li_import_btn", type="primary"):
|
||||
if not url_input.strip():
|
||||
st.warning("Please enter your LinkedIn profile URL.")
|
||||
else:
|
||||
_do_scrape(url_input.strip(), config_dir)
|
||||
|
||||
st.caption(
|
||||
"Imports from your public LinkedIn profile. No login or credentials required. "
|
||||
"Scraping typically takes 10–20 seconds."
|
||||
)
|
||||
|
||||
# ── Section preview + use button ─────────────────────────────────────────
|
||||
if stage:
|
||||
from scripts.linkedin_parser import parse_stage
|
||||
extracted, err = parse_stage(_stage_path(config_dir))
|
||||
|
||||
if err:
|
||||
st.warning(f"Could not read staged data: {err}")
|
||||
else:
|
||||
st.divider()
|
||||
st.markdown("**Preview**")
|
||||
col1, col2, col3 = st.columns(3)
|
||||
col1.metric("Experience entries", len(extracted.get("experience", [])))
|
||||
col2.metric("Skills", len(extracted.get("skills", [])))
|
||||
col3.metric("Certifications", len(extracted.get("achievements", [])))
|
||||
|
||||
if extracted.get("career_summary"):
|
||||
with st.expander("Summary"):
|
||||
st.write(extracted["career_summary"])
|
||||
|
||||
if extracted.get("experience"):
|
||||
with st.expander(f"Experience ({len(extracted['experience'])} entries)"):
|
||||
for exp in extracted["experience"]:
|
||||
st.markdown(f"**{exp.get('title')}** @ {exp.get('company')} · {exp.get('date_range', '')}")
|
||||
|
||||
if extracted.get("education"):
|
||||
with st.expander("Education"):
|
||||
for edu in extracted["education"]:
|
||||
st.markdown(f"**{edu.get('school')}** — {edu.get('degree')} {edu.get('field', '')}".strip())
|
||||
|
||||
if extracted.get("skills"):
|
||||
with st.expander("Skills"):
|
||||
st.write(", ".join(extracted["skills"]))
|
||||
|
||||
st.divider()
|
||||
if st.button("✅ Use this data", key="li_use_btn", type="primary"):
|
||||
st.session_state["_linkedin_extracted"] = extracted
|
||||
st.rerun()
|
||||
|
||||
# ── Advanced: data export ─────────────────────────────────────────────────
|
||||
with st.expander("⬇️ Import from LinkedIn data export (advanced)", expanded=False):
|
||||
st.caption(
|
||||
"Download your LinkedIn data: **Settings & Privacy → Data Privacy → "
|
||||
"Get a copy of your data → Request archive → Fast file**. "
|
||||
"The Fast file is available immediately and contains your profile, "
|
||||
"experience, education, and skills."
|
||||
)
|
||||
zip_file = st.file_uploader(
|
||||
"Upload LinkedIn export zip", type=["zip"], key="li_zip_upload"
|
||||
)
|
||||
if zip_file is not None:
|
||||
if st.button("📦 Parse export", key="li_parse_zip"):
|
||||
with st.spinner("Parsing export archive…"):
|
||||
try:
|
||||
from scripts.linkedin_scraper import parse_export_zip
|
||||
extracted = parse_export_zip(
|
||||
zip_file.read(), _stage_path(config_dir)
|
||||
)
|
||||
st.success(
|
||||
f"Imported {len(extracted.get('experience', []))} experience entries, "
|
||||
f"{len(extracted.get('skills', []))} skills. "
|
||||
"Click 'Use this data' above to apply."
|
||||
)
|
||||
st.rerun()
|
||||
except Exception as e:
|
||||
st.error(f"Failed to parse export: {e}")
|
||||
|
|
@ -15,14 +15,14 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|||
import streamlit as st
|
||||
import yaml
|
||||
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir
|
||||
resolve_session("peregrine")
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
CONFIG_DIR = _ROOT / "config"
|
||||
CONFIG_DIR = get_config_dir() # per-user dir in cloud; repo config/ locally
|
||||
USER_YAML = CONFIG_DIR / "user.yaml"
|
||||
STEPS = 6 # mandatory steps
|
||||
STEP_LABELS = ["Hardware", "Tier", "Identity", "Resume", "Inference", "Search"]
|
||||
STEP_LABELS = ["Hardware", "Tier", "Resume", "Identity", "Inference", "Search"]
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -179,6 +179,13 @@ st.divider()
|
|||
|
||||
# ── Step 1: Hardware ───────────────────────────────────────────────────────────
|
||||
if step == 1:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud deployment: always single-gpu (Heimdall), skip hardware selection
|
||||
_save_yaml({"inference_profile": "single-gpu", "wizard_step": 1})
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_hardware import validate, PROFILES
|
||||
|
||||
st.subheader("Step 1 \u2014 Hardware Detection")
|
||||
|
|
@ -212,6 +219,14 @@ if step == 1:
|
|||
|
||||
# ── Step 2: Tier ───────────────────────────────────────────────────────────────
|
||||
elif step == 2:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud mode: tier already resolved from Heimdall at session init
|
||||
cloud_tier = st.session_state.get("cloud_tier", "free")
|
||||
_save_yaml({"tier": cloud_tier, "wizard_step": 2})
|
||||
st.session_state.wizard_step = 3
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_tier import validate
|
||||
|
||||
st.subheader("Step 2 \u2014 Choose Your Plan")
|
||||
|
|
@ -248,63 +263,21 @@ elif step == 2:
|
|||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 3: Identity ───────────────────────────────────────────────────────────
|
||||
# ── Step 3: Resume ─────────────────────────────────────────────────────────────
|
||||
elif step == 3:
|
||||
from app.wizard.step_identity import validate
|
||||
|
||||
st.subheader("Step 3 \u2014 Your Identity")
|
||||
st.caption("Used in cover letter PDFs, LLM prompts, and the app header.")
|
||||
|
||||
c1, c2 = st.columns(2)
|
||||
name = c1.text_input("Full Name *", saved_yaml.get("name", ""))
|
||||
email = c1.text_input("Email *", saved_yaml.get("email", ""))
|
||||
phone = c2.text_input("Phone", saved_yaml.get("phone", ""))
|
||||
linkedin = c2.text_input("LinkedIn URL", saved_yaml.get("linkedin", ""))
|
||||
|
||||
# Career summary with optional LLM generation
|
||||
summary_default = st.session_state.get("_gen_result_career_summary") or saved_yaml.get("career_summary", "")
|
||||
summary = st.text_area(
|
||||
"Career Summary *", value=summary_default, height=120,
|
||||
placeholder="Experienced professional with X years in [field]. Specialise in [skills].",
|
||||
help="Injected into cover letter and research prompts as your professional context.",
|
||||
)
|
||||
|
||||
gen_result = _generation_widget(
|
||||
section="career_summary",
|
||||
label="Generate from resume",
|
||||
tier=_tier,
|
||||
feature_key="llm_career_summary",
|
||||
input_data={"resume_text": saved_yaml.get("_raw_resume_text", "")},
|
||||
)
|
||||
if gen_result and gen_result != summary:
|
||||
st.info(f"\u2728 Suggested summary \u2014 paste it above if it looks good:\n\n{gen_result}")
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="ident_back"):
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="ident_next"):
|
||||
errs = validate({"name": name, "email": email, "career_summary": summary})
|
||||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
_save_yaml({
|
||||
"name": name, "email": email, "phone": phone,
|
||||
"linkedin": linkedin, "career_summary": summary,
|
||||
"wizard_complete": False, "wizard_step": 3,
|
||||
})
|
||||
st.session_state.wizard_step = 4
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 4: Resume ─────────────────────────────────────────────────────────────
|
||||
elif step == 4:
|
||||
from app.wizard.step_resume import validate
|
||||
|
||||
st.subheader("Step 4 \u2014 Resume")
|
||||
st.subheader("Step 3 \u2014 Resume")
|
||||
st.caption("Upload your resume for fast parsing, or build it section by section.")
|
||||
|
||||
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
|
||||
# Read LinkedIn import result before tabs render (spec: "at step render time")
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
st.session_state["_parsed_resume"] = _li_data
|
||||
|
||||
tab_upload, tab_builder, tab_linkedin = st.tabs([
|
||||
"\U0001f4ce Upload", "\U0001f4dd Build Manually", "\U0001f517 LinkedIn"
|
||||
])
|
||||
|
||||
with tab_upload:
|
||||
uploaded = st.file_uploader("Upload PDF, DOCX, or ODT", type=["pdf", "docx", "odt"])
|
||||
|
|
@ -393,9 +366,13 @@ elif step == 4:
|
|||
input_data={"bullet_notes": all_bullets},
|
||||
)
|
||||
|
||||
with tab_linkedin:
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=_tier)
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="resume_back"):
|
||||
st.session_state.wizard_step = 3
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="resume_next"):
|
||||
parsed = st.session_state.get("_parsed_resume", {})
|
||||
|
|
@ -407,19 +384,75 @@ elif step == 4:
|
|||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml"
|
||||
resume_yaml_path = CONFIG_DIR / "plain_text_resume.yaml"
|
||||
resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
|
||||
resume_yaml_path.write_text(
|
||||
yaml.dump(resume_data, default_flow_style=False, allow_unicode=True)
|
||||
)
|
||||
_save_yaml({"wizard_step": 4})
|
||||
_save_yaml({"wizard_step": 3})
|
||||
st.session_state.wizard_step = 4
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 4: Identity ───────────────────────────────────────────────────────────
|
||||
elif step == 4:
|
||||
from app.wizard.step_identity import validate
|
||||
|
||||
st.subheader("Step 4 \u2014 Your Identity")
|
||||
st.caption("Used in cover letter PDFs, LLM prompts, and the app header.")
|
||||
|
||||
c1, c2 = st.columns(2)
|
||||
name = c1.text_input("Full Name *", saved_yaml.get("name", ""))
|
||||
email = c1.text_input("Email *", saved_yaml.get("email", ""))
|
||||
phone = c2.text_input("Phone", saved_yaml.get("phone", ""))
|
||||
linkedin = c2.text_input("LinkedIn URL", saved_yaml.get("linkedin", ""))
|
||||
|
||||
# Career summary with optional LLM generation — resume text available now (step 3 ran first)
|
||||
summary_default = st.session_state.get("_gen_result_career_summary") or saved_yaml.get("career_summary", "")
|
||||
summary = st.text_area(
|
||||
"Career Summary *", value=summary_default, height=120,
|
||||
placeholder="Experienced professional with X years in [field]. Specialise in [skills].",
|
||||
help="Injected into cover letter and research prompts as your professional context.",
|
||||
)
|
||||
|
||||
gen_result = _generation_widget(
|
||||
section="career_summary",
|
||||
label="Generate from resume",
|
||||
tier=_tier,
|
||||
feature_key="llm_career_summary",
|
||||
input_data={"resume_text": saved_yaml.get("_raw_resume_text", "")},
|
||||
)
|
||||
if gen_result and gen_result != summary:
|
||||
st.info(f"\u2728 Suggested summary \u2014 paste it above if it looks good:\n\n{gen_result}")
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="ident_back"):
|
||||
st.session_state.wizard_step = 3
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="ident_next"):
|
||||
errs = validate({"name": name, "email": email, "career_summary": summary})
|
||||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
_save_yaml({
|
||||
"name": name, "email": email, "phone": phone,
|
||||
"linkedin": linkedin, "career_summary": summary,
|
||||
"wizard_complete": False, "wizard_step": 4,
|
||||
})
|
||||
st.session_state.wizard_step = 5
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 5: Inference ──────────────────────────────────────────────────────────
|
||||
elif step == 5:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud deployment: inference is managed server-side; skip this step
|
||||
_save_yaml({"wizard_step": 5})
|
||||
st.session_state.wizard_step = 6
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_inference import validate
|
||||
|
||||
st.subheader("Step 5 \u2014 Inference & API Keys")
|
||||
|
|
|
|||
|
|
@ -12,23 +12,24 @@ import yaml
|
|||
import os as _os
|
||||
|
||||
from scripts.user_profile import UserProfile
|
||||
from app.cloud_session import resolve_session, get_db_path, CLOUD_MODE
|
||||
|
||||
_USER_YAML = Path(__file__).parent.parent.parent / "config" / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Job Seeker"
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir, CLOUD_MODE
|
||||
|
||||
resolve_session("peregrine")
|
||||
st.title("⚙️ Settings")
|
||||
|
||||
CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
|
||||
# Config paths — per-user directory in cloud mode, shared repo config/ locally
|
||||
CONFIG_DIR = get_config_dir()
|
||||
SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
|
||||
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
|
||||
LLM_CFG = CONFIG_DIR / "llm.yaml"
|
||||
NOTION_CFG = CONFIG_DIR / "notion.yaml"
|
||||
RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
|
||||
RESUME_PATH = CONFIG_DIR / "plain_text_resume.yaml"
|
||||
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
||||
|
||||
_USER_YAML = CONFIG_DIR / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Peregrine User"
|
||||
|
||||
def load_yaml(path: Path) -> dict:
|
||||
if path.exists():
|
||||
return yaml.safe_load(path.read_text()) or {}
|
||||
|
|
@ -54,8 +55,9 @@ def _suggest_search_terms(current_titles, resume_path, blocklist=None, user_prof
|
|||
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
||||
|
||||
USER_CFG = CONFIG_DIR / "user.yaml"
|
||||
SERVER_CFG = CONFIG_DIR / "server.yaml"
|
||||
SERVER_CFG_EXAMPLE = CONFIG_DIR / "server.yaml.example"
|
||||
# Server config is always repo-level — it controls the container, not the user
|
||||
SERVER_CFG = Path(__file__).parent.parent.parent / "config" / "server.yaml"
|
||||
SERVER_CFG_EXAMPLE = Path(__file__).parent.parent.parent / "config" / "server.yaml.example"
|
||||
|
||||
_dev_mode = _os.getenv("DEV_MODE", "").lower() in ("true", "1", "yes")
|
||||
_u_for_dev = yaml.safe_load(USER_CFG.read_text()) or {} if USER_CFG.exists() else {}
|
||||
|
|
@ -587,6 +589,23 @@ def _upload_resume_widget(key_prefix: str) -> None:
|
|||
)
|
||||
|
||||
with tab_resume:
|
||||
# ── LinkedIn import ───────────────────────────────────────────────────────
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
# Merge imported data into resume YAML — only bootstrap empty fields,
|
||||
# never overwrite existing detail with sparse LinkedIn data
|
||||
existing = load_yaml(RESUME_PATH)
|
||||
existing.update({k: v for k, v in _li_data.items() if v and not existing.get(k)})
|
||||
RESUME_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
save_yaml(RESUME_PATH, existing)
|
||||
st.success("LinkedIn data applied to resume profile.")
|
||||
st.rerun()
|
||||
|
||||
with st.expander("🔗 Import from LinkedIn", expanded=False):
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
_tab_tier = _profile.tier if _profile else "free"
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=_tab_tier)
|
||||
|
||||
st.caption(
|
||||
f"Edit {_name}'s application profile. "
|
||||
"Bullets are used as paste-able shortcuts in the Apply Workspace."
|
||||
|
|
@ -867,6 +886,14 @@ with tab_resume:
|
|||
with tab_system:
|
||||
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
||||
|
||||
if CLOUD_MODE:
|
||||
st.info(
|
||||
"**Your instance is managed by CircuitForge.**\n\n"
|
||||
"Infrastructure, LLM backends, and service settings are configured by the platform. "
|
||||
"To change your plan or billing, visit your [account page](https://circuitforge.tech/account)."
|
||||
)
|
||||
st.stop()
|
||||
|
||||
# ── File Paths & Inference ────────────────────────────────────────────────
|
||||
with st.expander("📁 File Paths & Inference Profile"):
|
||||
_su = _yaml_up.safe_load(USER_CFG.read_text()) or {} if USER_CFG.exists() else {}
|
||||
|
|
@ -1464,6 +1491,13 @@ with tab_finetune:
|
|||
with tab_license:
|
||||
st.subheader("🔑 License")
|
||||
|
||||
if CLOUD_MODE:
|
||||
_cloud_tier = st.session_state.get("cloud_tier", "free")
|
||||
st.success(f"**{_cloud_tier.title()} tier** — managed via your CircuitForge account")
|
||||
st.caption("Your plan is tied to your account and applied automatically.")
|
||||
st.page_link("https://circuitforge.tech/account", label="Manage plan →", icon="🔗")
|
||||
st.stop()
|
||||
|
||||
from scripts.license import (
|
||||
verify_local as _verify_local,
|
||||
activate as _activate,
|
||||
|
|
|
|||
|
|
@ -389,7 +389,7 @@ with col_tools:
|
|||
|
||||
st.markdown("---")
|
||||
else:
|
||||
st.warning("Resume YAML not found — check that AIHawk is cloned.")
|
||||
st.warning("Resume profile not found — complete setup or upload a resume in Settings → Resume Profile.")
|
||||
|
||||
# ── Application Q&A ───────────────────────────────────────────────────────
|
||||
with st.expander("💬 Answer Application Questions"):
|
||||
|
|
|
|||
|
|
@ -2,6 +2,52 @@
|
|||
|
||||
Unscheduled ideas and deferred features. Roughly grouped by area.
|
||||
|
||||
See also: `circuitforge-plans/shared/2026-03-07-launch-checklist.md` for pre-launch blockers
|
||||
(legal docs, Stripe live keys, website deployment, demo DB ownership fix).
|
||||
|
||||
---
|
||||
|
||||
## Launch Blockers (tracked in shared launch checklist)
|
||||
|
||||
- **ToS + Refund Policy** — required before live Stripe charges. Files go in `website/content/legal/`.
|
||||
- **Stripe live key rotation** — swap test keys to live in `website/.env` (zero code changes).
|
||||
- **Website deployment to bastion** — Caddy route for Nuxt frontend at `circuitforge.tech`.
|
||||
- **Demo DB ownership** — `demo/data/staging.db` is root-owned (Docker artifact); fix with `sudo chown alan:alan` then re-run `demo/seed_demo.py`.
|
||||
|
||||
---
|
||||
|
||||
## Post-Launch / Infrastructure
|
||||
|
||||
- **Accessibility Statement** — WCAG 2.1 conformance doc at `website/content/legal/accessibility.md`. High credibility value for ND audience.
|
||||
- **Data deletion request process** — published procedure at `website/content/legal/data-deletion.md` (GDPR/CCPA; references `privacy@circuitforge.tech`).
|
||||
- **Uptime Kuma monitors** — 6 monitors need to be added manually (website, Heimdall, demo, Directus, Forgejo, Peregrine container health).
|
||||
- **Directus admin password rotation** — change from `changeme-set-via-ui-on-first-run` before website goes public.
|
||||
|
||||
---
|
||||
|
||||
## Discovery — Community Scraper Plugin System
|
||||
|
||||
Design doc: `circuitforge-plans/peregrine/2026-03-07-community-scraper-plugin-design.md`
|
||||
|
||||
**Summary:** Add a `scripts/plugins/` directory with auto-discovery and a documented MIT-licensed
|
||||
plugin API. Separates CF-built custom scrapers (paid, BSL 1.1, in `scripts/custom_boards/`) from
|
||||
community-contributed and CF-freebie scrapers (free, MIT, in `scripts/plugins/`).
|
||||
|
||||
**Implementation tasks:**
|
||||
- [ ] Add `scripts/plugins/` with `__init__.py`, `README.md`, and `example_plugin.py`
|
||||
- [ ] Add `config/plugins/` directory with `.gitkeep`; gitignore `config/plugins/*.yaml` (not `.example`)
|
||||
- [ ] Update `discover.py`: `load_plugins()` auto-discovery + tier gate (`custom_boards` = paid, `plugins` = free)
|
||||
- [ ] Update `search_profiles.yaml` schema: add `plugins:` list + `plugin_config:` block
|
||||
- [ ] Migrate `scripts/custom_boards/craigslist.py` → `scripts/plugins/craigslist.py` (CF freebie)
|
||||
- [ ] Settings UI: render `CONFIG_SCHEMA` fields for installed plugins (Settings → Search)
|
||||
- [ ] Rewrite `docs/developer-guide/adding-scrapers.md` to document the plugin API
|
||||
- [ ] Add `scripts/plugins/LICENSE` (MIT) to make the dual-license split explicit
|
||||
|
||||
**CF freebie candidates** (future, after plugin system ships):
|
||||
- Dice.com (tech-focused, no API key)
|
||||
- We Work Remotely (remote-only, clean HTML)
|
||||
- Wellfound / AngelList (startup roles)
|
||||
|
||||
---
|
||||
|
||||
## Settings / Data Management
|
||||
|
|
|
|||
56
scripts/linkedin_parser.py
Normal file
56
scripts/linkedin_parser.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
# scripts/linkedin_parser.py
|
||||
"""
|
||||
LinkedIn staging file reader.
|
||||
|
||||
parse_stage(stage_path) reads an existing staging file and returns
|
||||
a structured dict. For url_scrape sources it re-runs the HTML parser
|
||||
so improvements to linkedin_utils take effect without a new scrape.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.linkedin_utils import parse_html
|
||||
|
||||
|
||||
def parse_stage(stage_path: Path) -> tuple[dict, str]:
|
||||
"""
|
||||
Read and return the extracted profile data from a staging file.
|
||||
|
||||
For url_scrape sources: re-runs parse_html on stored raw_html so
|
||||
parser improvements are applied without re-scraping.
|
||||
|
||||
Returns (extracted_dict, error_string).
|
||||
On any failure returns ({}, error_message).
|
||||
"""
|
||||
if not stage_path.exists():
|
||||
return {}, f"No staged data found at {stage_path}"
|
||||
|
||||
try:
|
||||
data = json.loads(stage_path.read_text())
|
||||
except Exception as e:
|
||||
return {}, f"Could not read staging file: {e}"
|
||||
|
||||
source = data.get("source")
|
||||
raw_html = data.get("raw_html")
|
||||
|
||||
if source == "url_scrape" and raw_html:
|
||||
# Re-run the parser — picks up any selector improvements
|
||||
extracted = parse_html(raw_html)
|
||||
# Preserve linkedin URL — parse_html always returns "" for this field
|
||||
extracted["linkedin"] = extracted.get("linkedin") or data.get("url") or ""
|
||||
|
||||
# Write updated extracted back to staging file atomically
|
||||
data["extracted"] = extracted
|
||||
tmp = stage_path.with_suffix(".tmp")
|
||||
tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2))
|
||||
tmp.rename(stage_path)
|
||||
|
||||
return extracted, ""
|
||||
|
||||
extracted = data.get("extracted")
|
||||
if not extracted:
|
||||
return {}, "Staging file has no extracted data"
|
||||
|
||||
return extracted, ""
|
||||
169
scripts/linkedin_scraper.py
Normal file
169
scripts/linkedin_scraper.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
# scripts/linkedin_scraper.py
"""
LinkedIn profile scraper.

Two entry points:
    scrape_profile(url, stage_path)       — Playwright headless fetch
    parse_export_zip(zip_bytes, stage_path) — LinkedIn data archive CSV parse

Both write a staging file at stage_path and return the extracted dict.
"""
from __future__ import annotations

import csv
import io
import json
import re
import zipfile
from datetime import datetime, timezone
from pathlib import Path

from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
from scripts.linkedin_utils import parse_html

# Accepts http/https and optional "www." profile URLs (linkedin.com/in/…).
# NOTE(review): country/mobile subdomains (e.g. de.linkedin.com, m.linkedin.com)
# are rejected by this pattern — confirm whether they should be accepted.
_LINKEDIN_PROFILE_RE = re.compile(r"https?://(www\.)?linkedin\.com/in/", re.I)

# Desktop Chrome user-agent sent with the headless page request —
# presumably to avoid the default headless UA being blocked; verify if
# LinkedIn changes its bot detection.
_CHROME_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
|
||||
|
||||
|
||||
def _write_stage(stage_path: Path, payload: dict) -> None:
|
||||
"""Atomic write: write to .tmp then rename to avoid partial reads."""
|
||||
tmp = stage_path.with_suffix(".tmp")
|
||||
tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
tmp.rename(stage_path)
|
||||
|
||||
|
||||
def scrape_profile(url: str, stage_path: Path) -> dict:
    """
    Fetch a public LinkedIn profile via Playwright headless Chromium.

    Parameters:
        url: a linkedin.com/in/… profile URL.
        stage_path: where the JSON staging file is written.

    Raises ValueError if url is not a linkedin.com/in/ URL.
    Raises RuntimeError on scrape failure (timeout, blocked, etc.).
    Returns the extracted dict and writes the staging file.
    """
    if not _LINKEDIN_PROFILE_RE.match(url):
        raise ValueError(
            f"Expected a LinkedIn profile URL (linkedin.com/in/…), got: {url}"
        )

    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True)
            try:
                page = browser.new_page(user_agent=_CHROME_UA)
                page.goto(url, timeout=30_000)
                # Wait for any main content section — LinkedIn serves several
                # DOM variants to logged-out visitors.
                page.wait_for_selector(
                    "h1, section[data-section], #experience, #about",
                    timeout=20_000,
                )
                raw_html = page.content()
            finally:
                # Always release the browser, even when goto/wait raises —
                # otherwise Chromium lingers until the playwright context exits.
                browser.close()
    except PWTimeout as e:
        raise RuntimeError(
            "LinkedIn did not load in time — the request may have been blocked. "
            "Try the data export option instead."
        ) from e
    except Exception as e:
        raise RuntimeError(f"LinkedIn scrape failed: {e}") from e

    extracted = parse_html(raw_html)
    # parse_html always returns "" for linkedin; fill it from the request URL.
    extracted["linkedin"] = url

    _write_stage(stage_path, {
        "url": url,
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "source": "url_scrape",
        "raw_html": raw_html,
        "extracted": extracted,
    })
    return extracted
|
||||
|
||||
|
||||
def parse_export_zip(zip_bytes: bytes, stage_path: Path) -> dict:
    """
    Parse a LinkedIn data export archive.

    zip_bytes: raw zip bytes — callers do: zip_bytes = uploaded_file.read()
    Returns the extracted dict and writes the staging file.
    Missing CSV files are skipped silently.
    Raises ValueError if zip_bytes is not a valid zip archive.
    """
    extracted: dict = {
        "name": "", "email": "", "phone": "", "linkedin": "",
        "career_summary": "",
        "experience": [], "education": [], "skills": [], "achievements": [],
    }

    def _cell(row: dict, key: str) -> str:
        # csv.DictReader fills fields missing from a short row with None
        # (restval default) — row.get(key, "") would still return that None,
        # so guard before .strip() to avoid an AttributeError.
        return (row.get(key) or "").strip()

    try:
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
            # Case-insensitive filename lookup — export casing varies.
            names_in_zip = {n.lower(): n for n in zf.namelist()}

            def _read_csv(filename: str) -> list[dict]:
                """Return all rows of a CSV in the archive; [] if absent."""
                key = filename.lower()
                if key not in names_in_zip:
                    return []
                # utf-8-sig strips the BOM LinkedIn prepends to its CSVs.
                text = zf.read(names_in_zip[key]).decode("utf-8-sig", errors="replace")
                return list(csv.DictReader(io.StringIO(text)))

            for row in _read_csv("Profile.csv"):
                first = _cell(row, "First Name")
                last = _cell(row, "Last Name")
                extracted["name"] = f"{first} {last}".strip()
                extracted["email"] = _cell(row, "Email Address")
                extracted["career_summary"] = _cell(row, "Summary")
                break  # only the first profile row matters

            for row in _read_csv("Position.csv"):
                company = _cell(row, "Company Name")
                title = _cell(row, "Title")
                desc = _cell(row, "Description")
                start = _cell(row, "Started On")
                end = _cell(row, "Finished On")
                # An empty "Finished On" means the position is current.
                end_label = end if end else ("Present" if start else "")
                date_range = f"{start} – {end_label}".strip(" –") if (start or end) else ""
                bullets = [d.strip() for d in re.split(r"[.•\n]+", desc) if d.strip() and len(d.strip()) > 3]
                if company or title:
                    extracted["experience"].append({
                        "company": company,
                        "title": title,
                        "date_range": date_range,
                        "bullets": bullets,
                    })

            for row in _read_csv("Education.csv"):
                school = _cell(row, "School Name")
                degree = _cell(row, "Degree Name")
                field = _cell(row, "Field Of Study")
                start = _cell(row, "Start Date")
                end = _cell(row, "End Date")
                dates = f"{start} – {end}".strip(" –") if start or end else ""
                if school or degree:
                    extracted["education"].append({
                        "school": school,
                        "degree": degree,
                        "field": field,
                        "dates": dates,
                    })

            for row in _read_csv("Skills.csv"):
                skill = _cell(row, "Name")
                if skill:
                    extracted["skills"].append(skill)

            for row in _read_csv("Certifications.csv"):
                name = _cell(row, "Name")
                if name:
                    extracted["achievements"].append(name)

    except zipfile.BadZipFile as e:
        raise ValueError(f"Not a valid zip file: {e}") from e

    _write_stage(stage_path, {
        "url": None,
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "source": "export_zip",
        "raw_html": None,
        "extracted": extracted,
    })
    return extracted
|
||||
194
scripts/linkedin_utils.py
Normal file
194
scripts/linkedin_utils.py
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
# scripts/linkedin_utils.py
"""
LinkedIn profile HTML parser.

Extracts structured profile data from a raw LinkedIn public profile page.
No Playwright dependency — importable by both linkedin_scraper and linkedin_parser.

Selectors target the 2024-2025 LinkedIn public profile DOM.
When LinkedIn changes their markup, update the selector lists here only.
Each section uses ordered fallbacks — first matching selector wins.
"""
from __future__ import annotations
import re
from bs4 import BeautifulSoup


# ── Selector fallback lists ────────────────────────────────────────────────────
# Ordered most-specific → most-generic; consumed by _select_first/_select_all.

_NAME_SELECTORS = [
    "h1.top-card-layout__title",
    "h1[class*='title']",
    ".pv-top-card--list h1",
    "h1",  # last resort: any page heading
]

_SUMMARY_SELECTORS = [
    "section[data-section='about'] .show-more-less-text__text--less",
    "section[data-section='about'] p",
    "#about ~ * p.show-more-less-text__text--less",
    ".pv-about-section p",
]

_EXPERIENCE_ITEM_SELECTORS = [
    "section[data-section='experience'] li.experience-item",
    "section[data-section='experience'] li",
    "#experience-section li",
    "#experience ~ * li",
]

# Per-item selectors applied within one experience <li>.
_EXP_TITLE_SELECTORS = ["span.experience-item__title", "span[class*='title']", "h3"]
_EXP_COMPANY_SELECTORS = ["span.experience-item__subtitle", "span[class*='subtitle']", "p[class*='company']"]
_EXP_DATE_SELECTORS = ["span.date-range", "[class*='date-range']", "span[class*='duration']"]
_EXP_DESC_SELECTORS = [".show-more-less-text__text--less", "p[class*='description']", "p"]

_EDUCATION_ITEM_SELECTORS = [
    "section[data-section='education'] li.education__list-item",
    "section[data-section='education'] li",
    "#education ~ * li",
]

# Per-item selectors applied within one education <li>.
_EDU_SCHOOL_SELECTORS = ["h3.education__school-name", "h3[class*='school']", "h3"]
_EDU_DEGREE_SELECTORS = ["span.education__item--degree-name", "span[class*='degree']", "p[class*='degree']"]
_EDU_DATES_SELECTORS = ["span.education__item--duration", "span[class*='duration']", "time"]

_SKILLS_SELECTORS = [
    "section[data-section='skills'] span.mr1",
    "section[data-section='skills'] li span[class*='bold']",
    "section[data-section='skills'] li span",
    "#skills ~ * li span",
]

_CERT_ITEM_SELECTORS = [
    "section[data-section='certifications'] li",
    "#certifications ~ * li",
    "#licenses_and_certifications ~ * li",
]
_CERT_NAME_SELECTORS = ["h3.certifications__name", "h3[class*='name']", "h3", "span[class*='title']"]
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _select_first(soup, selectors):
|
||||
for sel in selectors:
|
||||
try:
|
||||
el = soup.select_one(sel)
|
||||
if el and el.get_text(strip=True):
|
||||
return el.get_text(strip=True)
|
||||
except Exception:
|
||||
continue
|
||||
return ""
|
||||
|
||||
|
||||
def _select_all(soup, selectors):
|
||||
for sel in selectors:
|
||||
try:
|
||||
els = soup.select(sel)
|
||||
if els:
|
||||
return els
|
||||
except Exception:
|
||||
continue
|
||||
return []
|
||||
|
||||
|
||||
def _split_bullets(text):
|
||||
parts = re.split(r"[•·]\s*|(?<=\s)–\s+|\n+", text)
|
||||
return [p.strip() for p in parts if p.strip() and len(p.strip()) > 3]
|
||||
|
||||
|
||||
def _date_range_text(item):
    """Extract a date-range string from one experience item.

    Prefers the <time> children of the first matching date element, joined
    with " – "; falls back to the element's own text. Returns "" when no
    selector matches. Selectors that raise are skipped.
    """
    for sel in _EXP_DATE_SELECTORS:
        try:
            node = item.select_one(sel)
            if not node:
                continue
            time_labels = [t.get_text(strip=True) for t in node.find_all("time")]
            if time_labels:
                return " – ".join(time_labels)
            fallback = node.get_text(strip=True)
            if fallback:
                return fallback
        except Exception:
            continue
    return ""
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_html(raw_html: str) -> dict:
    """
    Extract structured profile data from a raw LinkedIn public profile HTML page.

    Returns a dict with keys: name, email, phone, linkedin, career_summary,
    experience[], education[], skills[], achievements[]

    email, phone and linkedin are always returned as "" — callers (the
    scraper / staging re-parse) fill them in from other sources.

    Never raises — returns empty values for sections that cannot be parsed.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    name = _select_first(soup, _NAME_SELECTORS)
    career_summary = _select_first(soup, _SUMMARY_SELECTORS)

    # Experience: one entry per matched list item; items with neither a
    # title nor a company are dropped as selector noise.
    experience = []
    for item in _select_all(soup, _EXPERIENCE_ITEM_SELECTORS):
        title = _select_first(item, _EXP_TITLE_SELECTORS)
        company = _select_first(item, _EXP_COMPANY_SELECTORS)
        dates = _date_range_text(item)
        # First matching description *element* wins — same ordered-fallback
        # discipline as _select_first, but we need the node (for bullet
        # splitting), not its flattened text.
        desc_el = None
        for sel in _EXP_DESC_SELECTORS:
            try:
                desc_el = item.select_one(sel)
                if desc_el:
                    break
            except Exception:
                continue
        bullets = _split_bullets(desc_el.get_text(" ", strip=True)) if desc_el else []
        if title or company:
            experience.append({
                "company": company,
                "title": title,
                "date_range": dates,
                "bullets": bullets,
            })

    education = []
    for item in _select_all(soup, _EDUCATION_ITEM_SELECTORS):
        school = _select_first(item, _EDU_SCHOOL_SELECTORS)
        degree = _select_first(item, _EDU_DEGREE_SELECTORS)
        dates = ""
        for sel in _EDU_DATES_SELECTORS:
            try:
                el = item.select_one(sel)
                if el:
                    dates = el.get_text(strip=True)
                    break
            except Exception:
                continue
        if school or degree:
            education.append({
                "school": school,
                "degree": degree,
                # Field of study is not separable from the degree text in
                # the public-page DOM; the export-zip path fills it instead.
                "field": "",
                "dates": dates,
            })

    skills = [el.get_text(strip=True) for el in _select_all(soup, _SKILLS_SELECTORS)
              if el.get_text(strip=True)]
    # De-duplicate while preserving first-seen order.
    skills = list(dict.fromkeys(skills))

    achievements = []
    for item in _select_all(soup, _CERT_ITEM_SELECTORS):
        label = _select_first(item, _CERT_NAME_SELECTORS)
        if label:
            achievements.append(label)

    return {
        "name": name,
        "email": "",
        "phone": "",
        "linkedin": "",
        "career_summary": career_summary,
        "experience": experience,
        "education": education,
        "skills": skills,
        "achievements": achievements,
    }
|
||||
|
|
@ -83,10 +83,10 @@ def _extract_career_summary(source: Path) -> str:
|
|||
|
||||
|
||||
def _extract_personal_info(source: Path) -> dict:
|
||||
"""Extract personal info from aihawk resume yaml."""
|
||||
"""Extract personal info from resume yaml."""
|
||||
resume = source / "config" / "plain_text_resume.yaml"
|
||||
if not resume.exists():
|
||||
resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||
resume = source / "aihawk" / "data_folder" / "plain_text_resume.yaml" # legacy path
|
||||
if not resume.exists():
|
||||
return {}
|
||||
data = _load_yaml(resume)
|
||||
|
|
@ -196,7 +196,7 @@ def _copy_configs(source: Path, dest: Path, apply: bool) -> None:
|
|||
|
||||
|
||||
def _copy_aihawk_resume(source: Path, dest: Path, apply: bool) -> None:
|
||||
print("\n── Copying AIHawk resume profile")
|
||||
print("\n── Copying resume profile")
|
||||
src = source / "config" / "plain_text_resume.yaml"
|
||||
if not src.exists():
|
||||
src = source / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||
|
|
|
|||
110
tests/fixtures/linkedin_profile.html
vendored
Normal file
110
tests/fixtures/linkedin_profile.html
vendored
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
<!-- tests/fixtures/linkedin_profile.html -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Alan Weinstock | LinkedIn</title></head>
|
||||
<body>
|
||||
<!-- Name and headline -->
|
||||
<div class="top-card-layout__entity-info">
|
||||
<h1 class="top-card-layout__title">Alan Weinstock</h1>
|
||||
<h2 class="top-card-layout__headline">Staff Engineer · Open to Work</h2>
|
||||
</div>
|
||||
|
||||
<!-- About / Summary -->
|
||||
<section data-section="about">
|
||||
<div class="core-section-container__content">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Experienced engineer with 10 years in embedded systems and DevOps.
|
||||
Passionate about open-source and accessibility tooling.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Experience -->
|
||||
<section data-section="experience">
|
||||
<ul>
|
||||
<li class="experience-item">
|
||||
<div class="experience-item__info">
|
||||
<span class="experience-item__title">Staff Engineer</span>
|
||||
<span class="experience-item__subtitle">Acme Corp</span>
|
||||
<span class="experience-item__duration">
|
||||
<span class="date-range">
|
||||
<time>Jan 2022</time>
|
||||
<time>Present</time>
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class="experience-item__description">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Led migration of monolith to microservices. •
|
||||
Reduced p99 latency by 40%. •
|
||||
Mentored three junior engineers.
|
||||
</p>
|
||||
</div>
|
||||
</li>
|
||||
<li class="experience-item">
|
||||
<div class="experience-item__info">
|
||||
<span class="experience-item__title">Senior Engineer</span>
|
||||
<span class="experience-item__subtitle">Beta Industries</span>
|
||||
<span class="experience-item__duration">
|
||||
<span class="date-range">
|
||||
<time>Mar 2019</time>
|
||||
<time>Dec 2021</time>
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class="experience-item__description">
|
||||
<p class="show-more-less-text__text--less">
|
||||
Designed CI/CD pipeline. • Maintained Kubernetes clusters.
|
||||
</p>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Education -->
|
||||
<section data-section="education">
|
||||
<ul>
|
||||
<li class="education__list-item">
|
||||
<div class="education__item--degree-info">
|
||||
<h3 class="education__school-name">State University</h3>
|
||||
<span class="education__item--degree-name">B.S. Computer Science</span>
|
||||
<span class="education__item--duration">2010 – 2014</span>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Skills -->
|
||||
<section data-section="skills">
|
||||
<ul>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">Python</span>
|
||||
</div>
|
||||
</li>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">Kubernetes</span>
|
||||
</div>
|
||||
</li>
|
||||
<li class="skills-section__list-item">
|
||||
<div class="skills-section__skill">
|
||||
<span class="mr1 t-bold">PostgreSQL</span>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<!-- Certifications -->
|
||||
<section data-section="certifications">
|
||||
<ul>
|
||||
<li class="certifications__list-item">
|
||||
<h3 class="certifications__name">AWS Solutions Architect – Associate</h3>
|
||||
</li>
|
||||
<li class="certifications__list-item">
|
||||
<h3 class="certifications__name">CKA: Certified Kubernetes Administrator</h3>
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
||||
96
tests/test_linkedin_parser.py
Normal file
96
tests/test_linkedin_parser.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# tests/test_linkedin_parser.py
"""Tests for scripts.linkedin_parser.parse_stage staging-file handling."""
import json
import sys
import tempfile
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

# Real profile HTML fixture shared with the scraper/utils tests.
FIXTURE_HTML = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()


def _write_url_stage(path: Path) -> None:
    """Write a minimal url_scrape staging file with intentionally stale extracted data."""
    path.write_text(json.dumps({
        "url": "https://linkedin.com/in/alanw",
        "scraped_at": "2026-03-12T14:30:00+00:00",
        "source": "url_scrape",
        "raw_html": FIXTURE_HTML,
        "extracted": {
            "name": "Alan Weinstock (stale)",  # stale — re-parse should update this
            "career_summary": "",
            "experience": [], "education": [], "skills": [], "achievements": [],
            "email": "", "phone": "", "linkedin": "",
        },
    }))


def _write_zip_stage(path: Path) -> None:
    """Write a minimal export_zip staging file (no raw_html)."""
    path.write_text(json.dumps({
        "url": None,
        "scraped_at": "2026-03-12T14:30:00+00:00",
        "source": "export_zip",
        "raw_html": None,
        "extracted": {
            "name": "Alan Weinstock",
            "career_summary": "Engineer",
            "experience": [{"company": "Acme", "title": "SE", "date_range": "", "bullets": []}],
            "education": [], "skills": ["Python"], "achievements": [],
            "email": "alan@example.com", "phone": "", "linkedin": "",
        },
    }))


def test_parse_stage_reruns_parser_on_url_scrape():
    """parse_stage re-runs parse_html from raw_html, ignoring stale extracted data."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        _write_url_stage(stage)
        result, err = parse_stage(stage)
        assert err == ""
        assert result["name"] == "Alan Weinstock"  # fresh parse, not "(stale)"
        assert len(result["experience"]) == 2


def test_parse_stage_returns_stored_data_for_zip():
    """parse_stage returns stored extracted dict for export_zip (no raw_html to re-parse)."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        _write_zip_stage(stage)
        result, err = parse_stage(stage)
        assert err == ""
        assert result["name"] == "Alan Weinstock"
        assert result["email"] == "alan@example.com"
        assert "Python" in result["skills"]


def test_parse_stage_missing_file_returns_error():
    # Nonexistent path → ({}, error) rather than an exception.
    from scripts.linkedin_parser import parse_stage
    result, err = parse_stage(Path("/nonexistent/stage.json"))
    assert result == {}
    assert err != ""


def test_parse_stage_corrupted_file_returns_error():
    # Invalid JSON → ({}, error) rather than an exception.
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        stage.write_text("not valid json {{{{")
        result, err = parse_stage(stage)
        assert result == {}
        assert err != ""


def test_parse_stage_updates_staging_file_after_reparse():
    """After re-parsing, the staging file's extracted dict is updated."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        _write_url_stage(stage)
        parse_stage(stage)
        updated = json.loads(stage.read_text())
        assert updated["extracted"]["name"] == "Alan Weinstock"
        assert len(updated["extracted"]["experience"]) == 2
|
||||
213
tests/test_linkedin_scraper.py
Normal file
213
tests/test_linkedin_scraper.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
# tests/test_linkedin_scraper.py
"""Tests for scripts.linkedin_scraper: URL scraping (mocked Playwright) and
data-export zip parsing."""
import io
import json
import sys
import zipfile
from pathlib import Path
from unittest.mock import MagicMock, patch
import tempfile

sys.path.insert(0, str(Path(__file__).parent.parent))

_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "linkedin_profile.html"


def _scrape_fixture(stage: Path, url: str = "https://linkedin.com/in/alanw") -> dict:
    """Call scrape_profile with Playwright mocked out to serve the fixture HTML.

    The mock wiring (page → browser → playwright context) was previously
    duplicated verbatim in three tests; this helper is the single copy.
    """
    from scripts.linkedin_scraper import scrape_profile
    mock_page = MagicMock()
    mock_page.content.return_value = _FIXTURE_PATH.read_text()
    mock_browser = MagicMock()
    mock_browser.new_page.return_value = mock_page
    mock_playwright = MagicMock()
    mock_playwright.chromium.launch.return_value = mock_browser
    with patch("scripts.linkedin_scraper.sync_playwright") as mock_sync_pw:
        mock_sync_pw.return_value.__enter__ = MagicMock(return_value=mock_playwright)
        mock_sync_pw.return_value.__exit__ = MagicMock(return_value=False)
        return scrape_profile(url, stage)


def test_invalid_url_raises():
    # Company pages are rejected — only /in/ profile URLs are scrapable.
    from scripts.linkedin_scraper import scrape_profile
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        try:
            scrape_profile("https://linkedin.com/company/acme", stage)
            assert False, "should have raised"
        except ValueError as e:
            assert "linkedin.com/in/" in str(e)


def test_non_linkedin_url_raises():
    from scripts.linkedin_scraper import scrape_profile
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        try:
            scrape_profile("https://example.com/profile", stage)
            assert False, "should have raised"
        except ValueError:
            pass


def test_valid_linkedin_url_accepted():
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = _scrape_fixture(stage)
        assert result["name"] == "Alan Weinstock"
        assert stage.exists()


def test_scrape_profile_writes_staging_file():
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        _scrape_fixture(stage)
        data = json.loads(stage.read_text())
        assert data["source"] == "url_scrape"
        assert data["url"] == "https://linkedin.com/in/alanw"
        assert "raw_html" in data
        assert "extracted" in data
        assert data["extracted"]["name"] == "Alan Weinstock"


def _make_export_zip() -> bytes:
    """Build an in-memory LinkedIn data-export archive with the four CSVs."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("Position.csv",
            "Company Name,Title,Description,Started On,Finished On\n"
            "Acme Corp,Staff Engineer,Led migration. Built CI/CD.,Jan 2022,\n"
            "Beta Industries,Senior Engineer,Maintained clusters.,Mar 2019,Dec 2021\n"
        )
        zf.writestr("Education.csv",
            "School Name,Degree Name,Field Of Study,Start Date,End Date\n"
            "State University,Bachelor of Science,Computer Science,2010,2014\n"
        )
        zf.writestr("Skills.csv",
            "Name,Description\n"
            "Python,\n"
            "Kubernetes,\n"
        )
        zf.writestr("Profile.csv",
            "First Name,Last Name,Headline,Summary,Email Address\n"
            "Alan,Weinstock,Staff Engineer,Experienced engineer.,alan@example.com\n"
        )
    return buf.getvalue()


def test_parse_export_zip_experience():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(_make_export_zip(), stage)
        assert len(result["experience"]) == 2
        assert result["experience"][0]["company"] == "Acme Corp"
        assert result["experience"][0]["title"] == "Staff Engineer"


def test_parse_export_zip_education():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(_make_export_zip(), stage)
        assert result["education"][0]["school"] == "State University"
        assert result["education"][0]["field"] == "Computer Science"


def test_parse_export_zip_skills():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(_make_export_zip(), stage)
        assert "Python" in result["skills"]


def test_parse_export_zip_name_and_email():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(_make_export_zip(), stage)
        assert result["name"] == "Alan Weinstock"
        assert result["email"] == "alan@example.com"


def test_parse_export_zip_missing_csv_does_not_raise():
    # Only Profile.csv present — other sections must default to empty.
    from scripts.linkedin_scraper import parse_export_zip
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("Profile.csv",
            "First Name,Last Name,Headline,Summary,Email Address\n"
            "Alan,Weinstock,Engineer,Summary here.,alan@example.com\n"
        )
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(buf.getvalue(), stage)
        assert result["name"] == "Alan Weinstock"
        assert result["experience"] == []


def test_parse_export_zip_writes_staging_file():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        parse_export_zip(_make_export_zip(), stage)
        data = json.loads(stage.read_text())
        assert data["source"] == "export_zip"
        assert data["raw_html"] is None


def test_scrape_profile_sets_linkedin_url():
    # parse_html leaves "linkedin" empty; the scraper fills it from the URL.
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = _scrape_fixture(stage)
        assert result["linkedin"] == "https://linkedin.com/in/alanw"


def test_parse_export_zip_bad_zip_raises():
    from scripts.linkedin_scraper import parse_export_zip
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        try:
            parse_export_zip(b"not a zip file at all", stage)
            assert False, "should have raised"
        except ValueError as e:
            assert "zip" in str(e).lower()


def test_parse_export_zip_current_job_shows_present():
    """Empty Finished On renders as '– Present', not truncated."""
    from scripts.linkedin_scraper import parse_export_zip
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("Position.csv",
            "Company Name,Title,Description,Started On,Finished On\n"
            "Acme Corp,Staff Engineer,,Jan 2022,\n"
        )
        zf.writestr("Profile.csv",
            "First Name,Last Name,Headline,Summary,Email Address\n"
            "Alan,Weinstock,Engineer,,\n"
        )
    with tempfile.TemporaryDirectory() as tmp:
        stage = Path(tmp) / "stage.json"
        result = parse_export_zip(buf.getvalue(), stage)
        assert result["experience"][0]["date_range"] == "Jan 2022 – Present"
|
||||
73
tests/test_linkedin_utils.py
Normal file
73
tests/test_linkedin_utils.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
# tests/test_linkedin_utils.py
"""Tests for scripts.linkedin_utils.parse_html against the static HTML fixture."""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

# Fixture mirrors the 2024-2025 LinkedIn public-profile DOM variants.
FIXTURE = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()


def test_parse_html_name():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert result["name"] == "Alan Weinstock"


def test_parse_html_summary():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert "embedded systems" in result["career_summary"]


def test_parse_html_experience_count():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert len(result["experience"]) == 2


def test_parse_html_experience_fields():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    first = result["experience"][0]
    assert first["company"] == "Acme Corp"
    assert first["title"] == "Staff Engineer"
    assert "Jan 2022" in first["date_range"]
    assert len(first["bullets"]) >= 2  # description split on "•"
    assert any("latency" in b for b in first["bullets"])


def test_parse_html_education():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert len(result["education"]) == 1
    edu = result["education"][0]
    assert edu["school"] == "State University"
    assert "Computer Science" in edu["degree"]


def test_parse_html_skills():
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert "Python" in result["skills"]
    assert "Kubernetes" in result["skills"]


def test_parse_html_achievements():
    # Certifications are surfaced under "achievements".
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    assert any("AWS" in a for a in result["achievements"])


def test_parse_html_missing_section_returns_empty():
    """A profile with no skills section returns empty skills list, not an error."""
    from scripts.linkedin_utils import parse_html
    html_no_skills = FIXTURE.replace('data-section="skills"', 'data-section="hidden"')
    result = parse_html(html_no_skills)
    assert result["skills"] == []


def test_parse_html_returns_all_keys():
    # The return schema is a stable contract for downstream staging code.
    from scripts.linkedin_utils import parse_html
    result = parse_html(FIXTURE)
    for key in ("name", "email", "phone", "linkedin", "career_summary",
                "experience", "education", "skills", "achievements"):
        assert key in result, f"Missing key: {key}"
|
||||
Loading…
Reference in a new issue