feat(linkedin): add staging file parser with re-parse support

This commit is contained in:
pyr0ball 2026-03-13 10:18:01 -07:00
parent e937094884
commit 00f0eb4807
2 changed files with 152 additions and 0 deletions

View file

@@ -0,0 +1,56 @@
# scripts/linkedin_parser.py
"""
LinkedIn staging file reader.
parse_stage(stage_path) reads an existing staging file and returns
a structured dict. For url_scrape sources it re-runs the HTML parser
so improvements to linkedin_utils take effect without a new scrape.
"""
from __future__ import annotations
import json
from pathlib import Path
from scripts.linkedin_utils import parse_html
def parse_stage(stage_path: Path) -> tuple[dict, str]:
    """
    Read and return the extracted profile data from a staging file.

    For url_scrape sources: re-runs parse_html on the stored raw_html so
    parser improvements are applied without re-scraping, and persists the
    refreshed result back into the staging file.

    Args:
        stage_path: Path to the JSON staging file.

    Returns:
        (extracted_dict, error_string). On success the error string is "".
        On any failure returns ({}, error_message).
    """
    if not stage_path.exists():
        return {}, f"No staged data found at {stage_path}"
    try:
        data = json.loads(stage_path.read_text())
    except Exception as e:
        return {}, f"Could not read staging file: {e}"
    # Valid JSON is not necessarily an object; guard before .get() so a
    # list/scalar payload yields an error tuple instead of AttributeError.
    if not isinstance(data, dict):
        return {}, "Staging file does not contain a JSON object"
    source = data.get("source")
    raw_html = data.get("raw_html")
    if source == "url_scrape" and raw_html:
        # Re-run the parser — picks up any selector improvements
        extracted = parse_html(raw_html)
        # Preserve linkedin URL — parse_html always returns "" for this field
        extracted["linkedin"] = extracted.get("linkedin") or data.get("url") or ""
        # Write updated extracted back to staging file atomically
        data["extracted"] = extracted
        tmp = stage_path.with_suffix(".tmp")
        tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2))
        # Path.replace() atomically overwrites the destination on every
        # platform; Path.rename() raises FileExistsError on Windows when
        # the staging file (which we just read) still exists.
        tmp.replace(stage_path)
        return extracted, ""
    extracted = data.get("extracted")
    if not extracted:
        return {}, "Staging file has no extracted data"
    return extracted, ""

View file

@@ -0,0 +1,96 @@
# tests/test_linkedin_parser.py
import json
import sys
import tempfile
from pathlib import Path
# Make the repo root importable so `scripts.*` resolves regardless of the
# directory pytest is invoked from.
sys.path.insert(0, str(Path(__file__).parent.parent))
# Captured LinkedIn profile HTML used as raw_html in url_scrape staging
# files; read once at import time.
FIXTURE_HTML = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()
def _write_url_stage(path: Path) -> None:
    """Write a minimal url_scrape staging file with intentionally stale extracted data."""
    stale_extracted = {
        # stale — re-parse should update this
        "name": "Alan Weinstock (stale)",
        "career_summary": "",
        "experience": [],
        "education": [],
        "skills": [],
        "achievements": [],
        "email": "",
        "phone": "",
        "linkedin": "",
    }
    payload = {
        "url": "https://linkedin.com/in/alanw",
        "scraped_at": "2026-03-12T14:30:00+00:00",
        "source": "url_scrape",
        "raw_html": FIXTURE_HTML,
        "extracted": stale_extracted,
    }
    path.write_text(json.dumps(payload))
def _write_zip_stage(path: Path) -> None:
    """Write a minimal export_zip staging file (no raw_html)."""
    extracted = {
        "name": "Alan Weinstock",
        "career_summary": "Engineer",
        "experience": [{"company": "Acme", "title": "SE", "date_range": "", "bullets": []}],
        "education": [],
        "skills": ["Python"],
        "achievements": [],
        "email": "alan@example.com",
        "phone": "",
        "linkedin": "",
    }
    payload = {
        "url": None,
        "scraped_at": "2026-03-12T14:30:00+00:00",
        "source": "export_zip",
        "raw_html": None,
        "extracted": extracted,
    }
    path.write_text(json.dumps(payload))
def test_parse_stage_reruns_parser_on_url_scrape():
    """parse_stage re-runs parse_html from raw_html, ignoring stale extracted data."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as workdir:
        stage_file = Path(workdir) / "stage.json"
        _write_url_stage(stage_file)
        extracted, error = parse_stage(stage_file)
        assert error == ""
        # Fresh parse of the fixture HTML, not the stored "(stale)" value.
        assert extracted["name"] == "Alan Weinstock"
        assert len(extracted["experience"]) == 2
def test_parse_stage_returns_stored_data_for_zip():
    """parse_stage returns stored extracted dict for export_zip (no raw_html to re-parse)."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as workdir:
        stage_file = Path(workdir) / "stage.json"
        _write_zip_stage(stage_file)
        extracted, error = parse_stage(stage_file)
        assert error == ""
        assert extracted["name"] == "Alan Weinstock"
        assert extracted["email"] == "alan@example.com"
        assert "Python" in extracted["skills"]
def test_parse_stage_missing_file_returns_error():
    """A nonexistent staging path yields an empty dict plus an error message."""
    from scripts.linkedin_parser import parse_stage
    extracted, error = parse_stage(Path("/nonexistent/stage.json"))
    assert extracted == {}
    assert error != ""
def test_parse_stage_corrupted_file_returns_error():
    """Unparseable JSON in the staging file yields ({}, error)."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as workdir:
        stage_file = Path(workdir) / "stage.json"
        stage_file.write_text("not valid json {{{{")
        extracted, error = parse_stage(stage_file)
        assert extracted == {}
        assert error != ""
def test_parse_stage_updates_staging_file_after_reparse():
    """After re-parsing, the staging file's extracted dict is updated."""
    from scripts.linkedin_parser import parse_stage
    with tempfile.TemporaryDirectory() as workdir:
        stage_file = Path(workdir) / "stage.json"
        _write_url_stage(stage_file)
        parse_stage(stage_file)
        on_disk = json.loads(stage_file.read_text())
        refreshed = on_disk["extracted"]
        assert refreshed["name"] == "Alan Weinstock"
        assert len(refreshed["experience"]) == 2