feat(linkedin): add staging file parser with re-parse support
This commit is contained in:
parent
e937094884
commit
00f0eb4807
2 changed files with 152 additions and 0 deletions
56
scripts/linkedin_parser.py
Normal file
56
scripts/linkedin_parser.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
# scripts/linkedin_parser.py
|
||||||
|
"""
|
||||||
|
LinkedIn staging file reader.
|
||||||
|
|
||||||
|
parse_stage(stage_path) reads an existing staging file and returns
|
||||||
|
a structured dict. For url_scrape sources it re-runs the HTML parser
|
||||||
|
so improvements to linkedin_utils take effect without a new scrape.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scripts.linkedin_utils import parse_html
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stage(stage_path: Path) -> tuple[dict, str]:
    """
    Read and return the extracted profile data from a staging file.

    For url_scrape sources: re-runs parse_html on stored raw_html so
    parser improvements are applied without re-scraping, and writes the
    refreshed result back to the staging file atomically.

    Returns (extracted_dict, error_string).
    On any failure returns ({}, error_message).
    """
    if not stage_path.exists():
        return {}, f"No staged data found at {stage_path}"

    try:
        # Explicit UTF-8: the staging file embeds scraped HTML, which is
        # routinely non-ASCII; the platform default encoding is not reliable.
        data = json.loads(stage_path.read_text(encoding="utf-8"))
    except Exception as e:
        return {}, f"Could not read staging file: {e}"

    source = data.get("source")
    raw_html = data.get("raw_html")

    if source == "url_scrape" and raw_html:
        # Re-run the parser — picks up any selector improvements
        extracted = parse_html(raw_html)
        # Preserve linkedin URL — parse_html always returns "" for this field
        extracted["linkedin"] = extracted.get("linkedin") or data.get("url") or ""

        # Write updated extracted back to staging file atomically.
        # Path.replace (os.replace semantics) overwrites atomically on all
        # platforms; Path.rename raises FileExistsError on Windows when the
        # destination already exists — which it always does here.
        data["extracted"] = extracted
        tmp = stage_path.with_suffix(".tmp")
        tmp.write_text(
            json.dumps(data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp.replace(stage_path)

        return extracted, ""

    extracted = data.get("extracted")
    if not extracted:
        return {}, "Staging file has no extracted data"

    return extracted, ""
|
||||||
96
tests/test_linkedin_parser.py
Normal file
96
tests/test_linkedin_parser.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
# tests/test_linkedin_parser.py
import json
import sys
import tempfile
from pathlib import Path

# Make the repo root importable so `scripts.*` resolves regardless of where
# pytest is invoked from (tests/ sits one level below the repo root).
sys.path.insert(0, str(Path(__file__).parent.parent))

# Shared fixture: a saved LinkedIn profile page, read once at import time.
# NOTE(review): assumes tests/fixtures/linkedin_profile.html exists; the
# whole module fails to import otherwise — confirm the fixture is committed.
FIXTURE_HTML = (Path(__file__).parent / "fixtures" / "linkedin_profile.html").read_text()
|
||||||
|
|
||||||
|
|
||||||
|
def _write_url_stage(path: Path) -> None:
    """Write a minimal url_scrape staging file with intentionally stale extracted data."""
    stale_extracted = {
        "name": "Alan Weinstock (stale)",  # stale — re-parse should update this
        "career_summary": "",
        "experience": [],
        "education": [],
        "skills": [],
        "achievements": [],
        "email": "",
        "phone": "",
        "linkedin": "",
    }
    payload = {
        "url": "https://linkedin.com/in/alanw",
        "scraped_at": "2026-03-12T14:30:00+00:00",
        "source": "url_scrape",
        "raw_html": FIXTURE_HTML,
        "extracted": stale_extracted,
    }
    path.write_text(json.dumps(payload))
|
||||||
|
|
||||||
|
|
||||||
|
def _write_zip_stage(path: Path) -> None:
|
||||||
|
"""Write a minimal export_zip staging file (no raw_html)."""
|
||||||
|
path.write_text(json.dumps({
|
||||||
|
"url": None,
|
||||||
|
"scraped_at": "2026-03-12T14:30:00+00:00",
|
||||||
|
"source": "export_zip",
|
||||||
|
"raw_html": None,
|
||||||
|
"extracted": {
|
||||||
|
"name": "Alan Weinstock",
|
||||||
|
"career_summary": "Engineer",
|
||||||
|
"experience": [{"company": "Acme", "title": "SE", "date_range": "", "bullets": []}],
|
||||||
|
"education": [], "skills": ["Python"], "achievements": [],
|
||||||
|
"email": "alan@example.com", "phone": "", "linkedin": "",
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_stage_reruns_parser_on_url_scrape():
    """parse_stage re-runs parse_html from raw_html, ignoring stale extracted data."""
    from scripts.linkedin_parser import parse_stage

    with tempfile.TemporaryDirectory() as tmp_dir:
        stage_file = Path(tmp_dir) / "stage.json"
        _write_url_stage(stage_file)

        extracted, error = parse_stage(stage_file)

        assert error == ""
        # Fresh parse of raw_html, not the stored "(stale)" name.
        assert extracted["name"] == "Alan Weinstock"
        assert len(extracted["experience"]) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_stage_returns_stored_data_for_zip():
    """parse_stage returns stored extracted dict for export_zip (no raw_html to re-parse)."""
    from scripts.linkedin_parser import parse_stage

    with tempfile.TemporaryDirectory() as tmp_dir:
        stage_file = Path(tmp_dir) / "stage.json"
        _write_zip_stage(stage_file)

        extracted, error = parse_stage(stage_file)

        assert error == ""
        assert extracted["name"] == "Alan Weinstock"
        assert extracted["email"] == "alan@example.com"
        assert "Python" in extracted["skills"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_stage_missing_file_returns_error():
    """A nonexistent staging path yields an empty dict and a non-empty error."""
    from scripts.linkedin_parser import parse_stage

    extracted, error = parse_stage(Path("/nonexistent/stage.json"))

    assert extracted == {}
    assert error != ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_stage_corrupted_file_returns_error():
    """Unparseable JSON in the staging file yields ({}, error) rather than raising."""
    from scripts.linkedin_parser import parse_stage

    with tempfile.TemporaryDirectory() as tmp_dir:
        stage_file = Path(tmp_dir) / "stage.json"
        stage_file.write_text("not valid json {{{{")

        extracted, error = parse_stage(stage_file)

        assert extracted == {}
        assert error != ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_stage_updates_staging_file_after_reparse():
    """After re-parsing, the staging file's extracted dict is updated."""
    from scripts.linkedin_parser import parse_stage

    with tempfile.TemporaryDirectory() as tmp_dir:
        stage_file = Path(tmp_dir) / "stage.json"
        _write_url_stage(stage_file)

        parse_stage(stage_file)

        on_disk = json.loads(stage_file.read_text())
        fresh = on_disk["extracted"]
        assert fresh["name"] == "Alan Weinstock"
        assert len(fresh["experience"]) == 2
|
||||||
Loading…
Reference in a new issue