# peregrine/scripts/linkedin_parser.py

# scripts/linkedin_parser.py
"""
LinkedIn staging file reader.

parse_stage(stage_path) reads an existing staging file and returns
a structured dict. For url_scrape sources it re-runs the HTML parser
so improvements to linkedin_utils take effect without a new scrape.
"""
from __future__ import annotations

import json
from pathlib import Path

from scripts.linkedin_utils import parse_html


def parse_stage(stage_path: Path) -> tuple[dict, str]:
    """
    Read and return the extracted profile data from a staging file.

    For url_scrape sources with stored raw_html: re-runs parse_html on the
    stored HTML so parser improvements are applied without re-scraping,
    then writes the refreshed "extracted" dict back to the staging file
    (via a sibling temp file that replaces the original in one step).

    For every other source (or when raw_html is empty): returns the
    "extracted" value already stored in the staging file.

    Returns (extracted_dict, error_string); error_string is "" on success.
    On any failure returns ({}, error_message) — a missing file, unreadable
    or malformed JSON, a parser error, a failed write-back, or a staging
    file with no extracted data.
    """
    if not stage_path.exists():
        return {}, f"No staged data found at {stage_path}"

    try:
        # Decode explicitly as UTF-8: the write-back below emits non-ASCII
        # characters verbatim (ensure_ascii=False), so relying on the locale
        # default encoding would break on non-UTF-8 platforms.
        data = json.loads(stage_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}, f"Could not read staging file: {e}"

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list) — .get() would fail.
        return {}, (
            "Could not read staging file: expected a JSON object, "
            f"got {type(data).__name__}"
        )

    source = data.get("source")
    raw_html = data.get("raw_html")

    if source == "url_scrape" and raw_html:
        try:
            # Re-run the parser — picks up any selector improvements
            extracted = parse_html(raw_html)
            # Preserve linkedin URL — parse_html always returns "" for this field
            extracted["linkedin"] = extracted.get("linkedin") or data.get("url") or ""
        except Exception as e:
            # Honour the "returns ({}, error) on any failure" contract
            # instead of leaking parser exceptions to the caller.
            return {}, f"Could not parse staged HTML: {e}"

        # Write updated extracted back to staging file atomically: dump to a
        # sibling temp file first, then swap it over the original.
        data["extracted"] = extracted
        tmp = stage_path.with_name(stage_path.name + ".tmp")
        try:
            tmp.write_text(
                json.dumps(data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            # replace() overwrites an existing target on every platform;
            # rename() raises FileExistsError on Windows.
            tmp.replace(stage_path)
        except (OSError, TypeError, ValueError) as e:
            # Best-effort cleanup of the partial temp file; the original
            # staging file is left untouched.
            try:
                tmp.unlink()
            except OSError:
                pass
            return {}, f"Could not update staging file: {e}"

        return extracted, ""

    extracted = data.get("extracted")
    if not extracted:
        return {}, "Staging file has no extracted data"

    return extracted, ""