LGBTQIA+ inclusion section in research briefs: - user_profile.py: add candidate_lgbtq_focus bool accessor - user.yaml.example: add candidate_lgbtq_focus flag (default false) - company_research.py: gate new LGBTQIA+ section behind flag; section count now dynamic (7 base + 1 per opt-in section, max 9) - 2_Settings.py: add "Research Brief Preferences" expander with checkboxes for both accessibility and LGBTQIA+ focus flags; mission_preferences now round-trips through save (no silent drop) Phase 2 fixes: - manage-vllm.sh: MODEL_DIR and VLLM_BIN now read from env vars (VLLM_MODELS_DIR, VLLM_BIN) with portable defaults - search_profiles.yaml: replace personal CS/TAM/Bay Area profiles with a documented generic starter profile Phase 3 fix: - llm.yaml: rename meghan-cover-writer:latest → llama3.2:3b with inline comment for users to substitute their fine-tuned model; fix model-exclusion comment Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
496 lines
20 KiB
Python
496 lines
20 KiB
Python
# scripts/company_research.py
|
||
"""
|
||
Pre-interview company research generator.
|
||
|
||
Three-phase approach:
|
||
1. If SearXNG is available, use companyScraper.py to fetch live
|
||
data: CEO name, HQ address, LinkedIn, contact info.
|
||
1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
|
||
recent news snippets (funding, launches, leadership changes, etc.).
|
||
2. Feed all real data into an LLM prompt to synthesise a structured brief
|
||
covering company overview, leadership, recent developments, and talking
|
||
points tailored to the candidate.
|
||
|
||
Falls back to pure LLM knowledge when SearXNG is offline.
|
||
|
||
Usage (standalone):
|
||
conda run -n job-seeker python scripts/company_research.py --job-id 42
|
||
conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape
|
||
"""
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
from types import SimpleNamespace
|
||
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
||
from scripts.user_profile import UserProfile
|
||
# Candidate profile (name, NDA masking rules, SearXNG URL, research-brief
# preference flags). Optional: everything falls back to generic defaults
# when config/user.yaml is absent.
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None

# ── SearXNG scraper integration ───────────────────────────────────────────────
# companyScraper is bundled into the Docker image at /app/scrapers/
_SCRAPER_AVAILABLE = False
for _scraper_candidate in [
    Path("/app/scrapers"),  # Docker container path
    Path(__file__).parent.parent / "scrapers",  # local dev fallback
]:
    if _scraper_candidate.exists():
        sys.path.insert(0, str(_scraper_candidate))
        try:
            from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
            _SCRAPER_AVAILABLE = True
        except (ImportError, SystemExit):
            # SystemExit: the scraper may call sys.exit() at import time when
            # its own requirements are missing — treat as "unavailable".
            pass
        # Only the first existing candidate directory is ever tried.
        break


# Base URL of the SearXNG metasearch instance (no trailing slash).
_SEARXNG_URL: str = _profile.searxng_url if _profile else "http://localhost:8888"
|
||
|
||
|
||
def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool:
|
||
"""Quick check whether SearXNG is reachable."""
|
||
try:
|
||
import requests
|
||
r = requests.get(f"{searxng_url}/", timeout=3)
|
||
return r.status_code == 200
|
||
except Exception:
|
||
return False
|
||
|
||
|
||
def _scrape_company(company: str) -> dict:
    """
    Run companyScraper in minimal mode to pull live CEO / HQ / LinkedIn data.

    Returns a dict with keys ``ceo``, ``headquarters`` and ``linkedin``; any
    field the scraper cannot resolve keeps the literal 'Not found' sentinel.
    """
    # companyScraper normally reads argparse flags; fake the namespace here.
    scraper_args = SimpleNamespace(
        mode="minimal",
        verbose=False,
        dry_run=False,
        debug=False,
        use_cache=True,
        save_raw=False,
        target_staff=None,
        include_types=None,
        exclude_types=None,
        include_contact=False,
        include_address=False,
        include_social=True,  # grab LinkedIn while we're at it
        timeout=20,
        input_file=None,
        output_file="/dev/null",
        searxng_url=_SEARXNG_URL + "/",
    )
    # Override the singleton Config URL
    _ScraperConfig.SEARXNG_URL = _SEARXNG_URL + "/"

    scraper = EnhancedCompanyScraper(scraper_args)
    scraper.companies = [company]

    info: dict = {"ceo": "Not found", "headquarters": "Not found", "linkedin": "Not found"}
    for kind in ("ceo", "hq", "social"):
        html = scraper.search_company(company, kind)
        if kind == "ceo":
            info["ceo"] = scraper.extract_ceo(html, company)
        elif kind == "hq":
            info["headquarters"] = scraper.extract_address(html, company)
        else:
            # "social" returns a ';'-joined list — keep only the LinkedIn entry.
            for fragment in (scraper.extract_social(html, company) or "").split(";"):
                if "linkedin" in fragment.lower():
                    info["linkedin"] = fragment.strip()
                    break

    return info
|
||
|
||
|
||
# Query templates for Phase 1b web searches, keyed by search type.
# Placeholders are substituted with str.replace() (not .format()) so company
# names containing curly braces are safe; see _fetch_search_data().
_SEARCH_QUERIES = {
    "news": '"{company}" news 2025 2026',
    "funding": '"{company}" funding round investors Series valuation',
    "tech": '"{company}" tech stack engineering technology platform',
    "competitors": '"{company}" competitors alternatives vs market',
    "culture": '"{company}" glassdoor culture reviews employees',
    "accessibility": '"{company}" ADA accessibility disability inclusion accommodation ERG',
    "ceo_press": '"{ceo}" "{company}"',  # only used if ceo is known
}
|
||
|
||
|
||
def _run_search_query(query: str, results: dict, key: str) -> None:
|
||
"""Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key]."""
|
||
import requests
|
||
|
||
snippets: list[str] = []
|
||
seen: set[str] = set()
|
||
try:
|
||
resp = requests.get(
|
||
f"{_SEARXNG_URL}/search",
|
||
params={"q": query, "format": "json", "language": "en-US"},
|
||
timeout=12,
|
||
)
|
||
if resp.status_code != 200:
|
||
return
|
||
for r in resp.json().get("results", [])[:4]:
|
||
url = r.get("url", "")
|
||
if url in seen:
|
||
continue
|
||
seen.add(url)
|
||
title = r.get("title", "").strip()
|
||
content = r.get("content", "").strip()
|
||
if title or content:
|
||
snippets.append(f"- **{title}**\n {content}\n <{url}>")
|
||
except Exception:
|
||
pass
|
||
results[key] = "\n\n".join(snippets)
|
||
|
||
|
||
def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]:
    """
    Run all search queries in parallel daemon threads.

    Parameters
    ----------
    company : str
        Company name substituted into every query template.
    ceo : str
        CEO name for the "ceo_press" query; that query is skipped when the
        name is empty or the scraper's 'Not found' sentinel.

    Returns
    -------
    dict keyed by search type (news, funding, tech, competitors, culture,
    accessibility, ceo_press). Missing/failed queries produce empty strings.
    """
    import threading

    results: dict[str, str] = {}
    threads: list = []
    keys: list[str] = []

    for key, pattern in _SEARCH_QUERIES.items():
        # BUG FIX: this condition previously parsed as
        #   (key == "ceo_press" and not ceo) or ((ceo or "").lower() == "not found")
        # so whenever the scraper returned the 'Not found' sentinel for the
        # CEO, *every* query was skipped — not just ceo_press.
        if key == "ceo_press" and (not ceo or ceo.lower() == "not found"):
            continue
        # Use replace() not .format() — company names may contain curly braces
        query = pattern.replace("{company}", company).replace("{ceo}", ceo)
        t = threading.Thread(
            target=_run_search_query,
            args=(query, results, key),
            daemon=True,
        )
        threads.append(t)
        keys.append(key)
        t.start()

    for t, key in zip(threads, keys):
        t.join(timeout=15)
        # BUG FIX: the empty-string backfill used to run only when the thread
        # was still alive after the timeout; a thread that died with an
        # exception left its key missing entirely. Backfill unconditionally
        # so the documented contract always holds (setdefault never
        # overwrites a real result).
        results.setdefault(key, "")

    return results
|
||
|
||
|
||
def _parse_sections(text: str) -> dict[str, str]:
|
||
"""Split LLM markdown output on ## headers into named sections."""
|
||
sections: dict[str, str] = {}
|
||
pattern = re.compile(r"^##\s+(.+)$", re.MULTILINE)
|
||
matches = list(pattern.finditer(text))
|
||
for i, match in enumerate(matches):
|
||
name = match.group(1).strip()
|
||
start = match.end()
|
||
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
|
||
sections[name] = text[start:end].strip()
|
||
return sections
|
||
|
||
|
||
# Candidate resume (aihawk plain-text format) and keyword config, used to
# score which experiences best match a given job description.
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||
|
||
|
||
def _company_label(exp: dict) -> str:
    """Display label for an experience's employer, NDA-masked via the profile."""
    employer = exp.get("company", "")
    if not _profile:
        # No profile configured → no masking rules to apply.
        return employer
    return _profile.nda_label(employer, exp.get("score", 0))
|
||
|
||
|
||
def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
|
||
"""Score each experience entry by keyword overlap with JD; return sorted descending."""
|
||
jd_lower = jd.lower()
|
||
scored = []
|
||
for exp in experiences:
|
||
text = " ".join([
|
||
exp.get("position", ""),
|
||
exp.get("company", ""),
|
||
" ".join(
|
||
v
|
||
for resp in exp.get("key_responsibilities", [])
|
||
for v in resp.values()
|
||
),
|
||
]).lower()
|
||
score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower)
|
||
scored.append({**exp, "score": score})
|
||
return sorted(scored, key=lambda x: x["score"], reverse=True)
|
||
|
||
|
||
def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    """
    Build the resume section of the LLM context block.

    The two highest-scoring experiences are expanded with full bullets; the
    remainder collapse into a single comma-separated line. NDA-protected
    employers are masked via _company_label(). Returns "" when the resume
    has no experience entries.
    """
    experiences = resume.get("experience_details", [])
    if not experiences:
        return ""

    ranked = _score_experiences(experiences, keywords, jd)
    detailed, remainder = ranked[:2], ranked[2:]

    candidate = _profile.name if _profile else "the candidate"

    def _header(exp: dict) -> str:
        return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"

    def _bullets(exp: dict) -> str:
        items = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
        return "\n".join(f" - {b}" for b in items)

    parts = [f"## {candidate}'s Matched Experience"]
    for exp in detailed:
        parts.append(f"\n**{_header(exp)}** (match score: {exp['score']})")
        parts.append(_bullets(exp))

    if remainder:
        summary = ", ".join(_header(e) for e in remainder)
        parts.append(f"\nAlso in {candidate}'s background: {summary}")

    return "\n".join(parts)
|
||
|
||
|
||
def _load_resume_and_keywords() -> tuple[dict, list[str]]:
    """Load resume YAML and keyword config; missing files yield empty values."""
    import yaml as _yaml

    resume: dict = {}
    if _RESUME_YAML.exists():
        resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {}

    keywords: list[str] = []
    if _KEYWORDS_YAML.exists():
        cfg = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {}
        # The keywords file is {category: [keyword, ...]}; flatten all lists.
        for value in cfg.values():
            if isinstance(value, list):
                keywords.extend(value)

    return resume, keywords
|
||
|
||
|
||
def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict:
    """
    Generate a pre-interview research brief for a job.

    Parameters
    ----------
    job : dict
        Job row from the DB (needs at least 'company', 'title', 'description').
    use_scraper : bool
        Whether to attempt live data via SearXNG before falling back to LLM.
    on_stage : callable | None
        Optional progress callback receiving a human-readable stage message;
        exceptions it raises are swallowed so UI glitches never kill the task.

    Returns
    -------
    dict with keys: raw_output, company_brief, ceo_brief, tech_brief,
    funding_brief, competitors_brief, red_flags, accessibility_brief,
    talking_points, scrape_used
    """
    from scripts.llm_router import LLMRouter

    router = LLMRouter()
    # Research may use a dedicated model fallback chain; default to global.
    research_order = router.config.get("research_fallback_order") or router.config["fallback_order"]
    company = job.get("company") or "the company"
    title = job.get("title") or "this role"
    jd_excerpt = (job.get("description") or "")[:1500]

    resume, keywords = _load_resume_and_keywords()
    matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()]
    resume_context = _build_resume_context(resume, keywords, jd_excerpt)
    keywords_note = (
        f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
        if matched_keywords else ""
    )

    def _stage(msg: str) -> None:
        # Progress reporting is strictly best-effort.
        if on_stage:
            try:
                on_stage(msg)
            except Exception:
                pass  # never let stage callbacks break the task

    # ── Phase 1: live scrape (optional) ──────────────────────────────────────
    live_data: dict = {}
    scrape_note = ""
    _stage("Checking for live company data…")
    if use_scraper and _SCRAPER_AVAILABLE and _searxng_running(_SEARXNG_URL):
        _stage("Scraping CEO & HQ data…")
        try:
            live_data = _scrape_company(company)
            parts = []
            if live_data.get("ceo") not in (None, "Not found"):
                parts.append(f"CEO: {live_data['ceo']}")
            if live_data.get("headquarters") not in (None, "Not found"):
                parts.append(f"HQ: {live_data['headquarters']}")
            if live_data.get("linkedin") not in (None, "Not found"):
                parts.append(f"LinkedIn: {live_data['linkedin']}")
            if parts:
                scrape_note = (
                    "\n\n**Live data retrieved via SearXNG:**\n"
                    + "\n".join(f"- {p}" for p in parts)
                    + "\n\nIncorporate these facts where relevant."
                )
        # BUG FIX: was `except BaseException`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the task uncancellable.
        except Exception as e:
            scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_"

    # ── Phase 1b: parallel search queries ────────────────────────────────────
    search_data: dict[str, str] = {}
    _stage("Running web searches…")
    if use_scraper and _searxng_running(_SEARXNG_URL):
        _stage("Running web searches (news, funding, tech, culture)…")
        try:
            ceo_name = (live_data.get("ceo") or "") if live_data else ""
            search_data = _fetch_search_data(company, ceo=ceo_name)
        except Exception:  # BUG FIX: was BaseException (see note above)
            pass  # best-effort; never fail the whole task

    # Track whether SearXNG actually contributed usable data to this brief.
    scrape_used = 1 if (live_data or any(v.strip() for v in search_data.values())) else 0

    def _section_note(key: str, label: str) -> str:
        # Render one live-search section, or nothing when the query was empty.
        text = search_data.get(key, "").strip()
        return f"\n\n## {label} (live web search)\n\n{text}" if text else ""

    news_note = _section_note("news", "News & Press")
    funding_note = _section_note("funding", "Funding & Investors")
    tech_note = _section_note("tech", "Tech Stack")
    competitors_note = _section_note("competitors", "Competitors")
    culture_note = _section_note("culture", "Culture & Employee Signals")
    accessibility_note = _section_note("accessibility", "Accessibility & Disability Inclusion")
    ceo_press_note = _section_note("ceo_press", "CEO in the News")

    # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
    _stage("Generating brief with LLM… (30–90 seconds)")
    name = _profile.name if _profile else "the candidate"
    career_summary = _profile.career_summary if _profile else ""
    accessibility_focus = _profile.candidate_accessibility_focus if _profile else False
    lgbtq_focus = _profile.candidate_lgbtq_focus if _profile else False
    # 7 base sections plus one per opt-in inclusion section (max 9).
    _section_count = 7 + (1 if accessibility_focus else 0) + (1 if lgbtq_focus else 0)

    _accessibility_section = """
## Inclusion & Accessibility
Assess {company}'s commitment to disability inclusion and accessibility. Cover:
- ADA accommodation language in job postings or company policy
- Disability Employee Resource Group (ERG) or affinity group
- Product or service accessibility (WCAG compliance, adaptive features, AT integrations)
- Any public disability/accessibility advocacy, partnerships, or certifications
- Glassdoor or press signals about how employees with disabilities experience the company
If no specific signals are found, say so clearly — absence of public commitment is itself signal.
This section is for the candidate's personal decision-making only and will not appear in any application.
""".format(company=company) if accessibility_focus else ""

    _lgbtq_section = """
## LGBTQIA+ Inclusion
Assess {company}'s culture and policies around LGBTQIA+ inclusion. Cover:
- Non-discrimination policies that explicitly include sexual orientation and gender identity
- LGBTQIA+ Employee Resource Group (ERG) or Pride Network
- Benefits that support LGBTQIA+ employees (gender-affirming care, domestic partner benefits)
- Public statements, donations, or advocacy (Pride sponsorships, HRC Corporate Equality Index rating)
- Glassdoor or press signals about how LGBTQIA+ employees experience the company day-to-day
If no specific signals are found, say so clearly — absence of public commitment is itself signal.
This section is for the candidate's personal decision-making only and will not appear in any application.
""".format(company=company) if lgbtq_focus else ""

    prompt = f"""You are preparing {name} for a job interview.
{f"Candidate background: {career_summary}" if career_summary else ""}

Role: **{title}** at **{company}**

## Job Description
{jd_excerpt}
{resume_context}{keywords_note}

## Live Company Data
{scrape_note.strip() or "_(scrape unavailable)_"}
{news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{accessibility_note}{ceo_press_note}

---

Produce a structured research brief using **exactly** these {_section_count} markdown section headers
(include all {_section_count} even if a section has limited data — say so honestly):

## Company Overview
What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning.

## Leadership & Culture
CEO background and leadership style, key execs, mission/values statements, Glassdoor themes.

## Tech Stack & Product
Technologies, platforms, and product direction relevant to the {title} role.

## Funding & Market Position
Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape.

## Recent Developments
News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months.
Draw on the live snippets above; if none available, note what is publicly known.

## Red Flags & Watch-outs
Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call.
If nothing notable, write "No significant red flags identified."

{_lgbtq_section}{_accessibility_section}
## Talking Points for {name}
Five specific talking points for the phone screen. Each must:
- Reference a concrete experience from {name}'s matched background by name
  (NDA rule: use the masked label shown in the matched experience section for any NDA-protected employer)
- Connect to a specific signal from the JD or company context above
- Be 1–2 sentences, ready to speak aloud
- Never give generic advice

---
⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call.
"""

    raw = router.complete(prompt, fallback_order=research_order)
    # Strip <think>…</think> blocks emitted by reasoning models (e.g. DeepSeek, Qwen-R)
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    sections = _parse_sections(raw)

    return {
        "raw_output": raw,
        "company_brief": sections.get("Company Overview", ""),
        "ceo_brief": sections.get("Leadership & Culture", ""),
        "tech_brief": sections.get("Tech Stack & Product", ""),
        "funding_brief": sections.get("Funding & Market Position", ""),
        "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
        "red_flags": sections.get("Red Flags & Watch-outs", ""),
        "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
        # NOTE(review): the parsed "LGBTQIA+ Inclusion" section is currently
        # only surfaced via raw_output — confirm whether save_research()
        # accepts/persists a dedicated column before adding a key here.
        "talking_points": sections.get(f"Talking Points for {name}", ""),
        "scrape_used": scrape_used,
    }
|
||
|
||
|
||
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate company research brief")
    parser.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db")
    parser.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape")
    args = parser.parse_args()

    from scripts.db import DEFAULT_DB, init_db, save_research
    import sqlite3

    # Ensure the schema exists, then fetch the requested job row.
    init_db(DEFAULT_DB)
    conn = sqlite3.connect(DEFAULT_DB)
    conn.row_factory = sqlite3.Row  # name-based column access
    row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone()
    conn.close()

    if not row:
        sys.exit(f"Job {args.job_id} not found in {DEFAULT_DB}")

    job = dict(row)
    print(f"Researching: {job['title']} @ {job['company']} …\n")
    if _SCRAPER_AVAILABLE and not args.no_scrape:
        print(f"SearXNG available: {_searxng_running(_SEARXNG_URL)}")

    # Generate the brief, persist it, and echo the raw markdown to stdout.
    result = research_company(job, use_scraper=not args.no_scrape)
    save_research(DEFAULT_DB, job_id=args.job_id, **result)
    print(result["raw_output"])
    print(f"\n[Saved to company_research for job {args.job_id}]")
|