# scripts/company_research.py
"""
Pre-interview company research generator.
Three-phase approach:
1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
data: CEO name, HQ address, LinkedIn, contact info.
1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
recent news snippets (funding, launches, leadership changes, etc.).
2. Feed all real data into an LLM prompt to synthesise a structured brief
covering company overview, leadership, recent developments, and talking
points tailored to Alex.
Falls back to pure LLM knowledge when SearXNG is offline.
Usage (standalone):
conda run -n job-seeker python scripts/company_research.py --job-id 42
conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape
"""
import re
import sys
from pathlib import Path
from types import SimpleNamespace
sys.path.insert(0, str(Path(__file__).parent.parent))
# ── SearXNG scraper integration ───────────────────────────────────────────────
# Optional dependency: companyScraper is imported from a fixed local checkout.
# When the directory or the import is unavailable the module still loads and
# _SCRAPER_AVAILABLE simply stays False.
_SCRAPER_DIR = Path("/Library/Development/scrapers")
_SCRAPER_AVAILABLE = False
if _SCRAPER_DIR.exists():
    # Put the checkout first on sys.path so its modules can be imported.
    sys.path.insert(0, str(_SCRAPER_DIR))
    try:
        from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
    except (ImportError, SystemExit):
        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't
        # installed, so SystemExit is handled like a failed import.
        pass
    else:
        _SCRAPER_AVAILABLE = True
def _searxng_running() -> bool:
    """Return True only if the local SearXNG root URL answers with HTTP 200.

    Every failure (``requests`` not importable, connection refused, timeout,
    ...) is reported as False rather than raised.
    """
    reachable = False
    try:
        import requests

        reachable = requests.get("http://localhost:8888/", timeout=3).status_code == 200
    except Exception:
        # Best-effort probe: any error means "not available".
        pass
    return reachable
def _scrape_company(company: str) -> dict:
    """
    Run companyScraper in minimal mode against the local SearXNG instance.

    Performs three searches in order ("ceo", "hq", "social") and returns a dict
    with the keys ``ceo``, ``headquarters`` and ``linkedin``. ``ceo`` and
    ``headquarters`` hold whatever the scraper's extractors return;
    ``linkedin`` is the first ';'-separated social entry mentioning LinkedIn,
    or 'Not found' when there is none.
    """
    searxng_url = "http://localhost:8888/"

    # The scraper expects an argparse-style namespace; build one by hand.
    scraper_args = SimpleNamespace(
        mode="minimal",
        verbose=False,
        dry_run=False,
        debug=False,
        use_cache=True,
        save_raw=False,
        target_staff=None,
        include_types=None,
        exclude_types=None,
        include_contact=False,
        include_address=False,
        include_social=True,  # grab LinkedIn while we're at it
        timeout=20,
        input_file=None,
        output_file="/dev/null",
        searxng_url=searxng_url,
    )

    # Override the singleton Config URL
    _ScraperConfig.SEARXNG_URL = searxng_url

    scraper = EnhancedCompanyScraper(scraper_args)
    scraper.companies = [company]

    # Each lookup is a search followed by the matching extractor.
    ceo = scraper.extract_ceo(scraper.search_company(company, "ceo"), company)
    headquarters = scraper.extract_address(scraper.search_company(company, "hq"), company)
    social = scraper.extract_social(scraper.search_company(company, "social"), company)

    # Pull out just the LinkedIn entry from the social string.
    linkedin = next(
        (entry.strip() for entry in (social or "").split(";") if "linkedin" in entry.lower()),
        "Not found",
    )

    return {"ceo": ceo, "headquarters": headquarters, "linkedin": linkedin}
# Query templates keyed by search type. The literal placeholders "{company}"
# and "{ceo}" are filled in by the code that runs the searches.
# NOTE(review): the "news" template hard-codes the years 2025 and 2026 and
# will go stale — confirm whether it should be derived from the current date.
_SEARCH_QUERIES = dict(
    news='"{company}" news 2025 2026',
    funding='"{company}" funding round investors Series valuation',
    tech='"{company}" tech stack engineering technology platform',
    competitors='"{company}" competitors alternatives vs market',
    culture='"{company}" glassdoor culture reviews employees',
    accessibility='"{company}" ADA accessibility disability inclusion accommodation ERG',
    ceo_press='"{ceo}" "{company}"',  # only used if ceo is known
)
def _run_search_query(query: str, results: dict, key: str) -> None:
    """Thread target: run one SearXNG JSON query and store its snippets.

    Looks at the first 4 results, skips repeated URLs, and formats each
    remaining hit as a markdown bullet (title, content, URL).

    ``results[key]`` is ALWAYS written before returning: the snippets joined
    by blank lines, or an empty string when the request fails, returns a
    non-200 status, or yields nothing usable. This function never raises.

    Parameters
    ----------
    query : str
        Search string sent as the ``q`` parameter.
    results : dict
        Shared output mapping, mutated in place.
    key : str
        Slot in ``results`` to fill.
    """
    snippets: list[str] = []
    seen_urls: set[str] = set()
    try:
        # Imported inside the try so that a missing ``requests`` install
        # degrades to "no snippets" instead of killing the worker thread.
        import requests

        resp = requests.get(
            "http://localhost:8888/search",
            params={"q": query, "format": "json", "language": "en-US"},
            timeout=12,
        )
        if resp.status_code == 200:
            for hit in resp.json().get("results", [])[:4]:
                # ``or ""`` guards against explicit JSON nulls, which would
                # otherwise raise on .strip() and abort the remaining hits.
                url = hit.get("url") or ""
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = (hit.get("title") or "").strip()
                content = (hit.get("content") or "").strip()
                if title or content:
                    snippets.append(f"- **{title}**\n {content}\n <{url}>")
    except Exception:
        # Best-effort: keep whatever was collected before the failure.
        pass
    results[key] = "\n\n".join(snippets)
def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]:
    """
    Run all search queries in parallel threads.

    Returns a dict keyed by search type (news, funding, tech, competitors,
    culture, accessibility, and ceo_press when a CEO name is known).
    Every query that was started has a key in the result; failed or timed-out
    queries produce empty strings. The ``ceo_press`` query is skipped entirely
    when ``ceo`` is empty or the scraper placeholder "Not found".
    """
    import threading

    ceo = (ceo or "").strip()
    ceo_known = bool(ceo) and ceo.lower() != "not found"

    results: dict[str, str] = {}
    workers: list[tuple[str, threading.Thread]] = []

    for key, pattern in _SEARCH_QUERIES.items():
        # Only the CEO query depends on the CEO name. (The previous condition
        # `a and b or c` parsed as `(a and b) or c`, so a "Not found" CEO
        # silently skipped *every* query.)
        if key == "ceo_press" and not ceo_known:
            continue
        # Use replace() not .format() — company names may contain curly braces
        query = pattern.replace("{company}", company).replace("{ceo}", ceo)
        worker = threading.Thread(
            target=_run_search_query,
            args=(query, results, key),
            daemon=True,
        )
        workers.append((key, worker))
        worker.start()

    for key, worker in workers:
        worker.join(timeout=15)
        # Whether the worker timed out, died, or finished without writing,
        # make sure the key exists so the "failed → empty string" contract
        # holds. A real result that is already present is left untouched.
        results.setdefault(key, "")

    # Hand back a snapshot so a straggler thread cannot mutate the caller's dict.
    return dict(results)
def _parse_sections(text: str) -> dict[str, str]:
    """Map each level-2 markdown header (``## Name``) to the text below it.

    Text before the first header is discarded. Header names and bodies are
    whitespace-stripped. When a header name repeats, the last body wins.
    """
    header = re.compile(r"^##\s+(.+)$", re.MULTILINE)
    # Splitting on a pattern with one capture group yields
    # [preamble, name1, body1, name2, body2, ...].
    pieces = header.split(text)
    return {
        name.strip(): body.strip()
        for name, body in zip(pieces[1::2], pieces[2::2])
    }
# Input files, resolved relative to the directory two levels above this file.
_RESUME_YAML = Path(__file__).parent.parent.joinpath(
    "aihawk", "data_folder", "plain_text_resume.yaml"
)
_KEYWORDS_YAML = Path(__file__).parent.parent.joinpath("config", "resume_keywords.yaml")

# Companies where Alex has an NDA — reference as generic label unless
# the role is security-focused (score >= 3 matching JD keywords).
# Entries are lowercase; lookups lowercase the company name first.
_NDA_COMPANIES = {"upguard"}
def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
    """Score each experience entry by keyword overlap with the JD.

    A keyword counts for an entry when it appears (case-insensitive substring)
    both in the job description and in the entry's position, company, or
    responsibility text. Returns new dicts (the inputs are not mutated), each
    a copy of the entry plus a ``score`` key, sorted by score descending;
    ties keep their original relative order.

    Null or non-string fields (e.g. an empty ``company:`` in the YAML, or a
    null ``key_responsibilities``) are treated as empty text instead of
    raising.
    """
    jd_lower = (jd or "").lower()
    # Only keywords present in the JD can ever score; lowercase and filter
    # them once instead of once per experience.
    jd_keywords = [
        kw_lower
        for kw_lower in (kw.lower() for kw in keywords)
        if kw_lower in jd_lower
    ]

    scored = []
    for exp in experiences:
        responsibilities = exp.get("key_responsibilities") or []
        text = " ".join([
            str(exp.get("position") or ""),
            str(exp.get("company") or ""),
            " ".join(
                str(value)
                for resp in responsibilities
                for value in resp.values()
                if value is not None
            ),
        ]).lower()
        score = sum(1 for kw_lower in jd_keywords if kw_lower in text)
        scored.append({**exp, "score": score})

    # sorted() is stable, so equal scores preserve resume order.
    return sorted(scored, key=lambda item: item["score"], reverse=True)
def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    """
    Build the resume section of the LLM context block.

    Experiences are ranked by keyword overlap with the JD. The top 2 are
    included in full detail (header, match score, bullet list); the rest are
    condensed into a single "Also in Alex's background" line.

    Applies the NDA rule: a company listed in ``_NDA_COMPANIES`` is referenced
    as 'enterprise security vendor (NDA)' unless the role is security-focused
    (score >= 3).

    Returns an empty string when the resume has no ``experience_details``.
    Null fields in an experience entry are rendered as empty text rather than
    raising or printing "None".
    """
    experiences = resume.get("experience_details") or []
    if not experiences:
        return ""

    scored = _score_experiences(experiences, keywords, jd)
    top2, rest = scored[:2], scored[2:]

    def _company_label(exp: dict) -> str:
        company = str(exp.get("company") or "")
        # Match case- and whitespace-insensitively so the NDA mask cannot be
        # bypassed by formatting differences in the resume file.
        if company.strip().lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
            return "enterprise security vendor (NDA)"
        return company

    def _exp_header(exp: dict) -> str:
        position = exp.get("position") or ""
        period = exp.get("employment_period") or ""
        return f"{position} @ {_company_label(exp)} ({period})"

    def _exp_bullets(exp: dict) -> str:
        bullets = [
            value
            for resp in (exp.get("key_responsibilities") or [])
            for value in resp.values()
            if value is not None
        ]
        return "\n".join(f" - {b}" for b in bullets)

    lines = ["## Alex's Matched Experience"]
    for exp in top2:
        lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
        lines.append(_exp_bullets(exp))
    if rest:
        condensed = ", ".join(_exp_header(e) for e in rest)
        lines.append(f"\nAlso in Alex's background: {condensed}")
    return "\n".join(lines)
def _load_resume_and_keywords() -> tuple[dict, list[str]]:
    """Load the resume YAML and the keywords config.

    Returns ``(resume_dict, all_keywords_list)``. A missing, empty, or
    non-mapping file yields an empty dict / list instead of raising.
    The keyword list is the concatenation of every list-valued entry in the
    keywords file; non-string items are dropped because callers lowercase
    each keyword.
    """
    import yaml as _yaml

    def _load_mapping(path: Path) -> dict:
        """Parse *path* as YAML and return it only if its top level is a mapping."""
        if not path.exists():
            return {}
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        data = _yaml.safe_load(path.read_text(encoding="utf-8"))
        return data if isinstance(data, dict) else {}

    resume = _load_mapping(_RESUME_YAML)

    keywords: list[str] = []
    for group in _load_mapping(_KEYWORDS_YAML).values():
        if isinstance(group, list):
            keywords.extend(kw for kw in group if isinstance(kw, str))
    return resume, keywords
def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict:
    """
    Generate a pre-interview research brief for a job.

    Parameters
    ----------
    job : dict
        Job row from the DB (needs at least 'company', 'title', 'description').
        Only the first 1500 characters of the description are used.
    use_scraper : bool
        Whether to attempt live data via SearXNG before falling back to LLM.
    on_stage : callable, optional
        Called with a short progress message (str) at each stage. Exceptions
        raised by the callback are ignored.

    Returns
    -------
    dict with keys: raw_output, company_brief, ceo_brief, tech_brief,
        funding_brief, competitors_brief, red_flags, accessibility_brief,
        talking_points, scrape_used.
        ``scrape_used`` is 1 when the live scrape or the web searches
        contributed at least one usable fact/snippet, else 0.
        ``competitors_brief`` intentionally repeats ``funding_brief``.
    """
    from scripts.llm_router import LLMRouter

    router = LLMRouter()
    research_order = router.config.get("research_fallback_order") or router.config["fallback_order"]

    company = job.get("company") or "the company"
    title = job.get("title") or "this role"
    jd_excerpt = (job.get("description") or "")[:1500]

    resume, keywords = _load_resume_and_keywords()
    jd_lower = jd_excerpt.lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in jd_lower]
    resume_context = _build_resume_context(resume, keywords, jd_excerpt)
    keywords_note = (
        f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
        if matched_keywords else ""
    )

    def _stage(msg: str) -> None:
        if on_stage:
            try:
                on_stage(msg)
            except Exception:
                pass  # never let stage callbacks break the task

    def _found(value) -> bool:
        """True when a scraped field holds a real value, not a placeholder."""
        return value not in (None, "", "Not found")

    # ── Phase 1: live scrape (optional) ──────────────────────────────────────
    live_data: dict = {}
    live_facts: list[str] = []
    scrape_note = ""
    _stage("Checking for live company data…")
    # Probe SearXNG once and reuse the answer for both live-data phases.
    searxng_up = bool(use_scraper) and _searxng_running()
    if searxng_up and _SCRAPER_AVAILABLE:
        _stage("Scraping CEO & HQ data…")
        try:
            live_data = _scrape_company(company)
            if _found(live_data.get("ceo")):
                live_facts.append(f"CEO: {live_data['ceo']}")
            if _found(live_data.get("headquarters")):
                live_facts.append(f"HQ: {live_data['headquarters']}")
            if _found(live_data.get("linkedin")):
                live_facts.append(f"LinkedIn: {live_data['linkedin']}")
            if live_facts:
                scrape_note = (
                    "\n\n**Live data retrieved via SearXNG:**\n"
                    + "\n".join(f"- {p}" for p in live_facts)
                    + "\n\nIncorporate these facts where relevant."
                )
        except (Exception, SystemExit) as e:
            # SystemExit is caught because companyScraper exits on missing
            # dependencies; KeyboardInterrupt is deliberately NOT swallowed.
            scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_"

    # ── Phase 1b: parallel search queries ────────────────────────────────────
    search_data: dict[str, str] = {}
    _stage("Running web searches…")
    if searxng_up:
        _stage("Running web searches (news, funding, tech, culture)…")
        try:
            # Never forward the scraper's "Not found" placeholder as a name.
            ceo_raw = live_data.get("ceo") if live_data else None
            ceo_name = (
                ceo_raw.strip()
                if isinstance(ceo_raw, str) and ceo_raw.strip().lower() != "not found"
                else ""
            )
            search_data = _fetch_search_data(company, ceo=ceo_name)
        except Exception:
            pass  # best-effort; never fail the whole task

    # Track whether SearXNG actually contributed usable data to this brief:
    # a scrape that returned only placeholders does not count.
    scrape_used = 1 if (live_facts or any((v or "").strip() for v in search_data.values())) else 0

    def _section_note(key: str, label: str) -> str:
        text = (search_data.get(key) or "").strip()
        return f"\n\n## {label} (live web search)\n\n{text}" if text else ""

    # Order here is the order the snippets appear in the prompt.
    search_notes = "".join(
        _section_note(key, label)
        for key, label in (
            ("news", "News & Press"),
            ("funding", "Funding & Investors"),
            ("tech", "Tech Stack"),
            ("competitors", "Competitors"),
            ("culture", "Culture & Employee Signals"),
            ("accessibility", "Accessibility & Disability Inclusion"),
            ("ceo_press", "CEO in the News"),
        )
    )

    # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
    _stage("Generating brief with LLM… (30–90 seconds)")
    # The section headers requested below must stay in sync with the
    # sections.get(...) lookups at the end of this function.
    prompt = f"""You are preparing Alex Rivera for a job interview.
Role: **{title}** at **{company}**
## Job Description
{jd_excerpt}
{resume_context}{keywords_note}
## Live Company Data
{scrape_note.strip() or "_(scrape unavailable)_"}
{search_notes}
---
Produce a structured research brief using **exactly** these eight markdown section headers
(include all eight even if a section has limited data — say so honestly):
## Company Overview
What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning.
## Leadership & Culture
CEO background and leadership style, key execs, mission/values statements, Glassdoor themes.
## Tech Stack & Product
Technologies, platforms, and product direction relevant to the {title} role.
## Funding & Market Position
Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape.
## Recent Developments
News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months.
Draw on the live snippets above; if none available, note what is publicly known.
## Red Flags & Watch-outs
Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call.
If nothing notable, write "No significant red flags identified."
## Inclusion & Accessibility
Assess {company}'s commitment to disability inclusion and accessibility. Cover:
- ADA accommodation language in job postings or company policy
- Disability Employee Resource Group (ERG) or affinity group
- Product or service accessibility (WCAG compliance, adaptive features, AT integrations)
- Any public disability/accessibility advocacy, partnerships, or certifications
- Glassdoor or press signals about how employees with disabilities experience the company
If no specific signals are found, say so clearly — absence of public commitment is itself signal.
This section is for Alex's personal decision-making only and will not appear in any application.
## Talking Points for Alex
Five specific talking points for the phone screen. Each must:
- Reference a concrete experience from Alex's matched background by name
(UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
- Connect to a specific signal from the JD or company context above
- Be 1–2 sentences, ready to speak aloud
- Never give generic advice
---
⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call.
"""
    raw = router.complete(prompt, fallback_order=research_order)
    # Strip <think>…</think> blocks emitted by reasoning models (e.g. DeepSeek,
    # Qwen-R). The tag delimiters are required: a bare ".*?" only matches empty
    # strings and strips nothing.
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()

    sections = _parse_sections(raw)
    return {
        "raw_output": raw,
        "company_brief": sections.get("Company Overview", ""),
        "ceo_brief": sections.get("Leadership & Culture", ""),
        "tech_brief": sections.get("Tech Stack & Product", ""),
        "funding_brief": sections.get("Funding & Market Position", ""),
        "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
        "red_flags": sections.get("Red Flags & Watch-outs", ""),
        "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
        "talking_points": sections.get("Talking Points for Alex", ""),
        "scrape_used": scrape_used,
    }
# Standalone entry point: load one job row from the staging DB, generate its
# research brief, persist it, and echo the raw LLM output.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate company research brief")
    parser.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db")
    parser.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape")
    args = parser.parse_args()

    # Project/DB imports come after argument parsing so that --help and usage
    # errors do not depend on them.
    from scripts.db import DEFAULT_DB, init_db, save_research
    import sqlite3

    init_db(DEFAULT_DB)
    conn = sqlite3.connect(DEFAULT_DB)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    if not row:
        sys.exit(f"Job {args.job_id} not found in {DEFAULT_DB}")

    job = dict(row)
    print(f"Researching: {job['title']} @ {job['company']} …\n")
    if _SCRAPER_AVAILABLE and not args.no_scrape:
        print(f"SearXNG available: {_searxng_running()}")

    result = research_company(job, use_scraper=not args.no_scrape)
    save_research(DEFAULT_DB, job_id=args.job_id, **result)
    print(result["raw_output"])
    print(f"\n[Saved to company_research for job {args.job_id}]")