peregrine/scripts/discover.py
pyr0ball 0e4fce44c4 feat: shadow listing detector, hired feedback widget, contacts manager
Shadow listing detector (#95):
- Capture date_posted from JobSpy in discover.py + insert_job()
- Add date_posted migration to _MIGRATIONS
- _shadow_score() heuristic: 'shadow' (≥30 days stale), 'stale' (≥14 days)
- list_jobs() computes shadow_score per listing
- JobCard.vue: 'Ghost post' and 'Stale' badges with tooltip

Post-hire feedback widget (#91):
- Add hired_feedback migration to _MIGRATIONS
- POST /api/jobs/:id/hired-feedback endpoint
- InterviewCard.vue: optional widget on hired cards with factor
  checkboxes + freetext; dismissible; shows saved state
- PipelineJob interface extended with hired_feedback field

Contacts manager (#73):
- GET /api/contacts endpoint with job join, direction/search filters
- New ContactsView.vue: searchable table, inbound/outbound filter,
  signal chip column, job link
- Route /contacts added; Contacts nav link (UsersIcon) in AppNav

Also: add git to Dockerfile apt-get for circuitforge-core editable install
2026-04-15 08:34:12 -07:00

365 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# scripts/discover.py
"""
JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True).
Usage:
conda run -n job-seeker python scripts/discover.py
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import yaml
from datetime import datetime
import pandas as pd
from jobspy import scrape_jobs
from notion_client import Client
from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
from scripts.custom_boards import adzuna as _adzuna
from scripts.custom_boards import theladders as _theladders
from scripts.custom_boards import craigslist as _craigslist
# Config files live in <repo-root>/config, resolved relative to this script.
CONFIG_DIR = Path(__file__).parent.parent / "config"
NOTION_CFG = CONFIG_DIR / "notion.yaml"  # Notion token / database / field-map config
PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml"  # search profile definitions
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"  # global company/industry/location blocklist
# Registry of custom board scrapers keyed by name used in search_profiles.yaml
CUSTOM_SCRAPERS: dict[str, object] = {
    "adzuna": _adzuna.scrape,
    "theladders": _theladders.scrape,
    "craigslist": _craigslist.scrape,
}
def _normalize_profiles(raw: dict) -> dict:
"""Normalize search_profiles.yaml to the canonical {profiles: [...]} format.
The onboarding wizard (pre-fix) wrote a flat `default: {...}` structure.
Canonical format is `profiles: [{name, titles/job_titles, boards, ...}]`.
This converts on load so both formats work without a migration.
"""
if "profiles" in raw:
return raw
# Wizard-written format: top-level keys are profile names (usually "default")
profiles = []
for name, body in raw.items():
if not isinstance(body, dict):
continue
# job_boards: [{name, enabled}] → boards: [name] (enabled only)
job_boards = body.pop("job_boards", None)
if job_boards and "boards" not in body:
body["boards"] = [b["name"] for b in job_boards if b.get("enabled", True)]
# blocklist_* keys live in load_blocklist, not per-profile — drop them
body.pop("blocklist_companies", None)
body.pop("blocklist_industries", None)
body.pop("blocklist_locations", None)
profiles.append({"name": name, **body})
return {"profiles": profiles}
def load_config(config_dir: Path | None = None) -> tuple[dict, dict]:
    """Load search profiles and Notion settings from *config_dir*.

    Falls back to the module-level CONFIG_DIR when no directory is given.
    The profiles mapping is normalized to the canonical {profiles: [...]}
    shape; a missing notion.yaml yields an empty stub config so SQLite-only
    runs work without Notion credentials.
    """
    base = config_dir or CONFIG_DIR
    profiles = _normalize_profiles(
        yaml.safe_load((base / "search_profiles.yaml").read_text()) or {}
    )
    notion_path = base / "notion.yaml"
    if notion_path.exists():
        notion_cfg = yaml.safe_load(notion_path.read_text())
    else:
        notion_cfg = {"field_map": {}, "token": None, "database_id": None}
    return profiles, notion_cfg
def load_blocklist(config_dir: Path | None = None) -> dict:
    """Load global blocklist config. Returns dict with companies, industries, locations lists."""
    path = (config_dir or CONFIG_DIR) / "blocklist.yaml"
    if not path.exists():
        # No blocklist file → empty rules, nothing gets filtered.
        return {"companies": [], "industries": [], "locations": []}
    data = yaml.safe_load(path.read_text()) or {}
    # Lowercase every non-empty entry up front so matching is case-insensitive.
    return {
        key: [entry.lower() for entry in data.get(key, []) if entry]
        for key in ("companies", "industries", "locations")
    }
def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
"""Return True if this job matches any global blocklist rule."""
company_lower = (job_row.get("company") or "").lower()
location_lower = (job_row.get("location") or "").lower()
desc_lower = (job_row.get("description") or "").lower()
content_lower = f"{company_lower} {desc_lower}"
if any(bl in company_lower for bl in blocklist["companies"]):
return True
if any(bl in content_lower for bl in blocklist["industries"]):
return True
if any(bl in location_lower for bl in blocklist["locations"]):
return True
return False
def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Return the set of all job URLs already tracked in Notion (for notion_push mode).

    Walks the database with cursor pagination (100 pages per request) and
    collects the URL property named by *url_field*, skipping pages where it
    is missing or empty.
    """
    urls: set[str] = set()
    cursor = None
    while True:
        query: dict = {"database_id": db_id, "page_size": 100}
        if cursor:
            query["start_cursor"] = cursor
        resp = notion.databases.query(**query)
        for page in resp["results"]:
            link = page["properties"].get(url_field, {}).get("url")
            if link:
                urls.add(link)
        if not resp.get("has_more", False):
            break
        cursor = resp.get("next_cursor")
    return urls
def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing.

    The page title is the formatted salary range when both numeric bounds are
    present, else the board-reported salary_source string, else the job title.
    *fm* maps logical field names to Notion property names (from notion.yaml).
    """
    lo = job.get("min_amount")
    hi = job.get("max_amount")
    if lo and hi and not (pd.isna(lo) or pd.isna(hi)):
        title_content = f"${int(lo):,} ${int(hi):,}"
    elif job.get("salary_source") and str(job["salary_source"]) not in ("nan", "None", ""):
        title_content = str(job["salary_source"])
    else:
        title_content = str(job.get("title", "Unknown"))
    # Scrub pandas NaN stringifications out of the URL field.
    job_url = str(job.get("job_url", "") or "")
    if job_url in ("nan", "None"):
        job_url = ""
    properties = {
        fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
        fm["job_title"]: {"rich_text": [{"text": {"content": str(job.get("title", "Unknown"))}}]},
        fm["company"]: {"rich_text": [{"text": {"content": str(job.get("company", "") or "")}}]},
        fm["url"]: {"url": job_url or None},
        fm["source"]: {"multi_select": [{"name": str(job.get("site", "unknown")).title()}]},
        fm["status"]: {"select": {"name": fm["status_new"]}},
        fm["remote"]: {"checkbox": bool(job.get("is_remote", False))},
        fm["date_found"]: {"date": {"start": datetime.now().isoformat()[:10]}},
    }
    notion.pages.create(parent={"database_id": db_id}, properties=properties)
def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False, config_dir: Path | None = None) -> int:
    """Run one discovery pass: scrape configured boards and stage new jobs.

    For every profile in search_profiles.yaml, scrapes the configured JobSpy
    boards and custom scrapers per location, dedups against the SQLite DB
    (by URL and by title/company), applies the global blocklist plus
    per-profile exclude keywords, and inserts surviving listings. With
    ``notion_push=True`` each new JobSpy listing is also pushed to Notion.

    Args:
        db_path: SQLite staging database path.
        notion_push: Mirror new listings to Notion when True.
        config_dir: Per-user config directory; defaults to one derived from
            db_path, falling back to the app-level CONFIG_DIR.

    Returns:
        Number of newly staged listings. (Annotation fixed: the function has
        always returned ``new_count`` but was declared ``-> None``.)
    """
    # In cloud mode, config_dir is the per-user config directory derived from db_path.
    # Falls back to the app-level /app/config for single-tenant deployments.
    resolved_cfg = config_dir or Path(db_path).parent / "config"
    if not resolved_cfg.exists():
        resolved_cfg = CONFIG_DIR
    profiles_cfg, notion_cfg = load_config(resolved_cfg)
    fm = notion_cfg.get("field_map") or {}
    blocklist = load_blocklist(resolved_cfg)
    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")
    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)
    import sqlite3 as _sqlite3
    _conn = _sqlite3.connect(db_path)
    # Titles truncated to 80 chars so boards that append suffixes still dedup.
    existing_tc = {
        (r[0].lower().strip()[:80], r[1].lower().strip())
        for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
    }
    _conn.close()
    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])
    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False
        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False
        title_lower = job_row.get("title", "").lower()
        desc_lower = job_row.get("description", "").lower()
        exclude_kw = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False
        tc_key = (title_lower[:80], job_row.get("company", "").lower().strip())
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)
        insert_job(db_path, {
            "title": job_row.get("title", ""),
            "company": job_row.get("company", ""),
            "url": url,
            "source": job_row.get("source", source_label),
            "location": job_row.get("location", ""),
            "is_remote": bool(job_row.get("is_remote", False)),
            "salary": job_row.get("salary", ""),
            "description": job_row.get("description", ""),
            # Fix: forward date_posted so the shadow-listing staleness score
            # has data — the JobSpy path sets it on the row (see below) but it
            # was previously dropped here. insert_job()/the DB migration
            # accept this key per the shadow-listing feature.
            "date_posted": job_row.get("date_posted", ""),
            "date_found": datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards", [])
        custom_boards = profile.get("custom_boards", [])
        exclude_kw = [kw.lower() for kw in profile.get("exclude_keywords", [])]
        results_per_board = profile.get("results_per_board", 25)
        # Map remote_preference → JobSpy is_remote param:
        # 'remote' → True (remote-only listings)
        # 'onsite' → False (on-site-only listings)
        # 'both' → None (no filter — JobSpy default)
        _rp = profile.get("remote_preference", "both")
        _is_remote: bool | None = True if _rp == "remote" else (False if _rp == "onsite" else None)
        # When filtering for remote-only, also drop hybrid roles at the description level.
        # Job boards (especially LinkedIn) tag hybrid listings as is_remote=True, so the
        # board-side filter alone is not reliable. We match specific work-arrangement
        # phrases to avoid false positives like "hybrid cloud" or "hybrid architecture".
        _HYBRID_PHRASES = [
            "hybrid role", "hybrid position", "hybrid work", "hybrid schedule",
            "hybrid model", "hybrid arrangement", "hybrid opportunity",
            "in-office/remote", "in office/remote", "remote/in-office",
            "remote/office", "office/remote",
            "days in office", "days per week in", "days onsite", "days on-site",
            "required to be in office", "required in office",
        ]
        if _rp == "remote":
            exclude_kw = exclude_kw + _HYBRID_PHRASES
        for location in profile["locations"]:
            # ── JobSpy boards ──────────────────────────────────────────────────
            if boards:
                # Validate boards against the installed JobSpy Site enum.
                # One unsupported name in the list aborts the entire scrape_jobs() call.
                try:
                    from jobspy import Site as _Site
                    _valid = {s.value for s in _Site}
                    _filtered = [b for b in boards if b in _valid]
                    _dropped = [b for b in boards if b not in _valid]
                    if _dropped:
                        print(f" [jobspy] Skipping unsupported boards: {', '.join(_dropped)}")
                except ImportError:
                    _filtered = boards  # fallback: pass through unchanged
                if not _filtered:
                    print(f" [jobspy] No valid boards for {location} — skipping")
                    continue
                print(f" [jobspy] {location} — boards: {', '.join(_filtered)}")
                try:
                    jobspy_kwargs: dict = dict(
                        site_name=_filtered,
                        search_term=" OR ".join(f'"{t}"' for t in (profile.get("titles") or profile.get("job_titles", []))),
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    # Only pass is_remote when a preference was set; omitting
                    # it entirely is JobSpy's "no filter" behavior.
                    if _is_remote is not None:
                        jobspy_kwargs["is_remote"] = _is_remote
                    jobs: pd.DataFrame = scrape_jobs(**jobspy_kwargs)
                    print(f" [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    print(f" [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()
                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue
                    job_dict = job.to_dict()
                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])
                    # date_posted may be a date object or a string depending on
                    # the board; normalize to ISO string (empty when absent).
                    _dp = job_dict.get("date_posted")
                    date_posted_str = (
                        _dp.isoformat() if hasattr(_dp, "isoformat") else str(_dp)
                    ) if _dp and str(_dp) not in ("nan", "None", "") else ""
                    row = {
                        "url": url,
                        "title": _s(job_dict.get("title")),
                        "company": _s(job_dict.get("company")),
                        "source": _s(job_dict.get("site")),
                        "location": _s(job_dict.get("location")),
                        "is_remote": bool(job_dict.get("is_remote", False)),
                        "salary": salary_str,
                        "description": _s(job_dict.get("description")),
                        "date_posted": date_posted_str,
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f" + {row['title']} @ {row['company']} [{row['source']}]")
                print(f" [jobspy] {jobspy_new} new listings from {location}")
            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f" [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue
                print(f" [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    print(f" [{board_name}] ERROR: {exc}")
                    custom_jobs = []
                print(f" [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    # Custom scrapers return ready-made row dicts; just attach
                    # the profile's exclude keywords for _insert_if_new.
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f" + {job.get('title')} @ {job.get('company')} [{board_name}]")
                print(f" [{board_name}] {board_new} new listings from {location}")
    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count
# Script entry point: `conda run -n job-seeker python scripts/discover.py`
if __name__ == "__main__":
    run_discovery()