Shadow listing detector (#95):
- Capture date_posted from JobSpy in discover.py + insert_job()
- Add date_posted migration to _MIGRATIONS
- _shadow_score() heuristic: 'shadow' (≥30 days stale), 'stale' (≥14 days)
- list_jobs() computes shadow_score per listing
- JobCard.vue: 'Ghost post' and 'Stale' badges with tooltip

Post-hire feedback widget (#91):
- Add hired_feedback migration to _MIGRATIONS
- POST /api/jobs/:id/hired-feedback endpoint
- InterviewCard.vue: optional widget on hired cards with factor checkboxes + freetext; dismissible; shows saved state
- PipelineJob interface extended with hired_feedback field

Contacts manager (#73):
- GET /api/contacts endpoint with job join, direction/search filters
- New ContactsView.vue: searchable table, inbound/outbound filter, signal chip column, job link
- Route /contacts added; Contacts nav link (UsersIcon) in AppNav

Also: add git to Dockerfile apt-get for circuitforge-core editable install
365 lines
16 KiB
Python
365 lines
16 KiB
Python
# scripts/discover.py
|
||
"""
|
||
JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True).
|
||
|
||
Usage:
|
||
conda run -n job-seeker python scripts/discover.py
|
||
"""
|
||
import sys
|
||
from pathlib import Path
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
||
import yaml
|
||
from datetime import datetime
|
||
|
||
import pandas as pd
|
||
from jobspy import scrape_jobs
|
||
from notion_client import Client
|
||
|
||
from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
|
||
from scripts.custom_boards import adzuna as _adzuna
|
||
from scripts.custom_boards import theladders as _theladders
|
||
from scripts.custom_boards import craigslist as _craigslist
|
||
|
||
# Default (app-level) configuration directory: <repo root>/config, resolved
# relative to this script, plus the YAML files read from it.
CONFIG_DIR = Path(__file__).parent.parent.joinpath("config")
NOTION_CFG = CONFIG_DIR.joinpath("notion.yaml")
PROFILES_CFG = CONFIG_DIR.joinpath("search_profiles.yaml")
BLOCKLIST_CFG = CONFIG_DIR.joinpath("blocklist.yaml")
|
||
|
||
# Custom board scrapers, looked up by the board name that appears under
# `custom_boards` in search_profiles.yaml. Each value is the module's
# `scrape` callable.
CUSTOM_SCRAPERS: dict[str, object] = dict(
    adzuna=_adzuna.scrape,
    theladders=_theladders.scrape,
    craigslist=_craigslist.scrape,
)
|
||
|
||
|
||
def _normalize_profiles(raw: dict) -> dict:
    """Normalize search_profiles.yaml to the canonical {profiles: [...]} format.

    The onboarding wizard (pre-fix) wrote a flat `default: {...}` structure.
    Canonical format is `profiles: [{name, titles/job_titles, boards, ...}]`.
    This converts on load so both formats work without a migration.

    Args:
        raw: Parsed YAML mapping, in either the canonical or the wizard format.

    Returns:
        ``raw`` itself when it already has a ``profiles`` key; otherwise a new
        ``{"profiles": [...]}`` dict. The input mapping is never modified.
    """
    if "profiles" in raw:
        return raw

    # Wizard-written format: top-level keys are profile names (usually "default")
    profiles = []
    for name, body in raw.items():
        if not isinstance(body, dict):
            continue
        # Work on a shallow copy so the caller's parsed config is left intact.
        body = dict(body)

        # job_boards: [{name, enabled}] → boards: [name] (enabled only)
        job_boards = body.pop("job_boards", None)
        if job_boards and "boards" not in body:
            boards = []
            for entry in job_boards:
                if isinstance(entry, str):
                    # Tolerate a bare board name instead of a {name, enabled} mapping.
                    boards.append(entry)
                elif isinstance(entry, dict) and entry.get("name") and entry.get("enabled", True):
                    boards.append(entry["name"])
            body["boards"] = boards

        # blocklist_* keys live in load_blocklist, not per-profile — drop them
        for key in ("blocklist_companies", "blocklist_industries", "blocklist_locations"):
            body.pop(key, None)

        profiles.append({"name": name, **body})
    return {"profiles": profiles}
|
||
|
||
|
||
def load_config(config_dir: Path | None = None) -> tuple[dict, dict]:
    """Load the search profiles and the Notion settings from a config directory.

    Args:
        config_dir: Directory holding ``search_profiles.yaml`` and (optionally)
            ``notion.yaml``. Falls back to the module-level ``CONFIG_DIR``.

    Returns:
        ``(profiles, notion_cfg)`` where ``profiles`` is in the canonical
        ``{"profiles": [...]}`` shape and ``notion_cfg`` is the parsed
        ``notion.yaml``, or a placeholder with empty ``field_map`` and no
        ``token`` / ``database_id`` when that file is missing or empty.

    Raises:
        OSError: If ``search_profiles.yaml`` cannot be read.
    """
    cfg = config_dir or CONFIG_DIR

    raw = yaml.safe_load((cfg / "search_profiles.yaml").read_text()) or {}
    profiles = _normalize_profiles(raw)

    notion_path = cfg / "notion.yaml"
    notion_cfg = None
    if notion_path.exists():
        notion_cfg = yaml.safe_load(notion_path.read_text())
    # safe_load returns None for an empty document; callers use notion_cfg.get(),
    # so fall back to the same placeholder used when the file is absent.
    if not notion_cfg:
        notion_cfg = {"field_map": {}, "token": None, "database_id": None}
    return profiles, notion_cfg
|
||
|
||
|
||
def load_blocklist(config_dir: Path | None = None) -> dict:
|
||
"""Load global blocklist config. Returns dict with companies, industries, locations lists."""
|
||
blocklist_path = (config_dir or CONFIG_DIR) / "blocklist.yaml"
|
||
if not blocklist_path.exists():
|
||
return {"companies": [], "industries": [], "locations": []}
|
||
raw = yaml.safe_load(blocklist_path.read_text()) or {}
|
||
return {
|
||
"companies": [c.lower() for c in raw.get("companies", []) if c],
|
||
"industries": [i.lower() for i in raw.get("industries", []) if i],
|
||
"locations": [loc.lower() for loc in raw.get("locations", []) if loc],
|
||
}
|
||
|
||
|
||
def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
    """Return True if this job matches any global blocklist rule.

    Matching is a case-insensitive substring test:
      * ``companies`` terms against the company name,
      * ``industries`` terms against company name + description,
      * ``locations`` terms against the location.

    Args:
        job_row: Job dict; ``company``, ``location`` and ``description`` are
            read. Missing, None or non-string values count as empty text.
        blocklist: Dict with optional ``companies``, ``industries`` and
            ``locations`` lists of terms.

    Returns:
        True when at least one rule matches, otherwise False.
    """
    def _field(key: str) -> str:
        value = job_row.get(key)
        # Non-string values (None, NaN from pandas, ...) carry no matchable text.
        return value.lower() if isinstance(value, str) else ""

    def _terms(key: str) -> list[str]:
        # Skip blank terms: "" is a substring of every string and would
        # blocklist every listing.
        return [
            term.lower()
            for term in (blocklist.get(key) or [])
            if isinstance(term, str) and term.strip()
        ]

    company_lower = _field("company")
    location_lower = _field("location")
    content_lower = f"{company_lower} {_field('description')}"

    if any(term in company_lower for term in _terms("companies")):
        return True
    if any(term in content_lower for term in _terms("industries")):
        return True
    return any(term in location_lower for term in _terms("locations"))
|
||
|
||
|
||
def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Collect every job URL already stored in the Notion database (notion_push mode).

    Queries the database 100 pages at a time, passing the returned
    ``next_cursor`` back as ``start_cursor`` for as long as the response
    reports ``has_more``.

    Args:
        notion: Notion client used to run the database queries.
        db_id: ID of the Notion jobs database.
        url_field: Name of the page property whose ``url`` value is collected.

    Returns:
        Set of the non-empty URL values found.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        query: dict = {"database_id": db_id, "page_size": 100}
        if cursor:
            query["start_cursor"] = cursor
        resp = notion.databases.query(**query)

        seen.update(
            link
            for page in resp["results"]
            if (link := page["properties"].get(url_field, {}).get("url"))
        )

        if not resp.get("has_more", False):
            break
        cursor = resp.get("next_cursor")
    return seen
|
||
|
||
|
||
def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing.

    The page title is the salary range when both amounts are known, else the
    listing's ``salary_source`` text, else the job title.

    Args:
        notion: Notion client used to create the page.
        db_id: ID of the Notion jobs database.
        job: Listing dict. It may come straight from a pandas row, so missing
            values can be None or NaN; those are treated as absent rather than
            written out as the text "nan".
        fm: Field map from logical names (``title_field``, ``job_title``,
            ``company``, ``url``, ``source``, ``status``, ``status_new``,
            ``remote``, ``date_found``) to Notion property names/values.

    Raises:
        KeyError: If ``fm`` lacks one of the keys listed above.
    """
    def _text(key: str, default: str = "") -> str:
        """Return job[key] as text, mapping None/NaN-like values to default."""
        value = job.get(key)
        if value is None:
            return default
        text = str(value)
        return default if text in ("", "nan", "NaN", "None", "NaT", "<NA>") else text

    job_title = _text("title", "Unknown")

    min_amt = job.get("min_amount")
    max_amt = job.get("max_amount")
    # Check for missing values first: NaN is truthy and would otherwise reach int().
    if pd.notna(min_amt) and pd.notna(max_amt) and min_amt and max_amt:
        title_content = f"${int(min_amt):,} – ${int(max_amt):,}"
    else:
        title_content = _text("salary_source") or job_title

    job_url = _text("job_url")

    # bool(NaN) is True, so an unknown remote flag must be filtered explicitly.
    remote_flag = job.get("is_remote")
    is_remote = bool(pd.notna(remote_flag) and remote_flag)

    notion.pages.create(
        parent={"database_id": db_id},
        properties={
            fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
            fm["job_title"]: {"rich_text": [{"text": {"content": job_title}}]},
            fm["company"]: {"rich_text": [{"text": {"content": _text("company")}}]},
            fm["url"]: {"url": job_url or None},
            fm["source"]: {"multi_select": [{"name": _text("site", "unknown").title()}]},
            fm["status"]: {"select": {"name": fm["status_new"]}},
            fm["remote"]: {"checkbox": is_remote},
            fm["date_found"]: {"date": {"start": datetime.now().date().isoformat()}},
        },
    )
|
||
|
||
|
||
# Work-arrangement phrases used to drop hybrid roles for remote-only profiles.
# Job boards (especially LinkedIn) tag hybrid listings as is_remote=True, so the
# board-side filter alone is not reliable. We match specific work-arrangement
# phrases to avoid false positives like "hybrid cloud" or "hybrid architecture".
_HYBRID_PHRASES = [
    "hybrid role", "hybrid position", "hybrid work", "hybrid schedule",
    "hybrid model", "hybrid arrangement", "hybrid opportunity",
    "in-office/remote", "in office/remote", "remote/in-office",
    "remote/office", "office/remote",
    "days in office", "days per week in", "days onsite", "days on-site",
    "required to be in office", "required in office",
]


def _clean_str(val, default: str = "") -> str:
    """Convert a value to str, treating None and pandas missing markers as default."""
    if val is None:
        return default
    s = str(val)
    return default if s in ("nan", "None", "NaN", "NaT", "<NA>") else s


def _title_company_key(title, company) -> tuple[str, str]:
    """Build the (title, company) dedup key; None/NULL values count as empty."""
    return ((title or "").lower().strip()[:80], (company or "").lower().strip())


def _existing_title_company_keys(db_path) -> set[tuple[str, str]]:
    """Return the dedup keys of every job already stored in the SQLite DB."""
    import sqlite3
    from contextlib import closing

    # closing() guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute("SELECT title, company FROM jobs").fetchall()
    return {_title_company_key(title, company) for title, company in rows}


def _supported_jobspy_boards(boards: list[str]) -> list[str]:
    """Return the boards known to the installed JobSpy, logging any dropped ones.

    One unsupported name in the list aborts the entire scrape_jobs() call, so
    unknown names are removed up front. If this JobSpy version exposes no
    ``Site`` enum the list is passed through unchanged.
    """
    try:
        from jobspy import Site as _Site
    except ImportError:
        return boards  # fallback: pass through unchanged
    valid = {s.value for s in _Site}
    dropped = [b for b in boards if b not in valid]
    if dropped:
        print(f" [jobspy] Skipping unsupported boards: {', '.join(dropped)}")
    return [b for b in boards if b in valid]


def _jobspy_row(job_dict: dict, url: str, exclude_kw: list[str]) -> dict:
    """Map one raw JobSpy result (a DataFrame row as dict) to a staging row."""
    # Build salary string from JobSpy numeric fields. Missing values are tested
    # first because NaN is truthy and must not reach int().
    min_amt = job_dict.get("min_amount")
    max_amt = job_dict.get("max_amount")
    if pd.notna(min_amt) and pd.notna(max_amt) and min_amt and max_amt:
        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
    else:
        salary_str = _clean_str(job_dict.get("salary_source"))

    posted = job_dict.get("date_posted")
    if posted is None or pd.isna(posted):
        # Covers None, NaN and NaT (NaT.isoformat() would otherwise yield "NaT").
        date_posted_str = ""
    elif hasattr(posted, "isoformat"):
        date_posted_str = posted.isoformat()
    else:
        date_posted_str = _clean_str(posted)

    # bool(NaN) is True, so an unknown remote flag must be filtered explicitly.
    remote_flag = job_dict.get("is_remote")

    return {
        "url": url,
        "title": _clean_str(job_dict.get("title")),
        "company": _clean_str(job_dict.get("company")),
        "source": _clean_str(job_dict.get("site")),
        "location": _clean_str(job_dict.get("location")),
        "is_remote": bool(pd.notna(remote_flag) and remote_flag),
        "salary": salary_str,
        "description": _clean_str(job_dict.get("description")),
        "date_posted": date_posted_str,
        "_exclude_kw": exclude_kw,
    }


def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False, config_dir: Path | None = None) -> int:
    """Scrape every configured search profile and stage new listings in SQLite.

    For each profile and location this runs the JobSpy boards and then the
    custom board scrapers. A listing is inserted only if its URL and its
    (title, company) pair are unseen and it passes the global blocklist and
    the profile's exclude keywords.

    Args:
        db_path: SQLite staging database.
        notion_push: When True, also dedup against Notion and push each new
            JobSpy listing to it.
        config_dir: Config directory override. Defaults to ``config/`` next to
            ``db_path``, then to the app-level ``CONFIG_DIR``.

    Returns:
        Number of new listings staged.
    """
    # In cloud mode, config_dir is the per-user config directory derived from db_path.
    # Falls back to the app-level /app/config for single-tenant deployments.
    resolved_cfg = config_dir or Path(db_path).parent / "config"
    if not resolved_cfg.exists():
        resolved_cfg = CONFIG_DIR
    profiles_cfg, notion_cfg = load_config(resolved_cfg)
    fm = notion_cfg.get("field_map") or {}
    blocklist = load_blocklist(resolved_cfg)

    bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if bl_summary:
        print(f"[discover] Blocklist active: {bl_summary}")

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = set(db_existing_urls(db_path))
    existing_tc = _existing_title_company_keys(db_path)

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])

    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False

        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False

        # Custom scrapers may return None for missing fields.
        title = job_row.get("title") or ""
        company = job_row.get("company") or ""
        description = job_row.get("description") or ""

        title_lower = title.lower()
        desc_lower = description.lower()
        exclude_terms = job_row.get("_exclude_kw") or []
        if any(kw in title_lower or kw in desc_lower for kw in exclude_terms):
            return False

        # Same key construction as for rows already in the DB, so reposts match.
        tc_key = _title_company_key(title, company)
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)

        insert_job(db_path, {
            "title": title,
            "company": company,
            "url": url,
            "source": job_row.get("source") or source_label,
            "location": job_row.get("location") or "",
            "is_remote": bool(job_row.get("is_remote", False)),
            "salary": job_row.get("salary") or "",
            "description": description,
            # Forward the posting date captured from the board.
            # NOTE(review): assumes insert_job() accepts a date_posted key — confirm in scripts/db.py.
            "date_posted": job_row.get("date_posted") or "",
            "date_found": datetime.now().date().isoformat(),
        })
        existing_urls.add(url)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards") or []
        custom_boards = profile.get("custom_boards") or []
        exclude_kw = [kw.lower() for kw in (profile.get("exclude_keywords") or [])]
        results_per_board = profile.get("results_per_board", 25)
        titles = profile.get("titles") or profile.get("job_titles") or []
        search_term = " OR ".join(f'"{t}"' for t in titles)

        # Map remote_preference → JobSpy is_remote param:
        #   'remote' → True  (remote-only listings)
        #   'onsite' → False (on-site-only listings)
        #   'both'   → None  (no filter — JobSpy default)
        _rp = profile.get("remote_preference", "both")
        _is_remote: bool | None = True if _rp == "remote" else (False if _rp == "onsite" else None)

        # When filtering for remote-only, also drop hybrid roles at the description level.
        if _rp == "remote":
            exclude_kw = exclude_kw + _HYBRID_PHRASES

        for location in profile["locations"]:

            # ── JobSpy boards ──────────────────────────────────────────────────
            filtered = _supported_jobspy_boards(boards) if boards else []
            if boards and not filtered:
                # Only the JobSpy step is skipped; custom boards below still run.
                print(f" [jobspy] No valid boards for {location} — skipping")
            elif filtered:
                print(f" [jobspy] {location} — boards: {', '.join(filtered)}")
                try:
                    jobspy_kwargs: dict = dict(
                        site_name=filtered,
                        search_term=search_term,
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    if _is_remote is not None:
                        jobspy_kwargs["is_remote"] = _is_remote
                    jobs: pd.DataFrame = scrape_jobs(**jobspy_kwargs)
                    print(f" [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    # Deliberate best-effort: one failing scrape must not abort
                    # the remaining locations and boards.
                    print(f" [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()

                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = _clean_str(job.get("job_url"))
                    if not url:
                        continue

                    job_dict = job.to_dict()
                    row = _jobspy_row(job_dict, url, exclude_kw)
                    if _insert_if_new(row, _clean_str(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f" + {row['title']} @ {row['company']} [{row['source']}]")

                print(f" [jobspy] {jobspy_new} new listings from {location}")

            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f" [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue

                print(f" [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board) or []
                except Exception as exc:
                    # Deliberate best-effort, as for JobSpy above.
                    print(f" [{board_name}] ERROR: {exc}")
                    custom_jobs = []

                print(f" [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f" + {job.get('title')} @ {job.get('company')} [{board_name}]")

                print(f" [{board_name}] {board_new} new listings from {location}")

    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count
|
||
|
||
|
||
# Script entry point: run discovery with the default DB and config locations.
if __name__ == "__main__":
    run_discovery()
|