# peregrine/scripts/discover.py

# scripts/discover.py
"""
JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True).

Usage:
    conda run -n job-seeker python scripts/discover.py
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import yaml
from datetime import datetime

import pandas as pd
from jobspy import scrape_jobs
from notion_client import Client

from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
from scripts.custom_boards import adzuna as _adzuna
from scripts.custom_boards import theladders as _theladders
from scripts.custom_boards import craigslist as _craigslist

# Default configuration directory: the `config` folder two levels up from
# this file (i.e. a sibling of the directory that contains this script).
CONFIG_DIR = Path(__file__).parent.parent / "config"
# Default locations of the individual YAML files inside CONFIG_DIR.
# NOTE(review): the loader functions in this module rebuild these paths from
# their own `config_dir` argument instead of reading these constants —
# presumably they are kept for external importers; confirm before removing.
NOTION_CFG = CONFIG_DIR / "notion.yaml"
PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml"
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"

# Registry of custom board scrapers keyed by name used in search_profiles.yaml
# (the profile's `custom_boards` list). run_discovery calls each value as
# `scraper(profile, location, results_wanted=N)` and iterates the result as a
# sequence of job dicts.
CUSTOM_SCRAPERS: dict[str, object] = {
    "adzuna": _adzuna.scrape,
    "theladders": _theladders.scrape,
    "craigslist": _craigslist.scrape,
}


def _normalize_profiles(raw: dict) -> dict:
    """Normalize search_profiles.yaml to the canonical {profiles: [...]} format.

    The onboarding wizard (pre-fix) wrote a flat `default: {...}` structure.
    Canonical format is `profiles: [{name, titles/job_titles, boards, ...}]`.
    This converts on load so both formats work without a migration.

    Args:
        raw: Parsed YAML mapping, in either the canonical or the wizard format.

    Returns:
        `raw` itself when it already has a `profiles` key; otherwise a new
        `{"profiles": [...]}` dict with one entry per top-level key whose
        value is a mapping (other top-level values are ignored).  The input
        mapping and its nested profile dicts are never modified.
    """
    if "profiles" in raw:
        return raw

    # Keys that are consumed by the conversion instead of being copied over.
    # blocklist_* keys live in load_blocklist, not per-profile — drop them.
    consumed = {
        "job_boards",
        "blocklist_companies",
        "blocklist_industries",
        "blocklist_locations",
    }

    # Wizard-written format: top-level keys are profile names (usually "default")
    profiles = []
    for name, body in raw.items():
        if not isinstance(body, dict):
            continue
        # Work on a copy so the caller's parsed YAML is left untouched.
        profile = {key: value for key, value in body.items() if key not in consumed}
        # job_boards: [{name, enabled}] → boards: [name] (enabled only).
        # An explicit `boards` list in the profile takes precedence.
        job_boards = body.get("job_boards")
        if job_boards and "boards" not in profile:
            profile["boards"] = [b["name"] for b in job_boards if b.get("enabled", True)]
        # A `name` key inside the body overrides the top-level key.
        profiles.append({"name": name, **profile})
    return {"profiles": profiles}


def load_config(config_dir: Path | None = None) -> tuple[dict, dict]:
    """Load the search profiles and the Notion settings from a config directory.

    Args:
        config_dir: Directory containing `search_profiles.yaml` and, optionally,
            `notion.yaml`.  Defaults to the module-level `CONFIG_DIR`.

    Returns:
        A `(profiles, notion_cfg)` tuple.  `profiles` is the search-profile
        config in canonical `{"profiles": [...]}` form.  `notion_cfg` is the
        parsed `notion.yaml`, or a placeholder mapping with an empty
        `field_map` and `None` credentials when that file is missing or empty.

    Raises:
        OSError: If `search_profiles.yaml` cannot be read (e.g. it is missing).
    """
    cfg = config_dir or CONFIG_DIR
    profiles_path = cfg / "search_profiles.yaml"
    notion_path = cfg / "notion.yaml"

    # Read explicitly as UTF-8 so parsing does not depend on the platform locale.
    raw = yaml.safe_load(profiles_path.read_text(encoding="utf-8")) or {}
    profiles = _normalize_profiles(raw)

    notion_cfg = None
    if notion_path.exists():
        notion_cfg = yaml.safe_load(notion_path.read_text(encoding="utf-8"))
    if notion_cfg is None:
        # safe_load returns None for an empty document; treat that like a
        # missing file so callers can always rely on a mapping.
        notion_cfg = {"field_map": {}, "token": None, "database_id": None}
    return profiles, notion_cfg


def load_blocklist(config_dir: Path | None = None) -> dict:
    """Load global blocklist config.

    Args:
        config_dir: Directory containing `blocklist.yaml`.  Defaults to the
            module-level `CONFIG_DIR`.

    Returns:
        A dict with `companies`, `industries` and `locations` keys, each a
        list of lower-cased strings.  All three lists are empty when the file
        does not exist; a key that is absent or has no value in the file
        yields an empty list.
    """
    blocklist_path = (config_dir or CONFIG_DIR) / "blocklist.yaml"
    if not blocklist_path.exists():
        return {"companies": [], "industries": [], "locations": []}
    # Read explicitly as UTF-8 so parsing does not depend on the platform locale.
    raw = yaml.safe_load(blocklist_path.read_text(encoding="utf-8")) or {}
    return {
        # `or []` covers a key written with no value (parsed as None);
        # str() lets non-string scalars (e.g. a numeric company name) through.
        # Falsy entries are skipped, as an empty pattern would match every job.
        section: [str(entry).lower() for entry in (raw.get(section) or []) if entry]
        for section in ("companies", "industries", "locations")
    }


def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
    """Tell whether a job is rejected by the global blocklist.

    Matching is a case-insensitive substring test against lower-cased
    blocklist entries:
      * `companies`  — searched in the job's company
      * `industries` — searched in the company plus the description
      * `locations`  — searched in the job's location

    Missing or empty job fields are treated as empty text.
    """
    def lowered(field: str) -> str:
        return (job_row.get(field) or "").lower()

    company = lowered("company")
    location = lowered("location")
    description = lowered("description")

    # (text to search, blocklist section) pairs, evaluated in this order.
    checks = (
        (company, "companies"),
        (f"{company} {description}", "industries"),
        (location, "locations"),
    )
    for haystack, section in checks:
        for needle in blocklist[section]:
            if needle in haystack:
                return True
    return False


def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Collect every job URL already stored in the Notion database (notion_push mode).

    Pages through `notion.databases.query` 100 results at a time, following
    `next_cursor` for as long as the response reports `has_more`, and gathers
    the non-empty `url` value of the `url_field` property from each page.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        query_args: dict = {"database_id": db_id, "page_size": 100}
        if cursor:
            query_args["start_cursor"] = cursor
        batch = notion.databases.query(**query_args)

        page_urls = (
            page["properties"].get(url_field, {}).get("url")
            for page in batch["results"]
        )
        seen.update(link for link in page_urls if link)

        if not batch.get("has_more", False):
            break
        cursor = batch.get("next_cursor")
    return seen


def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing.

    Args:
        notion: Notion client; only `notion.pages.create` is used.
        db_id: ID of the target Notion database.
        job: One listing as a dict (run_discovery passes a JobSpy DataFrame row
            converted with `to_dict()`).  Keys read: `min_amount`,
            `max_amount`, `salary_source`, `title`, `company`, `job_url`,
            `site`, `is_remote`.  Missing values may be None or NaN.
        fm: Field map from logical names (`title_field`, `job_title`,
            `company`, `url`, `source`, `status`, `remote`, `date_found`) to
            Notion property names, plus `status_new` for the initial status.

    Raises:
        KeyError: If `fm` lacks one of the keys listed above.
    """
    def _text(value, default: str = "") -> str:
        # DataFrame rows carry NaN/None for absent cells; never let their
        # string forms ("nan", "None", ...) end up as page content.
        if value is None:
            return default
        text = str(value)
        return default if text in ("nan", "NaN", "None", "<NA>", "NaT") else text

    job_title = _text(job.get("title"), "Unknown")
    salary_source = _text(job.get("salary_source") or None)

    # Page title preference: numeric salary range, then the salary source
    # text, then the job title.
    min_amt = job.get("min_amount")
    max_amt = job.get("max_amount")
    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
        title_content = f"${int(min_amt):,} – ${int(max_amt):,}"
    elif salary_source:
        title_content = salary_source
    else:
        title_content = job_title

    job_url = _text(job.get("job_url") or None)

    # bool(NaN) is True, so a missing is_remote must be mapped to False explicitly.
    raw_remote = job.get("is_remote")
    is_remote = bool(raw_remote) if _text(raw_remote) else False

    notion.pages.create(
        parent={"database_id": db_id},
        properties={
            fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
            fm["job_title"]:   {"rich_text": [{"text": {"content": job_title}}]},
            fm["company"]:     {"rich_text": [{"text": {"content": _text(job.get("company"))}}]},
            fm["url"]:         {"url": job_url or None},
            fm["source"]:      {"multi_select": [{"name": _text(job.get("site"), "unknown").title()}]},
            fm["status"]:      {"select": {"name": fm["status_new"]}},
            fm["remote"]:      {"checkbox": is_remote},
            # Date only (YYYY-MM-DD), local time.
            fm["date_found"]:  {"date": {"start": datetime.now().isoformat()[:10]}},
        },
    )


def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False, config_dir: Path | None = None) -> int:
    """Scrape every configured search profile and stage new listings in SQLite.

    For each profile and each of its locations, listings are fetched from the
    profile's JobSpy boards and from its registered custom scrapers.  A listing
    is inserted via `insert_job` only if its URL is not already known, it does
    not match the global blocklist or the profile's exclude keywords, and no
    listing with the same (title, company) has been seen.

    Args:
        db_path: SQLite staging database to dedup against and insert into.
        notion_push: When True, also dedup against URLs already in Notion and
            create a Notion page for each newly inserted JobSpy listing
            (listings from custom boards are staged in SQLite only).
        config_dir: Directory holding the YAML config files.  Defaults to
            `<db_path parent>/config`, falling back to `CONFIG_DIR` when the
            chosen directory does not exist.

    Returns:
        The number of listings inserted during this run.
    """
    # In cloud mode, config_dir is the per-user config directory derived from db_path.
    # Falls back to the app-level /app/config for single-tenant deployments.
    resolved_cfg = config_dir or Path(db_path).parent / "config"
    if not resolved_cfg.exists():
        resolved_cfg = CONFIG_DIR
    profiles_cfg, notion_cfg = load_config(resolved_cfg)
    notion_cfg = notion_cfg or {}  # an empty notion.yaml may parse to None
    fm = notion_cfg.get("field_map") or {}
    blocklist = load_blocklist(resolved_cfg)

    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)

    import sqlite3 as _sqlite3
    _conn = _sqlite3.connect(db_path)
    try:
        # `or ""` guards against NULL title/company columns.
        existing_tc = {
            ((r[0] or "").lower().strip()[:80], (r[1] or "").lower().strip())
            for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
        }
    finally:
        _conn.close()

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])

    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _flag(val) -> bool:
        """Convert a value to bool, treating pandas NaN/None as False (bool(NaN) is True)."""
        return bool(val) if _s(val) else False

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False

        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False

        # `or ""` tolerates scrapers that return None for a missing field.
        title_lower = (job_row.get("title") or "").lower()
        desc_lower  = (job_row.get("description") or "").lower()
        exclude_kw  = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False

        # Build the key exactly like existing_tc above (strip, then truncate)
        # so a title with stray whitespace still matches its stored twin.
        tc_key = (title_lower.strip()[:80], (job_row.get("company") or "").lower().strip())
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)

        # NOTE(review): JobSpy rows carry a "date_posted" value that is not
        # forwarded here — confirm whether insert_job should receive it.
        insert_job(db_path, {
            "title":       job_row.get("title", ""),
            "company":     job_row.get("company", ""),
            "url":         url,
            "source":      job_row.get("source", source_label),
            "location":    job_row.get("location", ""),
            "is_remote":   _flag(job_row.get("is_remote", False)),
            "salary":      job_row.get("salary", ""),
            "description": job_row.get("description", ""),
            "date_found":  datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        return True

    # Board names accepted by the installed JobSpy Site enum, or None when the
    # enum cannot be imported (boards are then passed through unchanged).
    try:
        from jobspy import Site as _Site
        _valid = {s.value for s in _Site}
    except ImportError:
        _valid = None

    # When filtering for remote-only, also drop hybrid roles at the description level.
    # Job boards (especially LinkedIn) tag hybrid listings as is_remote=True, so the
    # board-side filter alone is not reliable.  We match specific work-arrangement
    # phrases to avoid false positives like "hybrid cloud" or "hybrid architecture".
    _HYBRID_PHRASES = [
        "hybrid role", "hybrid position", "hybrid work", "hybrid schedule",
        "hybrid model", "hybrid arrangement", "hybrid opportunity",
        "in-office/remote", "in office/remote", "remote/in-office",
        "remote/office", "office/remote",
        "days in office", "days per week in", "days onsite", "days on-site",
        "required to be in office", "required in office",
    ]

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        # `or []` also covers keys that are present in the YAML with no value.
        boards = profile.get("boards") or []
        custom_boards = profile.get("custom_boards") or []
        exclude_kw = [kw.lower() for kw in (profile.get("exclude_keywords") or [])]
        results_per_board = profile.get("results_per_board", 25)
        titles = profile.get("titles") or profile.get("job_titles") or []
        search_term = " OR ".join(f'"{t}"' for t in titles)

        # Map remote_preference → JobSpy is_remote param:
        #   'remote'  → True  (remote-only listings)
        #   'onsite'  → False (on-site-only listings)
        #   'both'    → None  (no filter — JobSpy default)
        _rp = profile.get("remote_preference", "both")
        _is_remote: bool | None = True if _rp == "remote" else (False if _rp == "onsite" else None)

        if _rp == "remote":
            exclude_kw = exclude_kw + _HYBRID_PHRASES

        for location in profile["locations"]:

            # ── JobSpy boards ──────────────────────────────────────────────────
            # Validate boards against the installed JobSpy Site enum.
            # One unsupported name in the list aborts the entire scrape_jobs() call.
            _filtered: list = []
            if boards:
                if _valid is None:
                    _filtered = boards  # fallback: pass through unchanged
                else:
                    _filtered = [b for b in boards if b in _valid]
                    _dropped  = [b for b in boards if b not in _valid]
                    if _dropped:
                        print(f"  [jobspy] Skipping unsupported boards: {', '.join(_dropped)}")
                if not _filtered:
                    # Only the JobSpy step is skipped; the custom boards
                    # below must still run for this location.
                    print(f"  [jobspy] No valid boards for {location} — skipping")

            if _filtered:
                print(f"  [jobspy] {location} — boards: {', '.join(_filtered)}")
                try:
                    jobspy_kwargs: dict = dict(
                        site_name=_filtered,
                        search_term=search_term,
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    if _is_remote is not None:
                        jobspy_kwargs["is_remote"] = _is_remote
                    jobs: pd.DataFrame = scrape_jobs(**jobspy_kwargs)
                    print(f"  [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    # Best effort: a failing board must not abort the whole run.
                    print(f"  [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()

                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue

                    job_dict = job.to_dict()

                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])

                    _dp = job_dict.get("date_posted")
                    date_posted_str = (
                        _dp.isoformat() if hasattr(_dp, "isoformat") else str(_dp)
                    ) if _dp and str(_dp) not in ("nan", "None", "") else ""
                    row = {
                        "url":         url,
                        "title":       _s(job_dict.get("title")),
                        "company":     _s(job_dict.get("company")),
                        "source":      _s(job_dict.get("site")),
                        "location":    _s(job_dict.get("location")),
                        "is_remote":   _flag(job_dict.get("is_remote")),
                        "salary":      salary_str,
                        "description": _s(job_dict.get("description")),
                        "date_posted": date_posted_str,
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f"    + {row['title']} @ {row['company']} [{row['source']}]")

                print(f"  [jobspy] {jobspy_new} new listings from {location}")

            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f"  [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue

                print(f"  [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    # Best effort: a failing scraper must not abort the whole run.
                    print(f"  [{board_name}] ERROR: {exc}")
                    custom_jobs = []

                print(f"  [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f"    + {job.get('title')} @ {job.get('company')} [{board_name}]")

                print(f"  [{board_name}] {board_new} new listings from {location}")

    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count


# Script entry point: run one discovery pass with the default arguments
# (DEFAULT_DB as the staging database, no Notion push).
if __name__ == "__main__":
    run_discovery()