peregrine/scripts/custom_boards/craigslist.py

"""Craigslist job scraper — RSS-based.

Uses Craigslist's native RSS feed endpoint for discovery.
Full job description is populated by the scrape_url background task.
Company name and salary (not structured in Craigslist listings) are
extracted from the description body by the enrich_craigslist task.

Config: config/craigslist.yaml  (gitignored — metro list + location map)
        config/craigslist.yaml.example  (committed template)

Returns a list of dicts compatible with scripts.db.insert_job().
"""
from __future__ import annotations

import time
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from urllib.parse import quote_plus

import requests
import yaml

_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml"
_DEFAULT_CATEGORY = "jjj"
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
}
_TIMEOUT = 15
_SLEEP = 0.5  # seconds between requests — easy to make configurable later


def _load_config() -> dict:
    if not _CONFIG_PATH.exists():
        raise FileNotFoundError(
            f"Craigslist config not found: {_CONFIG_PATH}\n"
            "Copy config/craigslist.yaml.example → config/craigslist.yaml "
            "and configure your target metros."
        )
    cfg = yaml.safe_load(_CONFIG_PATH.read_text()) or {}
    if not cfg.get("metros"):
        raise ValueError(
            "config/craigslist.yaml must contain at least one entry under 'metros'."
        )
    return cfg


def _rss_url(metro: str, category: str, query: str) -> str:
    return (
        f"https://{metro}.craigslist.org/search/{category}"
        f"?query={quote_plus(query)}&format=rss&sort=date"
    )


def _parse_pubdate(pubdate_str: str) -> datetime | None:
    """Parse an RSS pubDate string to a timezone-aware datetime."""
    try:
        return parsedate_to_datetime(pubdate_str)
    except Exception:
        return None


def _fetch_rss(url: str) -> list[dict]:
    """Fetch and parse a Craigslist RSS feed. Returns list of raw item dicts."""
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError as exc:
        raise ValueError(f"Malformed RSS XML: {exc}") from exc

    items = []
    for item in root.findall(".//item"):
        def _text(tag: str, _item=item) -> str:
            el = _item.find(tag)
            return (el.text or "").strip() if el is not None else ""

        items.append({
            "title":       _text("title"),
            "link":        _text("link"),
            "description": _text("description"),
            "pubDate":     _text("pubDate"),
        })
    return items


def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """Fetch jobs from Craigslist RSS for a single location.

    Args:
        profile: Search profile dict from search_profiles.yaml.
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all metros and titles.

    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description.
        company/salary are empty — filled later by enrich_craigslist task.
    """
    try:
        cfg = _load_config()
    except (FileNotFoundError, ValueError) as exc:
        print(f"    [craigslist] Skipped — {exc}")
        return []

    metros_all: list[str] = cfg.get("metros", [])
    location_map: dict[str, str] = cfg.get("location_map", {})
    category: str = cfg.get("category") or _DEFAULT_CATEGORY

    is_remote_search = location.lower() == "remote"
    if is_remote_search:
        metros = metros_all
    else:
        metro = location_map.get(location)
        if not metro:
            print(f"    [craigslist] No metro mapping for '{location}' — skipping")
            return []
        metros = [metro]

    titles: list[str] = profile.get("titles", [])
    hours_old: int = profile.get("hours_old", 240)
    cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600)

    seen_urls: set[str] = set()
    results: list[dict] = []

    for metro in metros:
        if len(results) >= results_wanted:
            break

        for title in titles:
            if len(results) >= results_wanted:
                break

            url = _rss_url(metro, category, title)
            try:
                items = _fetch_rss(url)
            except requests.RequestException as exc:
                print(f"    [craigslist] HTTP error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)
                continue
            except ValueError as exc:
                print(f"    [craigslist] Parse error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)
                continue

            for item in items:
                if len(results) >= results_wanted:
                    break

                item_url = item.get("link", "")
                if not item_url or item_url in seen_urls:
                    continue

                pub = _parse_pubdate(item.get("pubDate", ""))
                if pub and pub.timestamp() < cutoff:
                    continue

                seen_urls.add(item_url)
                results.append({
                    "title":       item.get("title", ""),
                    "company":     "",
                    "url":         item_url,
                    "source":      "craigslist",
                    "location":    f"{metro} (Craigslist)",
                    "is_remote":   is_remote_search,
                    "salary":      "",
                    "description": "",
                })

            time.sleep(_SLEEP)

    return results[:results_wanted]