# scripts/credential_store.py
# Added by commit faa1807 ("feat(api): add job ranker and credential store scripts"):
# per-user credential management (BYOK API keys, email passwords); used by the
# dev_api settings endpoints.
"""
Credential store abstraction for Peregrine.

Backends (set via CREDENTIAL_BACKEND env var):
  auto     → try keyring, fall back to file (default)
  keyring  → python-keyring (OS Keychain / SecretService / libsecret)
  file     → Fernet-encrypted JSON in config/credentials/ (key at config/.credential_key)

Env var references:
  Any stored value matching ${VAR_NAME} is resolved from os.environ at read time.
  Users can store "${IMAP_PASSWORD}" as the credential value; it is never treated
  as the actual secret — only the env var it points to is used.
"""

import contextlib
import json
import logging
import os
import re
import tempfile
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

_ENV_REF = re.compile(r'^\$\{([A-Z_][A-Z0-9_]*)\}$')

_PROJECT_ROOT = Path(__file__).parent.parent
CRED_DIR = _PROJECT_ROOT / "config" / "credentials"
KEY_PATH = _PROJECT_ROOT / "config" / ".credential_key"


def _resolve_env_ref(value: str) -> Optional[str]:
    """If value is ${VAR_NAME}, return os.environ[VAR_NAME]; otherwise return None.

    None is also returned (with a warning) when the value *is* a reference but
    the environment variable it names is not defined.
    """
    m = _ENV_REF.match(value)
    if m:
        resolved = os.environ.get(m.group(1))
        if resolved is None:
            logger.warning("Credential reference %s is set but env var is not defined", value)
        return resolved
    return None


def _get_backend() -> str:
    """Return the active backend name: ``"keyring"`` or ``"file"``.

    An explicit CREDENTIAL_BACKEND value wins; anything other than "keyring"
    or "auto" selects the file backend.  In "auto" mode keyring is used only
    when it can be imported and exposes a usable (non null/fail) backend.
    """
    backend = os.environ.get("CREDENTIAL_BACKEND", "auto").strip().lower()
    if backend != "auto":
        return "keyring" if backend == "keyring" else "file"
    try:
        import keyring
        kr_name = type(keyring.get_keyring()).__name__.lower()
    except Exception:
        # keyring missing or broken — the file backend always works
        return "file"
    # Reject the null/fail keyring — it can't actually store anything
    if "fail" in kr_name or "null" in kr_name:
        return "file"
    return "keyring"


def _get_fernet():
    """Return a Fernet instance, auto-generating the key on first use.

    Returns None when the ``cryptography`` package is not installed, in which
    case callers fall back to plaintext storage.
    """
    try:
        from cryptography.fernet import Fernet
    except ImportError:
        return None

    if KEY_PATH.exists():
        key = KEY_PATH.read_bytes().strip()
    else:
        key = Fernet.generate_key()
        KEY_PATH.parent.mkdir(parents=True, exist_ok=True)
        # Create with 0600 from the start so the key is never world-readable
        fd = os.open(str(KEY_PATH), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "wb") as f:
            f.write(key)
        logger.info("Generated new credential encryption key at %s", KEY_PATH)

    return Fernet(key)


def _is_safe_service(service: str) -> bool:
    """Return True when ``service`` can be used as a file name inside CRED_DIR."""
    if not isinstance(service, str) or not service or service in (".", ".."):
        return False
    return not any(ch in service for ch in ("/", "\\", "\x00"))


def _cred_path(service: str) -> Path:
    """Return the credentials file path for ``service``.

    Raises:
        ValueError: if the name contains a path separator (or is otherwise
            unusable as a file name) and could point outside CRED_DIR.
    """
    if not _is_safe_service(service):
        raise ValueError(f"Invalid credential service name: {service!r}")
    return CRED_DIR / f"{service}.json"


def _require_dict(data, service: str) -> dict:
    """Return ``data`` if it is a JSON object, else log the problem and return {}."""
    if isinstance(data, dict):
        return data
    logger.error("Credentials file for service %s does not contain a JSON object", service)
    return {}


def _write_private(path: Path, content: bytes) -> None:
    """Atomically write ``content`` to ``path`` with owner-only permissions.

    The data goes to a temporary file in the same directory (mkstemp creates
    it readable/writable by the owner only) which then replaces the target, so
    a crash mid-write cannot leave a truncated credentials file behind.
    """
    fd, tmp_name = tempfile.mkstemp(
        dir=str(path.parent), prefix=f".{path.name}.", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(content)
        os.replace(tmp_name, str(path))
    except BaseException:
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise


def _file_read(service: str) -> dict:
    """Read the credentials file for a service, decrypting if possible.

    Returns {} when the file is missing, the service name is not a valid file
    name, or the content cannot be parsed (the latter is logged).
    """
    if not _is_safe_service(service):
        # Nothing can ever have been stored under such a name.
        return {}
    try:
        raw = (CRED_DIR / f"{service}.json").read_bytes()
    except FileNotFoundError:
        return {}

    fernet = _get_fernet()
    if fernet is not None:
        try:
            return _require_dict(json.loads(fernet.decrypt(raw)), service)
        except Exception:
            # Not decryptable with the current key — may be an older plaintext
            # file, so fall through and try reading it as text.
            pass
    try:
        return _require_dict(json.loads(raw.decode()), service)
    except ValueError:  # covers UnicodeDecodeError and JSONDecodeError
        logger.error("Failed to read credentials for service %s", service)
        return {}


def _file_write(service: str, data: dict) -> None:
    """Write the credentials file for a service, encrypting if possible.

    Raises:
        ValueError: if ``service`` is not a valid file name.
    """
    cred_file = _cred_path(service)  # validate before touching the disk
    CRED_DIR.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data).encode()
    fernet = _get_fernet()
    if fernet is not None:
        payload = fernet.encrypt(payload)
    else:
        logger.warning(
            "cryptography package not installed — storing credentials as plaintext with chmod 600. "
            "Install with: pip install cryptography"
        )
    _write_private(cred_file, payload)


def get_credential(service: str, key: str) -> Optional[str]:
    """
    Retrieve a credential. If the stored value is an env var reference (${VAR}),
    resolves it from os.environ at call time.

    With the keyring backend the file store is consulted as well when keyring
    fails or has no entry, because set_credential() falls back to the file
    store when a keyring write fails.

    Returns None when the credential is not stored, or when it is an env var
    reference whose variable is not defined.
    """
    raw: Optional[str] = None

    if _get_backend() == "keyring":
        try:
            import keyring
            raw = keyring.get_password(service, key)
        except Exception as e:
            logger.error("keyring get failed for %s/%s: %s", service, key, e)

    if raw is None:
        raw = _file_read(service).get(key)
    if raw is None:
        return None

    # Resolve env var references transparently; an undefined variable yields None
    if isinstance(raw, str) and _ENV_REF.match(raw):
        return _resolve_env_ref(raw)
    return raw


def set_credential(service: str, key: str, value: str) -> None:
    """
    Store a credential. Value may be a literal secret or a ${VAR_NAME} reference.
    Env var references are stored as-is and resolved at get time.

    Empty values are ignored (use delete_credential() to remove an entry).

    Raises:
        ValueError: if the file store is used and ``service`` is not a valid
            file name (e.g. contains a path separator).
    """
    if not value:
        return

    if _get_backend() == "keyring":
        try:
            import keyring
            keyring.set_password(service, key, value)
            return
        except Exception as e:
            logger.error("keyring set failed for %s/%s: %s — falling back to file", service, key, e)

    # file backend (also the fallback when keyring is unavailable)
    data = _file_read(service)
    data[key] = value
    _file_write(service, data)


def delete_credential(service: str, key: str) -> None:
    """Remove a stored credential.

    Falls back to the file store when the keyring delete fails.  The file is
    only rewritten (or removed once empty) when the key is actually present,
    so an unreadable credentials file is never deleted by accident.
    """
    if _get_backend() == "keyring":
        try:
            import keyring
            keyring.delete_password(service, key)
            return
        except Exception as e:
            logger.debug("keyring delete failed for %s/%s: %s — trying file store", service, key, e)

    data = _file_read(service)
    if key not in data:
        return
    del data[key]
    if data:
        _file_write(service, data)
    else:
        with contextlib.suppress(FileNotFoundError):
            _cred_path(service).unlink()
# scripts/job_ranker.py
"""Job ranking engine — two-stage discovery → review pipeline.

Stage 1 (discover.py) scrapes a wide corpus and stores everything as 'pending'.
Stage 2 (this module) scores the corpus; GET /api/jobs/stack returns top-N best
matches for the user's current review session.

All signal functions return a float in [0, 1].  The final stack_score is 0–100.

Usage:
    from scripts.job_ranker import rank_jobs
    ranked = rank_jobs(jobs, search_titles, salary_min, salary_max, user_level)
"""
import math
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple


# ── TUNING ─────────────────────────────────────────────────────────────────────
# Adjust these constants to change how jobs are ranked.
# All individual signal scores are normalised to [0, 1] before weighting.
# Weights should sum to ≤ 1.0; the remainder is unallocated slack.

W_RESUME_MATCH = 0.40   # TF-IDF cosine similarity stored as match_score (0–100 → 0–1)
W_TITLE_MATCH = 0.30    # seniority-aware title + domain keyword overlap
W_RECENCY = 0.15        # freshness — exponential decay from date_found
W_SALARY_FIT = 0.10     # salary range overlap vs user target (neutral when unknown)
W_DESC_QUALITY = 0.05   # posting completeness — penalises stub / ghost posts

# Keyword gap penalty: each missing keyword from the resume match costs points.
# Gaps are already partially captured by W_RESUME_MATCH (same TF-IDF source),
# so this is a soft nudge, not a hard filter.
GAP_PENALTY_PER_KEYWORD: float = 0.5   # points off per gap keyword (0–100 scale)
GAP_MAX_PENALTY: float = 5.0           # hard cap so a gap-heavy job can still rank

# Recency half-life: score halves every N days past date_found
RECENCY_HALF_LIFE: int = 7   # days

# Description word-count thresholds
DESC_MIN_WORDS: int = 50      # below this → scaled penalty
DESC_TARGET_WORDS: int = 200  # at or above → full quality score
# ── END TUNING ─────────────────────────────────────────────────────────────────


# ── Seniority level map ────────────────────────────────────────────────────────
# (level, [keywords that identify that level])
# Keywords are matched as whole words / phrases; the trailing spaces and
# punctuation in the entries below are ignored when matching.
# Level 3 is the default (mid-level, no seniority modifier in title).
_SENIORITY_MAP: List[Tuple[int, List[str]]] = [
    (1, ["intern", "internship", "trainee", "apprentice", "apprenticeship",
         "co-op", "coop"]),
    (2, ["entry level", "entry-level", "junior", "jr ", "jr.", "associate "]),
    (3, ["mid level", "mid-level", "intermediate"]),
    (4, ["senior ", "senior,", "sr ", "sr.", " lead ", "lead,", " ii ", " iii ",
         "specialist", "experienced"]),
    (5, ["staff ", "principal ", "architect ", "expert ", "distinguished"]),
    (6, ["director", "head of ", "manager ", "vice president", " vp "]),
    (7, ["chief", "cto", "cio", "cpo", "president", "founder", "cofounder"]),
]

# job_level − user_level → scoring multiplier
# Positive delta = job is more senior (stretch up = encouraged)
# Negative delta = job is below the user's level
_LEVEL_MULTIPLIER: Dict[int, float] = {
    -4: 0.05, -3: 0.10, -2: 0.25, -1: 0.65,
    0: 1.00,
    1: 0.90, 2: 0.65, 3: 0.25, 4: 0.05,
}
_DEFAULT_LEVEL_MULTIPLIER = 0.05

# Seniority / modifier words removed before comparing the domain part of titles.
_LEVEL_WORDS = frozenset({
    "senior", "sr", "junior", "jr", "lead", "staff", "principal",
    "associate", "entry", "mid", "intermediate", "experienced",
    "director", "head", "manager", "architect", "chief", "intern",
    "ii", "iii", "iv", "i",
})

# Title tokens are separated by whitespace and list-style punctuation.
_TITLE_SPLIT = re.compile(r"[\s/|,;:()\[\]]+")

# Salary parsing helpers.
_CURRENCY_CHARS = re.compile(r"[$£€₹¥]")
_THOUSANDS_SEP = re.compile(r"(?<=\d),(?=\d{3}(?!\d))")   # the comma in "80,000"
_SALARY_NUMBER = re.compile(r"(\d+(?:\.\d+)?)\s*(k\b)?")


def _build_seniority_patterns() -> List[tuple]:
    """Compile one whole-word regex per distinct seniority keyword, longest first.

    Longest-first ordering lets a phrase such as "vice president" be consumed
    before the shorter keyword "president" gets a chance to match inside it.
    """
    level_by_keyword: Dict[str, int] = {}
    for level, keywords in _SENIORITY_MAP:
        for kw in keywords:
            core = kw.strip(" ,.")
            level_by_keyword[core] = max(level, level_by_keyword.get(core, level))
    ordered = sorted(level_by_keyword.items(), key=lambda item: len(item[0]), reverse=True)
    return [
        (level, re.compile(r"(?<!\w)" + re.escape(core) + r"(?!\w)"))
        for core, level in ordered
    ]


_SENIORITY_PATTERNS = _build_seniority_patterns()


# ── Seniority helpers ─────────────────────────────────────────────────────────

def infer_seniority(title: str) -> int:
    """Return seniority level 1–7 for a job or resume title.  Defaults to 3.

    Keywords match whole words only, so "Director" is not mistaken for "CTO"
    and "Internal" is not mistaken for "Intern".  When several keywords match,
    the highest level wins.
    """
    text = (title or "").lower()
    best = 0
    for level, pattern in _SENIORITY_PATTERNS:
        # Blank out each hit so shorter keywords cannot re-match inside it.
        text, hits = pattern.subn(" ", text)
        if hits and level > best:
            best = level
    return best or 3


def seniority_from_experience(titles: List[str]) -> int:
    """Estimate user's current level from their most recent experience titles.

    Averages the levels of the top-3 most recent titles (first in the list),
    rounding halves up.  Falls back to 3 (mid-level) if no titles are provided.
    """
    if not titles:
        return 3
    sample = [t for t in titles if t and t.strip()][:3]
    if not sample:
        return 3
    mean_level = sum(infer_seniority(t) for t in sample) / len(sample)
    # round() uses banker's rounding (2.5 → 2 but 3.5 → 4); round half up instead.
    return int(math.floor(mean_level + 0.5))


def _strip_level_words(text: str) -> str:
    """Remove seniority/modifier words so domain keywords stand out.

    Tokens are lower-cased and stripped of surrounding punctuation, so
    "Sr." and "Engineer," compare equal to "sr" and "engineer".
    """
    tokens = (tok.strip(".-–—") for tok in _TITLE_SPLIT.split((text or "").lower()))
    return " ".join(tok for tok in tokens if tok and tok not in _LEVEL_WORDS)


def _core_words(title: str) -> set:
    """Return the set of domain words (longer than 2 characters) in a title."""
    return {w for w in _strip_level_words(title).split() if len(w) > 2}


# ── Signal functions ──────────────────────────────────────────────────────────

def title_match_score(job_title: str, search_titles: List[str], user_level: int) -> float:
    """Seniority-aware title similarity in [0, 1].

    Combines:
    - Domain overlap: keyword intersection between job title and search titles
      after stripping level modifiers (so "Senior Software Engineer" vs
      "Software Engineer" compares only on "software engineer").
    - Seniority multiplier: rewards same-level and +1 stretch; penalises
      large downgrade or unreachable stretch.

    Returns 0.5 (neutral) when the user has no search titles yet.
    """
    if not search_titles:
        return 0.5  # neutral — user hasn't set title prefs yet

    level_delta = infer_seniority(job_title) - user_level
    seniority_factor = _LEVEL_MULTIPLIER.get(level_delta, _DEFAULT_LEVEL_MULTIPLIER)

    job_core_words = _core_words(job_title)

    best_domain = 0.0
    for st in search_titles:
        st_core_words = _core_words(st)
        if not st_core_words:
            continue
        # Recall-biased overlap: what fraction of the search title keywords
        # appear in the job title?  (A job posting may use synonyms but we
        # at least want the core nouns to match.)
        overlap = len(st_core_words & job_core_words) / len(st_core_words)
        best_domain = max(best_domain, overlap)

    # Base score from domain match scaled by seniority appropriateness.
    # A small seniority_factor bonus (×0.15) ensures that even a near-miss
    # domain match still benefits from seniority alignment.
    return min(1.0, best_domain * seniority_factor + seniority_factor * 0.15)


def recency_decay(date_found: str, now: Optional[datetime] = None) -> float:
    """Exponential decay starting from date_found.

    Returns 1.0 for today (or a future date), 0.5 after RECENCY_HALF_LIFE days,
    0.25 after twice that, and so on.  Returns 0.5 (neutral) if the date is
    missing or unparseable.

    Args:
        date_found: "YYYY-MM-DD", optionally followed by a time part, which is
            ignored.
        now: Reference instant; defaults to the current UTC time.  A naive
            datetime is taken to be UTC.
    """
    try:
        # Support both "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS" / "...T..."
        found = datetime.fromisoformat(date_found.strip().split("T")[0].split(" ")[0])
    except (AttributeError, TypeError, ValueError):
        return 0.5
    found = found.replace(tzinfo=timezone.utc)
    if now is None:
        now = datetime.now(tz=timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)
    days_old = max(0.0, (now - found).total_seconds() / 86400)
    return math.exp(-math.log(2) * days_old / RECENCY_HALF_LIFE)


def _parse_salary_range(text: Optional[str]) -> Tuple[Optional[int], Optional[int]]:
    """Extract (low, high) salary integers from free-text.  Returns (None, None) on failure.

    Handles: "$80k - $120k", "USD 80,000 - 120,000 per year", "£45,000",
             "80000", "80K/yr", "80-120k", etc.
    """
    if not text:
        return None, None
    normalized = _CURRENCY_CHARS.sub("", text.lower())
    # Drop thousands separators only; whitespace and other commas keep
    # separating numbers so "80000 120000" is not read as one figure.
    normalized = _THOUSANDS_SEP.sub("", normalized)

    values = set()
    for match in _SALARY_NUMBER.finditer(normalized):
        val = float(match.group(1))
        if match.group(2):    # explicit 'k' suffix
            val *= 1000
        elif val < 1000:      # bare numbers < 1000 are likely thousands (e.g., "80" in "80-120k")
            val *= 1000
        if val >= 10_000:     # sanity: ignore clearly wrong values
            values.add(int(val))
    if not values:
        return None, None
    return min(values), max(values)


def salary_fit(
    salary_text: Optional[str],
    target_min: Optional[int],
    target_max: Optional[int],
) -> float:
    """Salary range overlap score in [0, 1].

    Returns 0.5 (neutral) when either range is unknown — a missing salary
    line is not inherently negative.  A single-figure salary scores 1.0 when
    it falls inside the target range and 0.0 otherwise.
    """
    if not salary_text or (target_min is None and target_max is None):
        return 0.5

    job_low, job_high = _parse_salary_range(salary_text)
    if job_low is None:
        return 0.5
    job_high = job_high or job_low

    t_min = target_min or 0
    if target_max:
        t_max = target_max
    elif target_min:
        t_max = int(target_min * 1.5)   # no ceiling given: assume 1.5× the floor
    else:
        t_max = job_high

    if job_high == job_low:
        # A point has zero width, so the overlap formula below would always
        # yield 0 even for a perfect fit.
        return 1.0 if t_min <= job_low <= t_max else 0.0

    overlap = max(0, min(job_high, t_max) - max(job_low, t_min))
    target_span = max(1, t_max - t_min)
    return min(1.0, overlap / target_span)


def description_quality(description: Optional[str]) -> float:
    """Posting completeness score in [0, 1].

    Stubs and ghost posts score near 0; well-written descriptions score 1.0.
    """
    if not description:
        return 0.0
    words = len(description.split())
    if words < DESC_MIN_WORDS:
        return (words / DESC_MIN_WORDS) * 0.4   # steep penalty for stubs
    if words >= DESC_TARGET_WORDS:
        return 1.0
    return 0.4 + 0.6 * (words - DESC_MIN_WORDS) / (DESC_TARGET_WORDS - DESC_MIN_WORDS)


# ── Composite scorer ──────────────────────────────────────────────────────────

def _count_keyword_gaps(raw) -> int:
    """Count non-blank gap keywords in a comma-separated string or a sequence."""
    if not raw:
        return 0
    items = raw.split(",") if isinstance(raw, str) else raw
    try:
        return sum(1 for g in items if str(g).strip())
    except TypeError:
        return 0


def score_job(
    job: dict,
    search_titles: List[str],
    target_salary_min: Optional[int],
    target_salary_max: Optional[int],
    user_level: int,
    now: Optional[datetime] = None,
) -> float:
    """Compute composite stack_score (0–100) for a single job dict.

    Args:
        job:              Row dict from the jobs table (reads title, match_score,
                          date_found, salary, description, keyword_gaps; missing
                          or None values fall back to neutral signals).
        search_titles:    User's desired job titles (from search prefs).
        target_salary_*:  User's salary target from resume profile (or None).
        user_level:       Inferred seniority level 1–7.
        now:              Reference instant for the recency signal (default:
                          current UTC time).

    Returns:
        A float 0–100.  Higher = better match for this user's session.
    """
    # ── Individual signals (all 0–1) ──────────────────────────────────────────
    match_raw = job.get("match_score")
    try:
        s_resume = 0.5 if match_raw is None else min(1.0, max(0.0, float(match_raw) / 100.0))
    except (TypeError, ValueError):
        s_resume = 0.5   # unusable value → neutral

    # `or ""` guards against keys that are present but hold None (NULL columns)
    s_title = title_match_score(job.get("title") or "", search_titles, user_level)
    s_recency = recency_decay(job.get("date_found") or "", now)
    s_salary = salary_fit(job.get("salary"), target_salary_min, target_salary_max)
    s_desc = description_quality(job.get("description"))

    # ── Weighted sum ──────────────────────────────────────────────────────────
    base = (
        W_RESUME_MATCH * s_resume
        + W_TITLE_MATCH * s_title
        + W_RECENCY * s_recency
        + W_SALARY_FIT * s_salary
        + W_DESC_QUALITY * s_desc
    )

    # ── Keyword gap penalty (applied on the 0–100 scale) ─────────────────────
    gap_count = _count_keyword_gaps(job.get("keyword_gaps"))
    gap_penalty = min(GAP_MAX_PENALTY, gap_count * GAP_PENALTY_PER_KEYWORD) / 100.0

    return round(max(0.0, base - gap_penalty) * 100, 1)


# ── Public API ────────────────────────────────────────────────────────────────

def rank_jobs(
    jobs: List[dict],
    search_titles: List[str],
    target_salary_min: Optional[int] = None,
    target_salary_max: Optional[int] = None,
    user_level: int = 3,
    limit: int = 10,
    min_score: float = 20.0,
    now: Optional[datetime] = None,
) -> List[dict]:
    """Score and rank pending jobs; return top-N above min_score.

    Args:
        jobs:             List of job dicts (from DB or any source).
        search_titles:    User's desired job titles from search prefs.
        target_salary_*:  User's salary target (from resume profile).
        user_level:       Seniority level 1–7 (use seniority_from_experience()).
        limit:            Stack size; pass 0 to return all qualifying jobs.
        min_score:        Minimum stack_score to include (0–100).
        now:              Reference instant for recency (default: current UTC
                          time, sampled once so every job in the batch is
                          scored against the same moment).

    Returns:
        Sorted list (best first) with 'stack_score' key added to each dict.
        The input dicts are not modified.
    """
    if now is None:
        now = datetime.now(tz=timezone.utc)

    scored = []
    for job in jobs:
        s = score_job(job, search_titles, target_salary_min, target_salary_max, user_level, now)
        if s >= min_score:
            scored.append({**job, "stack_score": s})

    scored.sort(key=lambda j: j["stack_score"], reverse=True)
    return scored[:limit] if limit > 0 else scored