diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..8eb2a32 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,13 @@ +# Contributing to Peregrine + +See the full contributing guide in the documentation: +https://docs.circuitforge.io/peregrine/developer-guide/contributing/ + +## Quick start + +1. Fork the repo and create a feature branch (`feat/my-feature`) +2. Set up the dev environment: `conda env create -f environment.yml` +3. Run tests: `conda run -n job-seeker python -m pytest tests/ -v` +4. Open a pull request — all CI checks must pass + +See the docs for: adding custom scrapers, adding integrations, code style, and PR checklist. diff --git a/docs/developer-guide/adding-integrations.md b/docs/developer-guide/adding-integrations.md new file mode 100644 index 0000000..89181b4 --- /dev/null +++ b/docs/developer-guide/adding-integrations.md @@ -0,0 +1,249 @@ +# Adding an Integration + +Peregrine's integration system is auto-discovered — add a class and a config example, and it appears in the wizard and Settings automatically. No registration step is needed. 
+ +--- + +## Step 1 — Create the integration module + +Create `scripts/integrations/myservice.py`: + +```python +# scripts/integrations/myservice.py + +from scripts.integrations.base import IntegrationBase + + +class MyServiceIntegration(IntegrationBase): + name = "myservice" # must be unique; matches config filename + label = "My Service" # display name shown in the UI + tier = "free" # "free" | "paid" | "premium" + + def fields(self) -> list[dict]: + """Return form field definitions for the connection card in the wizard/Settings UI.""" + return [ + { + "key": "api_key", + "label": "API Key", + "type": "password", # "text" | "password" | "url" | "checkbox" + "placeholder": "sk-...", + "required": True, + "help": "Get your key at myservice.com/settings/api", + }, + { + "key": "workspace_id", + "label": "Workspace ID", + "type": "text", + "placeholder": "ws_abc123", + "required": True, + "help": "Found in your workspace URL", + }, + ] + + def connect(self, config: dict) -> bool: + """ + Store credentials in memory. Return True if all required fields are present. + Does NOT verify credentials — call test() for that. + """ + self._api_key = config.get("api_key", "").strip() + self._workspace_id = config.get("workspace_id", "").strip() + return bool(self._api_key and self._workspace_id) + + def test(self) -> bool: + """ + Verify the stored credentials actually work. + Returns True on success, False on any failure. + """ + try: + import requests + r = requests.get( + "https://api.myservice.com/v1/ping", + headers={"Authorization": f"Bearer {self._api_key}"}, + params={"workspace": self._workspace_id}, + timeout=5, + ) + return r.ok + except Exception: + return False + + def sync(self, jobs: list[dict]) -> int: + """ + Optional: push jobs to the external service. + Return the count of successfully synced jobs. + The default implementation in IntegrationBase returns 0 (no-op). + Only override this if your integration supports job syncing + (e.g. 
Notion, Airtable, Google Sheets). + """ + synced = 0 + for job in jobs: + try: + self._push_job(job) + synced += 1 + except Exception as e: + print(f"[myservice] sync error for job {job.get('id')}: {e}") + return synced + + def _push_job(self, job: dict) -> None: + import requests + requests.post( + "https://api.myservice.com/v1/records", + headers={"Authorization": f"Bearer {self._api_key}"}, + json={ + "workspace": self._workspace_id, + "title": job.get("title", ""), + "company": job.get("company", ""), + "status": job.get("status", "pending"), + "url": job.get("url", ""), + }, + timeout=10, + ).raise_for_status() +``` + +--- + +## Step 2 — Create the config example file + +Create `config/integrations/myservice.yaml.example`: + +```yaml +# config/integrations/myservice.yaml.example +# Copy to config/integrations/myservice.yaml and fill in your credentials. +# This file is gitignored — never commit the live credentials. +api_key: "" +workspace_id: "" +``` + +The live credentials file (`config/integrations/myservice.yaml`) is gitignored automatically via the `config/integrations/` entry in `.gitignore`. + +--- + +## Step 3 — Auto-discovery + +No registration step is needed. The integration registry (`scripts/integrations/__init__.py`) imports all `.py` files in the `integrations/` directory and discovers subclasses of `IntegrationBase` automatically. + +On next startup, `myservice` will appear in: +- The first-run wizard Step 7 (Integrations) +- **Settings → Integrations** with a connection card rendered from `fields()` + +--- + +## Step 4 — Tier-gate new features (optional) + +If you want to gate a specific action (not just the integration itself) behind a tier, add an entry to `app/wizard/tiers.py`: + +```python +FEATURES: dict[str, str] = { + # ...existing entries... 
+ "myservice_sync": "paid", # or "free" | "premium" +} +``` + +Then guard the action in the relevant UI page: + +```python +from app.wizard.tiers import can_use +from scripts.user_profile import UserProfile + +user = UserProfile() +if can_use(user.tier, "myservice_sync"): + # show the sync button +else: + st.info("MyService sync requires a Paid plan.") +``` + +--- + +## Step 5 — Write a test + +Create or add to `tests/test_integrations.py`: + +```python +# tests/test_integrations.py (add to existing file) + +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path +from scripts.integrations.myservice import MyServiceIntegration + + +def test_fields_returns_required_keys(): + integration = MyServiceIntegration() + fields = integration.fields() + assert len(fields) >= 1 + for field in fields: + assert "key" in field + assert "label" in field + assert "type" in field + assert "required" in field + + +def test_connect_returns_true_with_valid_config(): + integration = MyServiceIntegration() + result = integration.connect({"api_key": "sk-abc", "workspace_id": "ws-123"}) + assert result is True + + +def test_connect_returns_false_with_missing_required_field(): + integration = MyServiceIntegration() + result = integration.connect({"api_key": "", "workspace_id": "ws-123"}) + assert result is False + + +def test_test_returns_true_on_200(tmp_path): + integration = MyServiceIntegration() + integration.connect({"api_key": "sk-abc", "workspace_id": "ws-123"}) + + mock_resp = MagicMock() + mock_resp.ok = True + + with patch("requests.get", return_value=mock_resp): + assert integration.test() is True + + +def test_test_returns_false_on_error(tmp_path): + integration = MyServiceIntegration() + integration.connect({"api_key": "sk-abc", "workspace_id": "ws-123"}) + + with patch("requests.get", side_effect=Exception("timeout")): + assert integration.test() is False + + +def 
test_is_configured_reflects_file_presence(tmp_path): + config_dir = tmp_path / "config" + config_dir.mkdir() + (config_dir / "integrations").mkdir() + + assert MyServiceIntegration.is_configured(config_dir) is False + + (config_dir / "integrations" / "myservice.yaml").write_text("api_key: sk-abc\n") + assert MyServiceIntegration.is_configured(config_dir) is True +``` + +--- + +## IntegrationBase Reference + +All integrations inherit from `scripts/integrations/base.py`. Here is the full interface: + +| Method / attribute | Required | Description | +|-------------------|----------|-------------| +| `name: str` | Yes | Machine key — must be unique. Matches the YAML config filename. | +| `label: str` | Yes | Human-readable display name for the UI. | +| `tier: str` | Yes | Minimum tier: `"free"`, `"paid"`, or `"premium"`. | +| `fields() -> list[dict]` | Yes | Returns form field definitions. Each dict: `key`, `label`, `type`, `placeholder`, `required`, `help`. | +| `connect(config: dict) -> bool` | Yes | Stores credentials in memory. Returns `True` if required fields are present. Does NOT verify credentials. | +| `test() -> bool` | Yes | Makes a real network call to verify stored credentials. Returns `True` on success. | +| `sync(jobs: list[dict]) -> int` | No | Pushes jobs to the external service. Returns count synced. Default is a no-op returning 0. | +| `config_path(config_dir: Path) -> Path` | Inherited | Returns `config_dir / "integrations" / f"{name}.yaml"`. | +| `is_configured(config_dir: Path) -> bool` | Inherited | Returns `True` if the config YAML file exists. | +| `save_config(config: dict, config_dir: Path)` | Inherited | Writes config dict to the YAML file. Call after `test()` returns `True`. | +| `load_config(config_dir: Path) -> dict` | Inherited | Loads and returns the YAML config, or `{}` if not configured. 
| + +### Field type values + +| `type` value | UI widget rendered | +|-------------|-------------------| +| `"text"` | Plain text input | +| `"password"` | Password input (masked) | +| `"url"` | URL input | +| `"checkbox"` | Boolean checkbox | diff --git a/docs/developer-guide/adding-scrapers.md b/docs/developer-guide/adding-scrapers.md new file mode 100644 index 0000000..0aba019 --- /dev/null +++ b/docs/developer-guide/adding-scrapers.md @@ -0,0 +1,244 @@ +# Adding a Custom Job Board Scraper + +Peregrine supports pluggable custom job board scrapers. Standard boards use the JobSpy library. Custom scrapers handle boards with non-standard APIs, paywalls, or SSR-rendered pages. + +This guide walks through adding a new scraper from scratch. + +--- + +## Step 1 — Create the scraper module + +Create `scripts/custom_boards/myboard.py`. Every custom scraper must implement one function: + +```python +# scripts/custom_boards/myboard.py + +def scrape(profile: dict, db_path: str) -> list[dict]: + """ + Scrape job listings from MyBoard for the given search profile. + + Args: + profile: The active search profile dict from search_profiles.yaml. + Keys include: titles (list), locations (list), + hours_old (int), results_per_board (int). + db_path: Absolute path to staging.db. Use this if you need to + check for existing URLs before returning. + + Returns: + List of job dicts. Each dict must contain at minimum: + title (str) — job title + company (str) — company name + url (str) — canonical job URL (used as unique key) + source (str) — board identifier, e.g. "myboard" + location (str) — "Remote" or "City, State" + is_remote (bool) — True if remote + salary (str) — salary string or "" if unknown + description (str) — full job description text or "" if unavailable + date_found (str) — ISO 8601 datetime string, e.g. 
"2026-02-25T12:00:00" + """ + jobs = [] + + for title in profile.get("titles", []): + for location in profile.get("locations", []): + results = _fetch_from_myboard(title, location, profile) + jobs.extend(results) + + return jobs + + +def _fetch_from_myboard(title: str, location: str, profile: dict) -> list[dict]: + """Internal helper — call the board's API and transform results.""" + import requests + from datetime import datetime + + params = { + "q": title, + "l": location, + "limit": profile.get("results_per_board", 50), + } + + try: + resp = requests.get( + "https://api.myboard.com/jobs", + params=params, + timeout=15, + ) + resp.raise_for_status() + data = resp.json() + except Exception as e: + print(f"[myboard] fetch error: {e}") + return [] + + jobs = [] + for item in data.get("results", []): + jobs.append({ + "title": item.get("title", ""), + "company": item.get("company", ""), + "url": item.get("url", ""), + "source": "myboard", + "location": item.get("location", ""), + "is_remote": "remote" in item.get("location", "").lower(), + "salary": item.get("salary", ""), + "description": item.get("description", ""), + "date_found": datetime.utcnow().isoformat(), + }) + + return jobs +``` + +### Required fields + +| Field | Type | Notes | +|-------|------|-------| +| `title` | str | Job title | +| `company` | str | Company name | +| `url` | str | **Unique key** — must be stable and canonical | +| `source` | str | Short board identifier, e.g. `"myboard"` | +| `location` | str | `"Remote"` or `"City, ST"` | +| `is_remote` | bool | `True` if remote | +| `salary` | str | Salary string or `""` | +| `description` | str | Full description text or `""` | +| `date_found` | str | ISO 8601 UTC datetime | + +### Deduplication + +`discover.py` deduplicates by `url` before inserting into the database. If a job with the same URL already exists, it is silently skipped. You do not need to handle deduplication inside your scraper. 
+ +### Rate limiting + +Be a good citizen: +- Add a `time.sleep(0.5)` between paginated requests +- Respect `Retry-After` headers +- Do not scrape faster than a human browsing the site +- If the site provides an official API, prefer that over scraping HTML + +### Credentials + +If your scraper requires API keys or credentials: +- Create `config/myboard.yaml.example` as a template +- Create `config/myboard.yaml` (gitignored) for live credentials +- Read it in your scraper inside a `with open("config/myboard.yaml") as f:` block using `yaml.safe_load(f)`, so the file handle is always closed +- Document the credential setup in comments at the top of your module + +--- + +## Step 2 — Register the scraper + +Open `scripts/discover.py` and add your scraper to the `CUSTOM_SCRAPERS` dict: + +```python +from scripts.custom_boards import adzuna, theladders, craigslist, myboard + +CUSTOM_SCRAPERS = { + "adzuna": adzuna.scrape, + "theladders": theladders.scrape, + "craigslist": craigslist.scrape, + "myboard": myboard.scrape, # add this line +} +``` + +--- + +## Step 3 — Activate in a search profile + +Open `config/search_profiles.yaml` and add `myboard` to `custom_boards` in any profile: + +```yaml +profiles: + - name: cs_leadership + boards: + - linkedin + - indeed + custom_boards: + - adzuna + - myboard # add this line + titles: + - Customer Success Manager + locations: + - Remote +``` + +--- + +## Step 4 — Write a test + +Create `tests/test_myboard.py`. 
Mock the HTTP call to avoid hitting the live API during tests: + +```python +# tests/test_myboard.py + +from unittest.mock import patch +from scripts.custom_boards.myboard import scrape + +MOCK_RESPONSE = { + "results": [ + { + "title": "Customer Success Manager", + "company": "Acme Corp", + "url": "https://myboard.com/jobs/12345", + "location": "Remote", + "salary": "$80,000 - $100,000", + "description": "We are looking for a CSM...", + } + ] +} + +def test_scrape_returns_correct_shape(): + profile = { + "titles": ["Customer Success Manager"], + "locations": ["Remote"], + "results_per_board": 10, + "hours_old": 240, + } + + with patch("requests.get") as mock_get: + mock_get.return_value.ok = True + mock_get.return_value.raise_for_status = lambda: None + mock_get.return_value.json.return_value = MOCK_RESPONSE + + jobs = scrape(profile, db_path="nonexistent.db") + + assert len(jobs) == 1 + job = jobs[0] + + # Required fields + assert "title" in job + assert "company" in job + assert "url" in job + assert "source" in job + assert "location" in job + assert "is_remote" in job + assert "salary" in job + assert "description" in job + assert "date_found" in job + + assert job["source"] == "myboard" + assert job["title"] == "Customer Success Manager" + assert job["url"] == "https://myboard.com/jobs/12345" + + +def test_scrape_handles_http_error_gracefully(): + profile = { + "titles": ["Customer Success Manager"], + "locations": ["Remote"], + "results_per_board": 10, + "hours_old": 240, + } + + with patch("requests.get") as mock_get: + mock_get.side_effect = Exception("Connection refused") + + jobs = scrape(profile, db_path="nonexistent.db") + + assert jobs == [] +``` + +--- + +## Existing Scrapers as Reference + +| Scraper | Notes | +|---------|-------| +| `scripts/custom_boards/adzuna.py` | REST API with `app_id` + `app_key` authentication | +| `scripts/custom_boards/theladders.py` | SSR scraper using 
`curl_cffi` to parse `__NEXT_DATA__` JSON embedded in the page | +| `scripts/custom_boards/craigslist.py` | RSS feed scraper | diff --git a/docs/developer-guide/architecture.md b/docs/developer-guide/architecture.md new file mode 100644 index 0000000..e6c1e22 --- /dev/null +++ b/docs/developer-guide/architecture.md @@ -0,0 +1,168 @@ +# Architecture + +This page describes Peregrine's system structure, layer boundaries, and key design decisions. + +--- + +## System Overview + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Docker Compose │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌───────┐ ┌───────────────┐ │ +│ │ app │ │ ollama │ │ vllm │ │ vision │ │ +│ │ :8501 │ │ :11434 │ │ :8000 │ │ :8002 │ │ +│ │Streamlit │ │ Local LLM│ │ vLLM │ │ Moondream2 │ │ +│ └────┬─────┘ └──────────┘ └───────┘ └───────────────┘ │ +│ │ │ +│ ┌────┴───────┐ ┌─────────────┐ │ +│ │ searxng │ │ staging.db │ │ +│ │ :8888 │ │ (SQLite) │ │ +│ └────────────┘ └─────────────┘ │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────┐ +│ Streamlit App Layer │ +│ │ +│ app/app.py (entry point, navigation, sidebar task badge) │ +│ │ +│ app/pages/ │ +│ 0_Setup.py First-run wizard (gates everything) │ +│ 1_Job_Review.py Approve / reject queue │ +│ 2_Settings.py All user configuration │ +│ 4_Apply.py Cover letter gen + PDF export │ +│ 5_Interviews.py Kanban: phone_screen → hired │ +│ 6_Interview_Prep.py Research brief + practice Q&A │ +│ 7_Survey.py Culture-fit survey assistant │ +│ │ +│ app/wizard/ │ +│ step_hardware.py ... 
step_integrations.py │ +│ tiers.py Feature gate definitions │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────┐ +│ Scripts Layer │ +│ (framework-independent — could be called by FastAPI) │ +│ │ +│ discover.py JobSpy + custom board orchestration │ +│ match.py Resume keyword scoring │ +│ db.py All SQLite helpers (single source) │ +│ llm_router.py LLM fallback chain │ +│ generate_cover_letter.py Cover letter generation │ +│ company_research.py Pre-interview research brief │ +│ task_runner.py Background daemon thread executor │ +│ imap_sync.py IMAP email fetch + classify │ +│ sync.py Push to external integrations │ +│ user_profile.py UserProfile wrapper for user.yaml │ +│ preflight.py Port + resource check │ +│ │ +│ custom_boards/ Per-board scrapers │ +│ integrations/ Per-service integration drivers │ +│ vision_service/ FastAPI Moondream2 inference server │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────┐ +│ Config Layer │ +│ │ +│ config/user.yaml Personal data + wizard state │ +│ config/llm.yaml LLM backends + fallback chains │ +│ config/search_profiles.yaml Job search configuration │ +│ config/resume_keywords.yaml Scoring keywords │ +│ config/blocklist.yaml Excluded companies/domains │ +│ config/email.yaml IMAP credentials │ +│ config/integrations/ Per-integration credentials │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────┐ +│ Database Layer │ +│ │ +│ staging.db (SQLite, local, gitignored) │ +│ │ +│ jobs Core pipeline — all job data │ +│ job_contacts Email thread log per job │ +│ company_research LLM-generated research briefs │ +│ background_tasks Async task queue state │ +│ survey_responses Culture-fit survey Q&A pairs │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Layer Boundaries + 
+### App layer (app/) + +The Streamlit UI layer. Its only responsibilities are: + +- Reading from `scripts/db.py` helpers +- Calling `scripts/` functions directly or via `task_runner.submit_task()` +- Rendering results to the browser + +The app layer does not contain business logic. Database queries, LLM calls, and integrations all live in `scripts/`. + +### Scripts layer (scripts/) + +This is the stable public API of Peregrine. Scripts are designed to be framework-independent — they do not import Streamlit and can be called from a CLI, FastAPI endpoint, or background thread without modification. + +All personal data access goes through `scripts/user_profile.py` (`UserProfile` class). Scripts never read `config/user.yaml` directly. + +All database access goes through `scripts/db.py`. No script does raw SQLite outside of `db.py`. + +### Config layer (config/) + +Plain YAML files. Gitignored files contain secrets; `.example` files are committed as templates. + +--- + +## Background Tasks + +`scripts/task_runner.py` provides a simple background thread executor for long-running LLM tasks. + +```python +from scripts.task_runner import submit_task + +# Queue a cover letter generation task +submit_task(db_path, task_type="cover_letter", job_id=42) + +# Queue a company research task +submit_task(db_path, task_type="company_research", job_id=42) +``` + +Tasks are recorded in the `background_tasks` table with statuses: `queued → running → completed / failed`. + +**Dedup rule:** Only one `queued` or `running` task per `(task_type, job_id)` pair is allowed at a time. Submitting a duplicate is a silent no-op. + +**On startup:** `app/app.py` resets any `running` or `queued` rows to `failed` to clear tasks that were interrupted by a server restart. + +**Sidebar indicator:** `app/app.py` polls the `background_tasks` table every 3 seconds via a Streamlit fragment and displays a badge in the sidebar. 
+ +--- + +## LLM Router + +`scripts/llm_router.py` provides a single `complete()` call that tries backends in priority order and falls back transparently. See [LLM Router](../reference/llm-router.md) for full documentation. + +--- + +## Key Design Decisions + +### scripts/ is framework-independent + +The scripts layer was deliberately kept free of Streamlit imports. This means the full pipeline can be migrated to a FastAPI or Celery backend without rewriting business logic. + +### All personal data via UserProfile + +`scripts/user_profile.py` is the single source of truth for all user data. This makes it easy to swap the storage backend (e.g. from YAML to a database) without touching every script. + +### SQLite as staging layer + +`staging.db` acts as the staging layer between discovery and external integrations. This lets discovery, matching, and the UI all run independently without network dependencies. External integrations (Notion, Airtable, etc.) are push-only and optional. + +### Tier system in app/wizard/tiers.py + +`FEATURES` is a single dict that maps feature key → minimum tier. `can_use(tier, feature)` is the single gating function. New features are added to `FEATURES` in one place. + +### Vision service is a separate process + +Moondream2 requires `torch` and `transformers`, which are incompatible with the lightweight main conda environment. The vision service runs as a separate FastAPI process in a separate conda environment (`job-seeker-vision`), keeping the main env free of GPU dependencies. diff --git a/docs/developer-guide/contributing.md b/docs/developer-guide/contributing.md new file mode 100644 index 0000000..d160182 --- /dev/null +++ b/docs/developer-guide/contributing.md @@ -0,0 +1,120 @@ +# Contributing + +Thank you for your interest in contributing to Peregrine. This guide covers the development environment, code standards, test requirements, and pull request process. + +!!! note "License" + Peregrine uses a dual licence. 
The discovery pipeline (`scripts/discover.py`, `scripts/match.py`, `scripts/db.py`, `scripts/custom_boards/`) is MIT. All AI features, the UI, and everything else is BSL 1.1. + Do not add `Co-Authored-By:` trailers or AI-attribution notices to commits — this is a commercial repository. + +--- + +## Fork and Clone + +```bash +git clone https://git.circuitforge.io/circuitforge/peregrine +cd peregrine +``` + +Create a feature branch from `main`: + +```bash +git checkout -b feat/my-feature +``` + +--- + +## Dev Environment Setup + +Peregrine's Python dependencies are managed with conda. The same `job-seeker` environment is used for both the legacy personal app and Peregrine. + +```bash +# Create the environment from the lockfile +conda env create -f environment.yml + +# Activate +conda activate job-seeker +``` + +Alternatively, install from `requirements.txt` into an existing Python 3.12 environment: + +```bash +pip install -r requirements.txt +``` + +!!! warning "Keep the env lightweight" + Do not add `torch`, `sentence-transformers`, `bitsandbytes`, `transformers`, or any other CUDA/GPU package to the main environment. These live in separate conda environments (`job-seeker-vision` for the vision service, `ogma` for fine-tuning). Adding them to the main env causes out-of-memory failures during test runs. + +--- + +## Running Tests + +```bash +conda run -n job-seeker python -m pytest tests/ -v +``` + +Or with the direct binary (avoids runaway process spawning): + +```bash +/path/to/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +The `pytest.ini` file scopes collection to the `tests/` directory only — do not widen this. + +All tests must pass before submitting a PR. See [Testing](testing.md) for patterns and conventions. 
+ +--- + +## Code Style + +- **PEP 8** for all Python code — use `flake8` or `ruff` to check +- **Type hints preferred** on function signatures — not required but strongly encouraged +- **Docstrings** on all public functions and classes +- **No print statements** in library code (`scripts/`); use Python's `logging` module or return status in the return value. `print` is acceptable in one-off scripts and `discover.py`-style entry points. + +--- + +## Branch Naming + +| Prefix | Use for | +|--------|---------| +| `feat/` | New features | +| `fix/` | Bug fixes | +| `docs/` | Documentation only | +| `refactor/` | Code reorganisation without behaviour change | +| `test/` | Test additions or corrections | +| `chore/` | Dependency updates, CI, tooling | + +Example: `feat/add-greenhouse-scraper`, `fix/email-imap-timeout`, `docs/add-integration-guide` + +--- + +## PR Checklist + +Before opening a pull request: + +- [ ] All tests pass: `conda run -n job-seeker python -m pytest tests/ -v` +- [ ] New behaviour is covered by at least one test +- [ ] No new dependencies added to `environment.yml` or `requirements.txt` without a clear justification in the PR description +- [ ] Documentation updated if the PR changes user-visible behaviour (update the relevant page in `docs/`) +- [ ] Config file changes are reflected in the `.example` file +- [ ] No secrets, tokens, or personal data in any committed file +- [ ] Gitignored files (`config/*.yaml`, `staging.db`, `aihawk/`, `.env`) are not committed + +--- + +## What NOT to Do + +- Do not commit `config/user.yaml`, `config/notion.yaml`, `config/email.yaml`, `config/adzuna.yaml`, or any `config/integrations/*.yaml` — all are gitignored +- Do not commit `staging.db` +- Do not add `torch`, `bitsandbytes`, `transformers`, or `sentence-transformers` to the main environment +- Do not add `Co-Authored-By:` or AI-attribution lines to commit messages +- Do not force-push to `main` + +--- + +## Getting Help + +Open an issue on the repository 
with the `question` label. Include: +- Your OS and Docker version +- The `inference_profile` from your `config/user.yaml` +- Relevant log output from `make logs` diff --git a/docs/developer-guide/testing.md b/docs/developer-guide/testing.md new file mode 100644 index 0000000..18a66f7 --- /dev/null +++ b/docs/developer-guide/testing.md @@ -0,0 +1,181 @@ +# Testing + +Peregrine has a test suite covering the core scripts layer, LLM router, integrations, wizard steps, and database helpers. + +--- + +## Running the Test Suite + +```bash +conda run -n job-seeker python -m pytest tests/ -v +``` + +Or using the direct binary (recommended to avoid runaway process spawning): + +```bash +/path/to/miniconda3/envs/job-seeker/bin/pytest tests/ -v +``` + +`pytest.ini` scopes test collection to `tests/` only: + +```ini +[pytest] +testpaths = tests +``` + +Do not widen this — the `aihawk/` subtree has its own test files that pull in GPU dependencies. + +--- + +## What Is Covered + +The suite currently has approximately 219 tests covering: + +| Module | What is tested | +|--------|---------------| +| `scripts/db.py` | CRUD helpers, status transitions, dedup logic | +| `scripts/llm_router.py` | Fallback chain, backend selection, vision routing, error handling | +| `scripts/match.py` | Keyword scoring, gap calculation | +| `scripts/imap_sync.py` | Email parsing, classification label mapping | +| `scripts/company_research.py` | Prompt construction, output parsing | +| `scripts/generate_cover_letter.py` | Mission alignment detection, prompt injection | +| `scripts/task_runner.py` | Task submission, dedup, status transitions | +| `scripts/user_profile.py` | Accessor methods, defaults, YAML round-trip | +| `scripts/integrations/` | Base class contract, per-driver `fields()` and `connect()` | +| `app/wizard/tiers.py` | `can_use()`, `tier_label()`, edge cases | +| `scripts/custom_boards/` | Scraper return shape, HTTP error handling | + +--- + +## Test Structure + +Tests live in `tests/`. 
File naming mirrors the module being tested: + +``` +tests/ + test_db.py + test_llm_router.py + test_match.py + test_imap_sync.py + test_company_research.py + test_cover_letter.py + test_task_runner.py + test_user_profile.py + test_integrations.py + test_tiers.py + test_adzuna.py + test_theladders.py +``` + +--- + +## Key Patterns + +### tmp_path for YAML files + +Use pytest's built-in `tmp_path` fixture for any test that reads or writes YAML config files: + +```python +def test_user_profile_reads_name(tmp_path): + config = tmp_path / "user.yaml" + config.write_text("name: Alice\nemail: alice@example.com\n") + + from scripts.user_profile import UserProfile + profile = UserProfile(config_path=config) + assert profile.name == "Alice" +``` + +### Mocking LLM calls + +Never make real LLM calls in tests. Patch `LLMRouter.complete`: + +```python +from unittest.mock import patch + +def test_cover_letter_calls_llm(tmp_path): + with patch("scripts.generate_cover_letter.LLMRouter") as MockRouter: + MockRouter.return_value.complete.return_value = "Dear Hiring Manager,\n..." 
+ from scripts.generate_cover_letter import generate + result = generate(job={...}, user_profile={...}) + + assert "Dear Hiring Manager" in result + MockRouter.return_value.complete.assert_called_once() +``` + +### Mocking HTTP in scraper tests + +```python +from unittest.mock import patch + +def test_adzuna_returns_jobs(): + with patch("scripts.custom_boards.adzuna.requests.get") as mock_get: + mock_get.return_value.ok = True + mock_get.return_value.raise_for_status = lambda: None + mock_get.return_value.json.return_value = {"results": [...]} + + from scripts.custom_boards.adzuna import scrape + jobs = scrape(profile={...}, db_path="nonexistent.db") + + assert len(jobs) > 0 +``` + +### Temporary SQLite file for DB tests + +```python +import sqlite3, tempfile, os + +def test_insert_job(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: + db_path = f.name + try: + from scripts.db import init_db, insert_job + init_db(db_path) + insert_job(db_path, title="CSM", company="Acme", url="https://example.com/1", ...) + # assert... + finally: + os.unlink(db_path) +``` + +--- + +## What NOT to Test + +- **Streamlit widget rendering** — Streamlit has no headless test support. Do not try to test `st.button()` or `st.text_input()` calls. Test the underlying script functions instead. +- **Real network calls** — always mock HTTP and LLM clients +- **Real GPU inference** — mock the vision service and LLM router + +--- + +## Adding Tests for New Code + +### New scraper + +Create `tests/test_myboard.py`. Required test cases: +1. Happy path: mock HTTP returns valid data → correct job dict shape +2. HTTP error: mock raises `Exception` → function returns `[]` (does not raise) +3. Empty results: API returns `{"results": []}` → function returns `[]` + +### New integration + +Add to `tests/test_integrations.py`. Required test cases: +1. `fields()` returns list of dicts with required keys +2. 
`connect()` returns `True` with valid config, `False` with missing required field +3. `test()` returns `True` with mocked successful HTTP, `False` with exception +4. `is_configured()` reflects file presence in `tmp_path` + +### New wizard step + +Add to `tests/test_wizard_steps.py`. Test the step's pure-logic functions (validation, data extraction). Do not test the Streamlit rendering. + +### New tier feature gate + +Add to `tests/test_tiers.py`: + +```python +from app.wizard.tiers import can_use + +def test_my_new_feature_requires_paid(): + assert can_use("free", "my_new_feature") is False + assert can_use("paid", "my_new_feature") is True + assert can_use("premium", "my_new_feature") is True +``` diff --git a/docs/getting-started/docker-profiles.md b/docs/getting-started/docker-profiles.md new file mode 100644 index 0000000..347c9a6 --- /dev/null +++ b/docs/getting-started/docker-profiles.md @@ -0,0 +1,118 @@ +# Docker Profiles + +Peregrine uses Docker Compose profiles to start only the services your hardware can support. Choose a profile with `make start PROFILE=<profile>`. + +--- + +## Profile Reference + +| Profile | Services started | Use case | +|---------|----------------|----------| +| `remote` | `app`, `searxng` | No GPU. LLM calls go to an external API (Anthropic, OpenAI-compatible). | +| `cpu` | `app`, `ollama`, `searxng` | No GPU. Runs local models on CPU — functional but slow. | +| `single-gpu` | `app`, `ollama`, `vision`, `searxng` | One NVIDIA GPU. Covers cover letters, research, and vision (survey screenshots). | +| `dual-gpu` | `app`, `ollama`, `vllm`, `vision`, `searxng` | Two NVIDIA GPUs. GPU 0 = Ollama (cover letters), GPU 1 = vLLM (research). 
| + +--- + +## Service Descriptions + +| Service | Image / Source | Port | Purpose | +|---------|---------------|------|---------| +| `app` | `Dockerfile` (Streamlit) | 8501 | The main Peregrine UI | +| `ollama` | `ollama/ollama` | 11434 | Local model inference — cover letters and general tasks | +| `vllm` | `vllm/vllm-openai` | 8000 | High-throughput local inference — research tasks | +| `vision` | `scripts/vision_service/` | 8002 | Moondream2 — survey screenshot analysis | +| `searxng` | `searxng/searxng` | 8888 | Private meta-search engine — company research web scraping | + +--- + +## Choosing a Profile + +### remote + +Use `remote` if: +- You have no NVIDIA GPU +- You plan to use Anthropic Claude or another API-hosted model exclusively +- You want the fastest startup (only two containers) + +You must configure at least one external LLM backend in **Settings → LLM Backends**. + +### cpu + +Use `cpu` if: +- You have no GPU but want to run models locally (e.g. for privacy) +- Acceptable for light use; cover letter generation may take several minutes per request + +Pull a model after the container starts: + +```bash +docker exec -it peregrine-ollama-1 ollama pull llama3.1:8b +``` + +### single-gpu + +Use `single-gpu` if: +- You have one NVIDIA GPU with at least 8 GB VRAM +- Recommended for most single-user installs +- The vision service (Moondream2) starts on the same GPU using 4-bit quantisation (~1.5 GB VRAM) + +### dual-gpu + +Use `dual-gpu` if: +- You have two or more NVIDIA GPUs +- GPU 0 handles Ollama (cover letters, quick tasks) +- GPU 1 handles vLLM (research, long-context tasks) +- The vision service shares GPU 0 with Ollama + +--- + +## GPU Memory Guidance + +| GPU VRAM | Recommended profile | Notes | +|----------|-------------------|-------| +| < 4 GB | `cpu` | GPU too small for practical model loading | +| 4–8 GB | `single-gpu` | Run smaller models (3B–8B parameters) | +| 8–16 GB | `single-gpu` | Run 8B–13B models comfortably | +| 16–24 GB | 
`single-gpu` | Run 13B–34B models | +| 24 GB+ | `single-gpu` or `dual-gpu` | 70B models with quantisation | + +--- + +## How preflight.py Works + +`make start` calls `scripts/preflight.py` before launching Docker. Preflight does the following: + +1. **Port conflict detection** — checks whether `STREAMLIT_PORT`, `OLLAMA_PORT`, `VLLM_PORT`, `SEARXNG_PORT`, and `VISION_PORT` are already in use. Reports any conflicts and suggests alternatives. + +2. **GPU enumeration** — queries `nvidia-smi` for GPU count and VRAM per card. + +3. **RAM check** — reads `/proc/meminfo` (Linux) or `vm_stat` (macOS) to determine available system RAM. + +4. **KV cache offload** — if GPU VRAM is less than 10 GB, preflight calculates `CPU_OFFLOAD_GB` (the amount of KV cache to spill to system RAM) and writes it to `.env`. The vLLM container picks this up via `--cpu-offload-gb`. + +5. **Profile recommendation** — writes `RECOMMENDED_PROFILE` to `.env`. This is informational; `make start` uses the `PROFILE` variable you specify (defaulting to `remote`). + +You can run preflight independently: + +```bash +make preflight +# or +python scripts/preflight.py +``` + +--- + +## Customising Ports + +Edit `.env` before running `make start`: + +```bash +STREAMLIT_PORT=8501 +OLLAMA_PORT=11434 +VLLM_PORT=8000 +SEARXNG_PORT=8888 +VISION_PORT=8002 +``` + +All containers read from `.env` via the `env_file` directive in `compose.yml`. diff --git a/docs/getting-started/first-run-wizard.md b/docs/getting-started/first-run-wizard.md new file mode 100644 index 0000000..aaa413c --- /dev/null +++ b/docs/getting-started/first-run-wizard.md @@ -0,0 +1,165 @@ +# First-Run Wizard + +When you open Peregrine for the first time, the setup wizard launches automatically. It walks through seven steps and saves your progress after each one — if your browser closes or the server restarts, it resumes where you left off. 
+ +--- + +## Step 1 — Hardware + +Peregrine detects NVIDIA GPUs using `nvidia-smi` and reports: + +- Number of GPUs found +- VRAM per GPU +- Available system RAM + +Based on this, it recommends a Docker Compose profile: + +| Recommendation | Condition | +|---------------|-----------| +| `remote` | No GPU detected | +| `cpu` | GPU detected but VRAM < 4 GB | +| `single-gpu` | One GPU with VRAM >= 4 GB | +| `dual-gpu` | Two or more GPUs | + +You can override the recommendation and select any profile manually. The selection is written to `config/user.yaml` as `inference_profile`. + +--- + +## Step 2 — Tier + +Select your Peregrine tier: + +| Tier | Description | +|------|-------------| +| **Free** | Job discovery, matching, and basic pipeline — no LLM features | +| **Paid** | Adds cover letters, company research, email sync, integrations, and all AI features | +| **Premium** | Adds fine-tuning and multi-user support | + +Your tier is written to `config/user.yaml` as `tier`. + +**Dev tier override** — for local testing without a paid licence, set `dev_tier_override: premium` in `config/user.yaml`. This is for development use only and has no effect on production deployments. + +See [Tier System](../reference/tier-system.md) for the full feature gate table. + +--- + +## Step 3 — Identity + +Enter your personal details. These are stored locally in `config/user.yaml` and used to personalise cover letters and research briefs. + +| Field | Description | +|-------|-------------| +| Name | Your full name | +| Email | Primary contact email | +| Phone | Contact phone number | +| LinkedIn | LinkedIn profile URL | +| Career summary | 2–4 sentence professional summary — used in cover letters and interview prep | + +**LLM-assisted writing (Paid):** If you have a paid tier, the wizard offers to generate your career summary from a few bullet points using your configured LLM backend. 
+ +--- + +## Step 4 — Resume + +Two paths are available: + +### Upload PDF or DOCX + +Upload your existing resume. The LLM parses it and extracts: +- Work experience (employer, title, dates, bullets) +- Education +- Skills +- Certifications + +The extracted data is stored in `config/user.yaml` and used when generating cover letters. + +### Guided form builder + +Fill in each section manually using structured form fields. Useful if you do not have a digital resume file ready, or if the parser misses something important. + +Both paths produce the same data structure. You can mix them — upload first, then edit the result in the form. + +--- + +## Step 5 — Inference + +Configure which LLM backends Peregrine uses. Backends are tried in priority order; if the first fails, Peregrine falls back to the next. + +Available backend types: + +| Type | Examples | Notes | +|------|---------|-------| +| `openai_compat` | Ollama, vLLM, Claude Code wrapper, Copilot wrapper | Any OpenAI-compatible API | +| `anthropic` | Claude via Anthropic API | Requires `ANTHROPIC_API_KEY` env var | +| `vision_service` | Moondream2 local service | Used for survey screenshot analysis only | + +For each backend you want to enable: + +1. Enter the base URL (e.g. `http://localhost:11434/v1` for Ollama) +2. Enter an API key if required (Anthropic, OpenAI) +3. Click **Test** — Peregrine pings the `/health` endpoint and attempts a short completion + +The full backend configuration is written to `config/llm.yaml`. You can edit it directly later via **Settings → LLM Backends**. + +!!! tip "Recommended minimum" + Enable at least Ollama with a general-purpose model (e.g. `llama3.1:8b`) for research tasks, and either Ollama or Anthropic for cover letter generation. The wizard will not block you if no backend is configured, but most features will not work. + +--- + +## Step 6 — Search + +Define what jobs to look for. Search configuration is written to `config/search_profiles.yaml`. 
+ +| Field | Description | +|-------|-------------| +| Profile name | A label for this search profile (e.g. `cs_leadership`) | +| Job titles | List of titles to search for (e.g. `Customer Success Manager`, `TAM`) | +| Locations | City/region strings or `Remote` | +| Boards | Standard boards: `linkedin`, `indeed`, `glassdoor`, `zip_recruiter`, `google` | +| Custom boards | Additional scrapers: `adzuna`, `theladders`, `craigslist` | +| Exclude keywords | Jobs containing these words in the title are dropped | +| Results per board | Max jobs to fetch per board per run | +| Hours old | Only fetch jobs posted within this many hours | + +You can create multiple profiles (e.g. one for remote roles, one for a target industry). Run them all from the Home page or run a specific one. + +--- + +## Step 7 — Integrations + +Connect optional external services. All integrations are optional — skip this step if you want to use Peregrine without external accounts. + +Available integrations: + +**Job tracking (Paid):** Notion, Airtable, Google Sheets + +**Document storage (Free):** Google Drive, Dropbox, OneDrive, MEGA, Nextcloud + +**Calendar (Paid):** Google Calendar, Apple Calendar (CalDAV) + +**Notifications (Paid for Slack; Free for Discord and Home Assistant):** Slack, Discord, Home Assistant + +Each integration has a connection card with the required credentials. Click **Test** to verify the connection before saving. Credentials are written to `config/integrations/<name>.yaml` (gitignored). + +See [Integrations](../user-guide/integrations.md) for per-service details. + +--- + +## Crash Recovery + +The wizard saves your progress to `config/user.yaml` after each step is completed (`wizard_step` field). If anything goes wrong: + +- Restart Peregrine and navigate to http://localhost:8501 +- The wizard resumes at the last completed step + +--- + +## Re-entering the Wizard + +To go through the wizard again (e.g. to change your search profile or swap LLM backends): + +1. 
Open **Settings** +2. Go to the **Developer** tab +3. Click **Reset wizard** + +This sets `wizard_complete: false` and `wizard_step: 0` in `config/user.yaml`. Your previously entered data is preserved as defaults. diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..bb106b7 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,134 @@ +# Installation + +This page walks through a full Peregrine installation from scratch. + +--- + +## Prerequisites + +- **Git** — to clone the repository +- **Internet connection** — `setup.sh` downloads Docker and other dependencies +- **Operating system**: Ubuntu/Debian, Fedora/RHEL, Arch Linux, or macOS (with Docker Desktop) + +!!! warning "Windows" + Windows is not supported. Use [WSL2 with Ubuntu](https://docs.microsoft.com/windows/wsl/install) instead. + +--- + +## Step 1 — Clone the repository + +```bash +git clone https://git.circuitforge.io/circuitforge/peregrine +cd peregrine +``` + +--- + +## Step 2 — Run setup.sh + +```bash +bash setup.sh +``` + +`setup.sh` performs the following automatically: + +1. **Detects your platform** (Ubuntu/Debian, Fedora/RHEL, Arch, macOS) +2. **Installs Git** if not already present +3. **Installs Docker Engine** and the Docker Compose v2 plugin via the official Docker repositories +4. **Adds your user to the `docker` group** so you do not need `sudo` for docker commands (Linux only — log out and back in after this) +5. **Detects NVIDIA GPUs** — if `nvidia-smi` is present and working, installs the NVIDIA Container Toolkit and configures Docker to use it +6. **Creates `.env` from `.env.example`** — edit `.env` to customise ports and model storage paths before starting + +!!! note "macOS" + `setup.sh` installs Docker Desktop via Homebrew (`brew install --cask docker`) then exits. Open Docker Desktop, start it, then re-run the script. + +!!! 
note "GPU requirement" + For GPU support, `nvidia-smi` must return output before you run `setup.sh`. Install your NVIDIA driver first. The Container Toolkit installation will fail silently if the driver is not present. + +--- + +## Step 3 — (Optional) Edit .env + +The `.env` file controls ports and volume mount paths. The defaults work for most single-user installs: + +```bash +# Default ports +STREAMLIT_PORT=8501 +OLLAMA_PORT=11434 +VLLM_PORT=8000 +SEARXNG_PORT=8888 +VISION_PORT=8002 +``` + +Change `STREAMLIT_PORT` if 8501 is taken on your machine. + +--- + +## Step 4 — Start Peregrine + +Choose a profile based on your hardware: + +```bash +make start # remote — no GPU, use API-only LLMs +make start PROFILE=cpu # cpu — local models on CPU (slow) +make start PROFILE=single-gpu # single-gpu — one NVIDIA GPU +make start PROFILE=dual-gpu # dual-gpu — GPU 0 = Ollama, GPU 1 = vLLM +``` + +`make start` runs `preflight.py` first, which checks for port conflicts and writes GPU/RAM recommendations back to `.env`. Then it calls `docker compose --profile <profile> up -d`. + +--- + +## Step 5 — Open the UI + +Navigate to **http://localhost:8501** (or whatever `STREAMLIT_PORT` you set). + +The first-run wizard launches automatically. See [First-Run Wizard](first-run-wizard.md) for a step-by-step guide through all seven steps. + +--- + +## Supported Platforms + +| Platform | Tested | Notes | +|----------|--------|-------| +| Ubuntu 22.04 / 24.04 | Yes | Primary target | +| Debian 12 | Yes | | +| Fedora 39/40 | Yes | | +| RHEL / Rocky / AlmaLinux | Yes | | +| Arch Linux / Manjaro | Yes | | +| macOS (Apple Silicon) | Yes | Docker Desktop required; no GPU support | +| macOS (Intel) | Yes | Docker Desktop required; no GPU support | +| Windows | No | Use WSL2 with Ubuntu | + +--- + +## GPU Support + +Only NVIDIA GPUs are supported. AMD ROCm is not currently supported. 
+ +Requirements: +- NVIDIA driver installed and `nvidia-smi` working before running `setup.sh` +- CUDA 12.x recommended (CUDA 11.x may work but is untested) +- Minimum 8 GB VRAM for `single-gpu` profile with default models +- For `dual-gpu`: GPU 0 is assigned to Ollama, GPU 1 to vLLM + +If your GPU has less than 10 GB VRAM, `preflight.py` will calculate a `CPU_OFFLOAD_GB` value and write it to `.env`. The vLLM container picks this up via `--cpu-offload-gb` to overflow KV cache to system RAM. + +--- + +## Stopping Peregrine + +```bash +make stop # stop all containers +make restart # stop then start again (runs preflight first) +``` + +--- + +## Reinstalling / Clean State + +```bash +make clean # removes containers, images, and data volumes (destructive) +``` + +You will be prompted to type `yes` to confirm. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..73d4fc8 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,65 @@ +# Peregrine + +**AI-powered job search pipeline — by [Circuit Forge LLC](https://circuitforge.io)** + +Peregrine automates the full job search lifecycle: discovery, matching, cover letter generation, application tracking, and interview preparation. It is privacy-first and local-first — your data never leaves your machine unless you configure an external integration. + +--- + +## Quick Start + +```bash +# 1. Clone and install dependencies +git clone https://git.circuitforge.io/circuitforge/peregrine +cd peregrine +bash setup.sh + +# 2. Start Peregrine +make start # no GPU, API-only +make start PROFILE=single-gpu # one NVIDIA GPU +make start PROFILE=dual-gpu # dual GPU (Ollama + vLLM) + +# 3. Open the UI +# http://localhost:8501 +``` + +The first-run wizard guides you through hardware detection, tier selection, identity, resume, LLM configuration, search profiles, and integrations. See [Installation](getting-started/installation.md) for the full walkthrough. 
+ +--- + +## Feature Overview + +| Feature | Free | Paid | Premium | +|---------|------|------|---------| +| Job discovery (JobSpy + custom boards) | Yes | Yes | Yes | +| Resume keyword matching | Yes | Yes | Yes | +| Cover letter generation | - | Yes | Yes | +| Company research briefs | - | Yes | Yes | +| Interview prep & practice Q&A | - | Yes | Yes | +| Email sync & auto-classification | - | Yes | Yes | +| Survey assistant (culture-fit Q&A) | - | Yes | Yes | +| Integration connectors (Notion, Airtable, etc.) | Partial | Yes | Yes | +| Calendar sync (Google, Apple) | - | Yes | Yes | +| Cover letter model fine-tuning | - | - | Yes | +| Multi-user support | - | - | Yes | + +See [Tier System](reference/tier-system.md) for the full feature gate table. + +--- + +## Documentation Sections + +- **[Getting Started](getting-started/installation.md)** — Install, configure, and launch Peregrine +- **[User Guide](user-guide/job-discovery.md)** — How to use every feature in the UI +- **[Developer Guide](developer-guide/contributing.md)** — Add scrapers, integrations, and contribute code +- **[Reference](reference/tier-system.md)** — Tier system, LLM router, and config file schemas + +--- + +## License + +Core discovery pipeline: [MIT](https://git.circuitforge.io/circuitforge/peregrine/src/branch/main/LICENSE-MIT) + +AI features (cover letter generation, company research, interview prep, UI): [BSL 1.1](https://git.circuitforge.io/circuitforge/peregrine/src/branch/main/LICENSE-BSL) + +© 2026 Circuit Forge LLC diff --git a/docs/reference/config-files.md b/docs/reference/config-files.md new file mode 100644 index 0000000..26bf4f2 --- /dev/null +++ b/docs/reference/config-files.md @@ -0,0 +1,353 @@ +# Config Files + +All Peregrine configuration lives in the `config/` directory. Gitignored files contain secrets or personal data; `.example` files are committed as templates. 
+ +--- + +## Gitignore Status + +| File | Gitignored | Notes | +|------|-----------|-------| +| `config/user.yaml` | Yes | Personal data + wizard state | +| `config/llm.yaml` | No | LLM backends (no secrets by default) | +| `config/search_profiles.yaml` | No | Search configuration (no secrets) | +| `config/resume_keywords.yaml` | No | Scoring keywords (no secrets) | +| `config/blocklist.yaml` | No | Excluded companies (no secrets) | +| `config/email.yaml` | Yes | IMAP credentials | +| `config/notion.yaml` | Yes | Notion token | +| `config/adzuna.yaml` | Yes | Adzuna API credentials | +| `config/craigslist.yaml` | Yes | Craigslist target cities | +| `config/integrations/*.yaml` | Yes | All integration credentials | +| `.env` | Yes | Docker port and path overrides | + +--- + +## config/user.yaml + +The primary personal data file. Created by the first-run wizard. + +```yaml +# Identity +name: "Your Name" +email: "you@example.com" +phone: "555-000-0000" +linkedin: "linkedin.com/in/yourprofile" +career_summary: > + Experienced professional with X years in [field]. 
+ +# Privacy +nda_companies: [] # company names to redact from research briefs + +# Mission alignment +mission_preferences: + music: "" # personal note injected into cover letter para 3 + animal_welfare: "" + education: "" + +# Research brief options (personal decision-making only) +candidate_accessibility_focus: false # adds ADA/WCAG/ERG section +candidate_lgbtq_focus: false # adds LGBTQIA+ inclusion section + +# Tier +tier: free # free | paid | premium +dev_tier_override: null # overrides tier locally for testing + +# Wizard state +wizard_complete: false +wizard_step: 0 +dismissed_banners: [] + +# Storage paths +docs_dir: "~/Documents/JobSearch" +ollama_models_dir: "~/models/ollama" +vllm_models_dir: "~/models/vllm" + +# Inference +inference_profile: "remote" # remote | cpu | single-gpu | dual-gpu + +# Service connection settings +services: + streamlit_port: 8501 + ollama_host: localhost + ollama_port: 11434 + ollama_ssl: false + ollama_ssl_verify: true + vllm_host: localhost + vllm_port: 8000 + vllm_ssl: false + vllm_ssl_verify: true + searxng_host: localhost + searxng_port: 8888 + searxng_ssl: false + searxng_ssl_verify: true +``` + +All personal data access in `scripts/` goes through `scripts/user_profile.py` (`UserProfile` class) — never read this file directly in scripts. + +--- + +## config/llm.yaml + +LLM backend definitions and fallback chains. Not gitignored (contains no secrets by default — API keys come from environment variables). 
+ +```yaml +backends: + ollama: + type: openai_compat + base_url: http://localhost:11434/v1 + api_key: ollama # placeholder; Ollama ignores the key + model: llama3.1:8b + enabled: true + supports_images: false + + ollama_research: + type: openai_compat + base_url: http://localhost:11434/v1 + api_key: ollama + model: llama3.1:8b # can be a different model for research + enabled: true + supports_images: false + + vllm: + type: openai_compat + base_url: http://localhost:8000/v1 + api_key: "" + model: __auto__ # auto-detect first loaded model + enabled: true + supports_images: false + + claude_code: + type: openai_compat + base_url: http://localhost:3009/v1 + api_key: any + model: claude-code-terminal + enabled: false + supports_images: true + + github_copilot: + type: openai_compat + base_url: http://localhost:3010/v1 + api_key: any + model: gpt-4o + enabled: false + supports_images: false + + anthropic: + type: anthropic + api_key_env: ANTHROPIC_API_KEY # name of environment variable + model: claude-sonnet-4-6 + enabled: false + supports_images: true + + vision_service: + type: vision_service + base_url: http://localhost:8002 + enabled: true + supports_images: true + +fallback_order: + - ollama + - claude_code + - vllm + - github_copilot + - anthropic + +research_fallback_order: + - claude_code + - vllm + - ollama_research + - github_copilot + - anthropic + +vision_fallback_order: + - vision_service + - claude_code + - anthropic +``` + +See [LLM Router](llm-router.md) for full documentation. + +--- + +## config/search_profiles.yaml + +Defines what jobs to search for. Multiple profiles can coexist. 
+ +```yaml +profiles: + - name: cs_leadership # unique profile identifier + titles: + - Customer Success Manager + - Director of Customer Success + locations: + - Remote + - San Francisco Bay Area, CA + boards: + - linkedin + - indeed + - glassdoor + - zip_recruiter + - google + custom_boards: + - adzuna + - theladders + - craigslist + exclude_keywords: # job titles containing these are dropped + - sales + - account executive + - SDR + results_per_board: 75 + hours_old: 240 # only fetch jobs posted in last N hours + mission_tags: # optional: links to mission_preferences + - music +``` + +--- + +## config/resume_keywords.yaml + +Keywords extracted from your resume, used for match scoring. Managed via **Settings → Skills**. + +```yaml +keywords: + - Customer Success + - Churn reduction + - Salesforce + - SQL + - Stakeholder management + - QBR + - onboarding +``` + +--- + +## config/blocklist.yaml + +Companies or domains to exclude from discovery results entirely. + +```yaml +blocked_companies: + - "Pyramid Scheme Inc" + - "Sketchy Startup" + +blocked_domains: + - "mlm-company.com" +``` + +--- + +## config/email.yaml + +IMAP email sync credentials. Gitignored. See [Email Sync](../user-guide/email-sync.md) for setup. + +```yaml +host: imap.gmail.com +port: 993 +use_ssl: true +username: your.email@gmail.com +password: xxxx-xxxx-xxxx-xxxx # Gmail App Password (16 chars, no spaces) +sent_folder: "" # leave blank to auto-detect +lookback_days: 90 +todo_label: "" # optional: Gmail label to monitor +``` + +--- + +## config/notion.yaml + +Notion integration credentials. Gitignored. + +```yaml +token: "secret_..." # Notion integration token +database_id: "1bd75cff-..." 
# database ID from the URL + +# Notion property names → Peregrine field names +field_map: + title: "Salary" # Notion title property (unusual — it's the page title) + status: "Status of Application" + company: "Company" + url: "Role Link" + source: "Job Source" # multi_select type + location: "Location" + applied_at: "Date Applied" +``` + +Field names in Notion are non-obvious. Always read them from `field_map` rather than guessing. + +--- + +## config/adzuna.yaml + +Adzuna Jobs API credentials. Gitignored. + +```yaml +app_id: "12345678" +app_key: "abcdefgh1234567890abcdefgh123456" +country: "us" # two-letter country code +``` + +Get credentials at [developer.adzuna.com](https://developer.adzuna.com/). + +--- + +## config/craigslist.yaml + +Target city slugs for the Craigslist scraper. Gitignored. + +```yaml +cities: + - sfbay + - nyc + - seattle + - chicago +``` + +Find slugs at `https://www.craigslist.org/about/sites`. + +--- + +## config/integrations/ + +One YAML file per integration, created when you test and save credentials in the wizard or Settings. All files in this directory are gitignored. + +``` +config/integrations/ + notion.yaml + airtable.yaml + google_sheets.yaml + google_drive.yaml + dropbox.yaml + onedrive.yaml + mega.yaml + nextcloud.yaml + google_calendar.yaml + apple_calendar.yaml + slack.yaml + discord.yaml + home_assistant.yaml +``` + +Each file contains only the fields defined by that integration's `fields()` method. Example for Discord: + +```yaml +webhook_url: "https://discord.com/api/webhooks/..." +``` + +--- + +## .env + +Docker port and path overrides. Created from `.env.example` by `setup.sh`. Gitignored. 
+ +```bash +# Ports (change if defaults conflict with existing services) +STREAMLIT_PORT=8501 +OLLAMA_PORT=11434 +VLLM_PORT=8000 +SEARXNG_PORT=8888 +VISION_PORT=8002 + +# GPU settings (written by preflight.py) +RECOMMENDED_PROFILE=single-gpu +CPU_OFFLOAD_GB=0 # KV cache RAM offload for low-VRAM GPUs +``` diff --git a/docs/reference/llm-router.md b/docs/reference/llm-router.md new file mode 100644 index 0000000..e44050e --- /dev/null +++ b/docs/reference/llm-router.md @@ -0,0 +1,231 @@ +# LLM Router + +`scripts/llm_router.py` provides a unified LLM interface with automatic fallback. All LLM calls in Peregrine go through `LLMRouter.complete()`. + +--- + +## How It Works + +`LLMRouter` reads `config/llm.yaml` on instantiation. When `complete()` is called: + +1. It iterates through the active fallback order +2. For each backend, it checks: + - Is the backend `enabled`? + - Is it reachable (health check ping)? + - Does it support the request type (text-only vs. vision)? +3. On the first backend that succeeds, it returns the completion +4. On any error (network, model error, timeout), it logs the failure and tries the next backend +5. If all backends are exhausted, it raises `RuntimeError("All LLM backends exhausted")` + +``` +fallback_order: [ollama, claude_code, vllm, github_copilot, anthropic] + ↓ try + ↓ unreachable? → skip + ↓ disabled? → skip + ↓ error? → next + → return completion +``` + +--- + +## Backend Types + +### `openai_compat` + +Any backend that speaks the OpenAI Chat Completions API. This includes: +- Ollama (`http://localhost:11434/v1`) +- vLLM (`http://localhost:8000/v1`) +- Claude Code wrapper (`http://localhost:3009/v1`) +- GitHub Copilot wrapper (`http://localhost:3010/v1`) + +Health check: `GET {base_url}/health` (strips `/v1` suffix) + +### `anthropic` + +Calls the Anthropic Python SDK directly. Reads the API key from the environment variable named in `api_key_env`. 
+ +Health check: skips health check; proceeds if `api_key_env` is set in the environment. + +### `vision_service` + +The local Moondream2 inference service. Only used when `images` is provided to `complete()`. + +Health check: `GET {base_url}/health` + +Request: `POST {base_url}/analyze` with `{"prompt": ..., "image_base64": ...}` + +--- + +## `complete()` Signature + +```python +def complete( + prompt: str, + system: str | None = None, + model_override: str | None = None, + fallback_order: list[str] | None = None, + images: list[str] | None = None, +) -> str: +``` + +| Parameter | Description | +|-----------|-------------| +| `prompt` | The user message | +| `system` | Optional system prompt (passed as the `system` role) | +| `model_override` | Overrides the configured model for `openai_compat` backends (e.g. pass a research-specific Ollama model) | +| `fallback_order` | Override the fallback chain for this call only (e.g. `config["research_fallback_order"]`) | +| `images` | Optional list of base64-encoded PNG/JPG strings. When provided, backends without `supports_images: true` are skipped automatically. 
| + +--- + +## Fallback Chains + +Three named chains are defined in `config/llm.yaml`: + +| Config key | Used for | +|-----------|---------| +| `fallback_order` | Cover letter generation and general tasks | +| `research_fallback_order` | Company research briefs | +| `vision_fallback_order` | Survey screenshot analysis (requires `images`) | + +Pass a chain explicitly: + +```python +router = LLMRouter() + +# Use the research chain +result = router.complete( + prompt=research_prompt, + system=system_prompt, + fallback_order=router.config["research_fallback_order"], +) + +# Use the vision chain with an image +result = router.complete( + prompt="Describe what you see in this survey", + fallback_order=router.config["vision_fallback_order"], + images=[base64_image_string], +) +``` + +--- + +## Vision Routing + +When `images` is provided: + +- Backends with `supports_images: false` are skipped +- `vision_service` backends are tried (POST to `/analyze`) +- `openai_compat` backends with `supports_images: true` receive images as multipart content in the user message +- `anthropic` backends with `supports_images: true` receive images as base64 content blocks + +When `images` is NOT provided: + +- `vision_service` backends are skipped entirely + +--- + +## `__auto__` Model Resolution + +vLLM can serve different models depending on what is loaded. Set `model: __auto__` in `config/llm.yaml` for the vLLM backend: + +```yaml +vllm: + type: openai_compat + base_url: http://localhost:8000/v1 + model: __auto__ +``` + +`LLMRouter` calls `client.models.list()` and uses the first model returned. This avoids hard-coding a model name that may change when you swap the loaded model. + +--- + +## Adding a Backend + +1. Add an entry to `config/llm.yaml`: + +```yaml +backends: + my_backend: + type: openai_compat # or "anthropic" | "vision_service" + base_url: http://localhost:9000/v1 + api_key: my-key + model: my-model-name + enabled: true + supports_images: false +``` + +2. 
Add it to one or more fallback chains: + +```yaml +fallback_order: + - ollama + - my_backend # add here + - claude_code + - anthropic +``` + +3. No code changes are needed — the router reads the config at startup. + +--- + +## Module-Level Convenience Function + +A module-level singleton is provided for simple one-off calls: + +```python +from scripts.llm_router import complete + +result = complete("Write a brief summary of this company.", system="You are a research assistant.") +``` + +This uses the default `fallback_order` from `config/llm.yaml`. For per-task chain overrides, instantiate `LLMRouter` directly. + +--- + +## Config Reference + +```yaml +# config/llm.yaml + +backends: + ollama: + type: openai_compat + base_url: http://localhost:11434/v1 + api_key: ollama + model: llama3.1:8b + enabled: true + supports_images: false + + anthropic: + type: anthropic + api_key_env: ANTHROPIC_API_KEY # env var name (not the key itself) + model: claude-sonnet-4-6 + enabled: false + supports_images: true + + vision_service: + type: vision_service + base_url: http://localhost:8002 + enabled: true + supports_images: true + +fallback_order: + - ollama + - claude_code + - vllm + - github_copilot + - anthropic + +research_fallback_order: + - claude_code + - vllm + - ollama_research + - github_copilot + - anthropic + +vision_fallback_order: + - vision_service + - claude_code + - anthropic +``` diff --git a/docs/reference/tier-system.md b/docs/reference/tier-system.md new file mode 100644 index 0000000..6cc406a --- /dev/null +++ b/docs/reference/tier-system.md @@ -0,0 +1,159 @@ +# Tier System + +Peregrine uses a three-tier feature gate system defined in `app/wizard/tiers.py`. 
+ +--- + +## Tiers + +``` +free < paid < premium +``` + +| Tier | Description | +|------|-------------| +| `free` | Core discovery pipeline, resume matching, and basic UI — no LLM features | +| `paid` | AI features: cover letters, research, email, integrations, calendar, notifications | +| `premium` | Adds fine-tuning, LLM voice guidelines, and multi-user support | + +--- + +## Feature Gate Table + +Features listed here require a minimum tier. Features not in this table are available to all tiers (free by default). + +### Wizard LLM generation + +| Feature key | Minimum tier | Description | +|-------------|-------------|-------------| +| `llm_career_summary` | paid | LLM-assisted career summary generation in the wizard | +| `llm_expand_bullets` | paid | LLM expansion of resume bullet points | +| `llm_suggest_skills` | paid | LLM skill suggestions from resume content | +| `llm_voice_guidelines` | premium | LLM writing voice and tone guidelines | +| `llm_job_titles` | paid | LLM-suggested job title variations for search | +| `llm_keywords_blocklist` | paid | LLM-suggested blocklist keywords | +| `llm_mission_notes` | paid | LLM-generated mission alignment notes | + +### App features + +| Feature key | Minimum tier | Description | +|-------------|-------------|-------------| +| `company_research` | paid | Auto-generated company research briefs pre-interview | +| `interview_prep` | paid | Live reference sheet and practice Q&A during calls | +| `email_classifier` | paid | IMAP email sync with LLM classification | +| `survey_assistant` | paid | Culture-fit survey Q&A helper (text + screenshot) | +| `model_fine_tuning` | premium | Cover letter model fine-tuning on personal writing | +| `shared_cover_writer_model` | paid | Access to shared fine-tuned cover letter model | +| `multi_user` | premium | Multiple user profiles on one instance | + +### Integrations (paid) + +| Feature key | Minimum tier | Description | +|-------------|-------------|-------------| +| `notion_sync` | paid | Sync jobs 
to Notion database | +| `google_sheets_sync` | paid | Sync jobs to Google Sheets | +| `airtable_sync` | paid | Sync jobs to Airtable | +| `google_calendar_sync` | paid | Create interview events in Google Calendar | +| `apple_calendar_sync` | paid | Create interview events in Apple Calendar (CalDAV) | +| `slack_notifications` | paid | Pipeline event notifications via Slack | + +### Free integrations (not gated) + +The following integrations are free for all tiers and are not in the `FEATURES` dict: + +- `google_drive_sync` — upload documents to Google Drive +- `dropbox_sync` — upload documents to Dropbox +- `onedrive_sync` — upload documents to OneDrive +- `mega_sync` — upload documents to MEGA +- `nextcloud_sync` — upload documents to Nextcloud +- `discord_notifications` — pipeline notifications via Discord webhook +- `home_assistant` — pipeline events to Home Assistant REST API + +--- + +## API Reference + +### `can_use(tier, feature) -> bool` + +Returns `True` if the given tier has access to the feature. + +```python +from app.wizard.tiers import can_use + +can_use("free", "company_research") # False +can_use("paid", "company_research") # True +can_use("premium", "company_research") # True + +can_use("free", "unknown_feature") # True — ungated features return True +can_use("invalid", "company_research") # False — invalid tier string +``` + +### `tier_label(feature) -> str` + +Returns a display badge string for locked features, or `""` if the feature is free or unknown. 
+ +```python +from app.wizard.tiers import tier_label + +tier_label("company_research") # "🔒 Paid" +tier_label("model_fine_tuning") # "⭐ Premium" +tier_label("job_discovery") # "" (ungated) +``` + +--- + +## Dev Tier Override + +For local development and testing without a paid licence, set `dev_tier_override` in `config/user.yaml`: + +```yaml +tier: free +dev_tier_override: premium # overrides tier locally for testing +``` + +`UserProfile.tier` returns `dev_tier_override` when set, falling back to `tier` otherwise. + +!!! warning + `dev_tier_override` is for local development only. It has no effect on production deployments that validate licences server-side. + +--- + +## Adding a New Feature Gate + +1. Add the feature to `FEATURES` in `app/wizard/tiers.py`: + +```python +FEATURES: dict[str, str] = { + # ...existing entries... + "my_new_feature": "paid", # or "free" | "premium" +} +``` + +2. Guard the feature in the UI: + +```python +from app.wizard.tiers import can_use, tier_label +from scripts.user_profile import UserProfile + +user = UserProfile() +if can_use(user.tier, "my_new_feature"): + # show the feature + pass +else: + st.info(f"My New Feature requires a {tier_label('my_new_feature').replace('🔒 ', '').replace('⭐ ', '')} plan.") +``` + +3. Add a test in `tests/test_tiers.py`: + +```python +def test_my_new_feature_requires_paid(): + assert can_use("free", "my_new_feature") is False + assert can_use("paid", "my_new_feature") is True + assert can_use("premium", "my_new_feature") is True +``` + +--- + +## Future: Ultra Tier + +An `ultra` tier is reserved for future use (e.g. enterprise SLA, dedicated inference). The tier ordering in `TIERS = ["free", "paid", "premium"]` can be extended without breaking `can_use()`, since it uses `list.index()` for comparison. 
diff --git a/docs/user-guide/apply-workspace.md b/docs/user-guide/apply-workspace.md new file mode 100644 index 0000000..899b637 --- /dev/null +++ b/docs/user-guide/apply-workspace.md @@ -0,0 +1,76 @@ +# Apply Workspace + +The Apply Workspace is where you generate cover letters, export application documents, and record that you have applied to a job. + +--- + +## Accessing the Workspace + +Navigate to page **4 — Apply** in the sidebar. The workspace lists all jobs with status `approved`, sorted by date approved. + +--- + +## Cover Letter Generation + +Click **Generate Cover Letter** on any job card. Peregrine runs the generation as a background task so you can continue navigating the UI. + +### What the generator uses + +- Your **career summary** and **resume data** from `config/user.yaml` +- The **job title** and **job description** +- **Company name** — used to detect mission-aligned industries +- **Mission alignment notes** from `config/user.yaml` (e.g. a personal note about why you care about music-industry companies) + +### Fallback chain + +Cover letters use the default `fallback_order` chain from `config/llm.yaml`. By default: `ollama → claude_code → vllm → github_copilot → anthropic`. See [LLM Router](../reference/llm-router.md) for details. + +### Mission alignment + +If the company or job description matches one of your configured mission industries (music, animal welfare, education), the generator injects a personalised hint for paragraph 3 into the prompt. This produces a cover letter that reflects authentic alignment rather than generic enthusiasm. + +--- + +## Editing the Cover Letter + +After generation, the cover letter appears in an editable text area. Edit freely — changes are saved locally and do not trigger a re-generation. + +Click **Save** to write the updated text back to the database. + +--- + +## PDF Export + +Click **Export PDF** to generate a formatted PDF of the cover letter. 
The PDF is saved to your `docs_dir` (configured in `config/user.yaml`, default: `~/Documents/JobSearch`). + +The filename format is: `{Company}_{Title}_{Date}_CoverLetter.pdf` + +--- + +## Marking Applied + +Once you have submitted your application externally, click **Mark Applied**. This: + +- Sets the job status to `applied` +- Records `applied_at` timestamp +- Moves the job out of the Apply Workspace and into the Interviews kanban (in `applied` pre-stage) + +--- + +## Rejecting a Listing + +Changed your mind about a job you approved? Click **Reject Listing** to set it to `rejected` status. This removes it from the workspace without affecting your cover letter draft (the text remains in the database). + +--- + +## Cover Letter Background Task Status + +The sidebar shows a live indicator (updated every 3 seconds) of running and queued background tasks. If a cover letter generation is in progress you will see it there. + +A task can have these statuses: +- **queued** — waiting to start +- **running** — actively generating +- **completed** — finished; reload the page to see the result +- **failed** — generation failed; check the logs + +Only one queued or running task per job is allowed at a time. Clicking **Generate Cover Letter** on a job that already has a task in progress is a no-op. diff --git a/docs/user-guide/email-sync.md b/docs/user-guide/email-sync.md new file mode 100644 index 0000000..8da0c1e --- /dev/null +++ b/docs/user-guide/email-sync.md @@ -0,0 +1,119 @@ +# Email Sync + +Peregrine monitors your inbox for job-related emails and automatically updates job stages when it detects interview requests, rejections, offers, and survey links. + +--- + +## Configuration + +Email sync is configured in `config/email.yaml` (gitignored). 
Copy the example template to get started: + +```bash +cp config/email.yaml.example config/email.yaml +``` + +Then fill in your credentials: + +```yaml +host: imap.gmail.com +port: 993 +use_ssl: true +username: your.email@gmail.com +password: xxxx-xxxx-xxxx-xxxx # see Gmail App Password below +sent_folder: "" # leave blank to auto-detect +lookback_days: 90 # how many days back to scan +todo_label: "" # optional Gmail label to monitor +``` + +You can also configure email sync via **Settings → Email** in the UI. + +--- + +## Gmail Setup + +Gmail requires an **App Password** instead of your regular account password. Your regular password will not work. + +1. Enable **2-Step Verification** on your Google Account at [myaccount.google.com/security](https://myaccount.google.com/security) +2. Go to [myaccount.google.com/apppasswords](https://myaccount.google.com/apppasswords) +3. Create a new app password — name it "Peregrine" or similar +4. Copy the 16-character code (no spaces) and paste it as `password` in `config/email.yaml` +5. Enable IMAP in Gmail: **Settings → See all settings → Forwarding and POP/IMAP → Enable IMAP** + +--- + +## Outlook / Office 365 + +```yaml +host: outlook.office365.com +port: 993 +use_ssl: true +username: your.email@company.com +password: your-password # or App Password if MFA is enabled +``` + +--- + +## Gmail Label Monitoring (Optional) + +If you use a Gmail label to flag action-needed job emails (e.g. "TO DO JOBS"), set: + +```yaml +todo_label: "TO DO JOBS" +``` + +Emails in this label are matched to pipeline jobs by company name, then filtered by action keywords in the subject line (e.g. "interview", "next steps", "offer"). 
+ +--- + +## Email Classification Labels + +The email classifier assigns one of six labels to each relevant email: + +| Label | Meaning | +|-------|---------| +| `interview_request` | Recruiter or hiring manager requesting a call or interview | +| `rejection` | Automated or personal rejection | +| `offer` | Job offer letter or verbal offer notification | +| `follow_up` | Candidate or recruiter follow-up with no stage change | +| `survey_received` | Link or request to complete a culture-fit or skills assessment | +| `other` | Job-related but does not fit any category above | + +Classification is performed by your configured LLM backend. The classifier uses the email subject and body as input. + +!!! note "Tier requirement" + Email classification is a Paid feature. + +--- + +## Stage Auto-Updates + +When a classified email is matched to a job in your pipeline, Peregrine updates the job stage automatically: + +| Classification | Stage action | +|---------------|-------------| +| `interview_request` | Moves `applied` → `phone_screen` | +| `rejection` | Moves job → `rejected` (captures `rejection_stage`) | +| `offer` | Flags job for review; moves toward `offer` stage | +| `survey_received` | Moves job → `survey` pre-stage | + +Emails are matched to jobs by comparing the sender domain and company name in the email body against company names in your pipeline. + +--- + +## Running Email Sync + +### From the UI + +Click **Sync Emails** on the Home page. This runs as a background task — you can navigate away while it processes. + +### Non-blocking background sync + +Email sync runs in a daemon thread via `scripts/task_runner.py` and does not block the UI. The sidebar background task indicator shows sync progress. + +--- + +## Email Thread Log + +All matched emails are stored in the `job_contacts` table (one row per email thread per job). You can view the thread log for any job from the Job Review detail view or the Interviews kanban card. 
+ +Columns stored: `direction` (inbound/outbound), `subject`, `from`, `to`, `body`, `received_at`. diff --git a/docs/user-guide/integrations.md b/docs/user-guide/integrations.md new file mode 100644 index 0000000..a45bf5c --- /dev/null +++ b/docs/user-guide/integrations.md @@ -0,0 +1,147 @@ +# Integrations + +Peregrine supports 13 optional integration connectors for job tracking, document storage, calendar sync, and notifications. Configure them in **Settings → Integrations** or during the first-run wizard (Step 7). + +All integration credentials are stored in `config/integrations/<name>.yaml`, one file per integration (gitignored — never committed). + +--- + +## Job Tracking + +### Notion + +**Tier:** Paid + +Syncs approved and applied jobs to a Notion database. Peregrine creates or updates a Notion page per job with status, salary, company, URL, and cover letter text. + +Required credentials: Notion integration token and database ID. + +Configure in `config/integrations/notion.yaml`. + +### Airtable + +**Tier:** Paid + +Syncs the job pipeline to an Airtable base. Each job maps to a row in your configured table. + +Required credentials: Airtable personal access token, base ID, and table name. + +### Google Sheets + +**Tier:** Paid + +Appends job data to a Google Sheet. Useful for sharing pipeline data or building custom dashboards. + +Required credentials: Google service account JSON key file, spreadsheet ID, and sheet name. + +--- + +## Document Storage + +### Google Drive + +**Tier:** Free + +Uploads generated cover letters and exported PDFs to a Google Drive folder automatically when you export from the Apply Workspace. + +Required credentials: Google service account JSON key file and target folder ID. + +### Dropbox + +**Tier:** Free + +Uploads cover letters and PDFs to a Dropbox folder. + +Required credentials: Dropbox access token and target folder path. + +### OneDrive + +**Tier:** Free + +Uploads cover letters and PDFs to a OneDrive folder via the Microsoft Graph API. 
+ +Required credentials: Microsoft OAuth client ID, client secret, tenant ID, and target folder path. + +### MEGA + +**Tier:** Free + +Uploads documents to MEGA cloud storage. + +Required credentials: MEGA account email and password, target folder path. + +### Nextcloud + +**Tier:** Free + +Uploads documents to a self-hosted Nextcloud instance via WebDAV. + +Required credentials: Nextcloud server URL, username, password, and target folder path. + +--- + +## Calendar + +### Google Calendar + +**Tier:** Paid + +Creates calendar events for scheduled interviews. When you set an `interview_date` on a job in the kanban, Peregrine creates a Google Calendar event with a reminder. + +Required credentials: Google service account JSON key file and calendar ID. + +### Apple Calendar (CalDAV) + +**Tier:** Paid + +Creates calendar events on an Apple Calendar or any CalDAV-compatible server. + +Required credentials: CalDAV server URL, username, and password. For iCloud, use an app-specific password. + +--- + +## Notifications + +### Slack + +**Tier:** Paid + +Sends notifications to a Slack channel for key pipeline events: new high-match jobs discovered, stage changes, and research completion. + +Required credentials: Slack incoming webhook URL. + +### Discord + +**Tier:** Free + +Sends notifications to a Discord channel via a webhook. Same events as Slack. + +Required credentials: Discord webhook URL. + +### Home Assistant + +**Tier:** Free + +Sends pipeline events to Home Assistant via the REST API. Useful for smart home dashboards or custom automation triggers. + +Required credentials: Home Assistant base URL and long-lived access token. 
+ +--- + +## Integration Status + +The Settings → Integrations tab shows the connection status of each integration: + +| Status | Meaning | +|--------|---------| +| Connected | Credentials file exists and last test passed | +| Not configured | No credentials file found | +| Error | Credentials file exists but last test failed | + +Click **Test** to re-verify the connection at any time. + +--- + +## Adding a Custom Integration + +See [Adding an Integration](../developer-guide/adding-integrations.md) in the developer guide. diff --git a/docs/user-guide/interviews.md b/docs/user-guide/interviews.md new file mode 100644 index 0000000..58512fe --- /dev/null +++ b/docs/user-guide/interviews.md @@ -0,0 +1,96 @@ +# Interviews + +The Interviews page is a kanban board that tracks your progress through the interview pipeline after you have applied to a job. + +--- + +## Kanban Stages + +Jobs move left to right through the pipeline: + +``` +applied → phone_screen → interviewing → offer → hired + ↓ + (any stage) → rejected +``` + +| Stage | Description | +|-------|-------------| +| `applied` | Pre-kanban holding area — job applied to but no response yet | +| `phone_screen` | Initial recruiter/HR screen scheduled or completed | +| `interviewing` | Active interview loop (first-round, technical, panel, etc.) | +| `offer` | Offer received; evaluating | +| `hired` | Offer accepted | +| `rejected` | Declined or ghosted at any stage (captures `rejection_stage`) | + +--- + +## Moving Jobs Between Stages + +Drag a job card to the target column, or use the stage-advance button on each card. Moving a job to `phone_screen` triggers an automatic company research task (see below). + +--- + +## Company Research (Auto-trigger) + +When a job moves to `phone_screen`, Peregrine automatically queues a **company research** background task (`scripts/company_research.py`). The research brief is generated in three phases: + +1. 
**SearXNG web scrape** — queries the SearXNG meta-search engine (running locally on port 8888) for company information from public sources +2. **SearXNG news snippets** — fetches recent news about the company +3. **LLM synthesis** — combines the scraped content into a structured brief + +The brief includes: +- Company overview (mission, size, funding stage) +- CEO / leadership summary +- Talking points tailored to your role +- Optional: Inclusion and Accessibility section (ADA signals, WCAG, ERGs) +- Optional: LGBTQIA+ inclusion section (non-discrimination policies, culture signals) + +The two optional sections are controlled by the `candidate_accessibility_focus` and `candidate_lgbtq_focus` booleans in `config/user.yaml`, respectively. They are for personal decision-making only and are never included in applications. + +--- + +## Interview Prep Page + +Navigate to page **6 — Interview Prep** for a job in the `phone_screen` or `interviewing` stage. This page provides: + +- The full company research brief (generated automatically when the job moved to `phone_screen`) +- A live reference sheet you can keep open during a call +- **Practice Q&A** — a back-and-forth interview simulation powered by your LLM backend + +!!! note "Tier requirement" + Interview prep is a Paid feature. See [Tier System](../reference/tier-system.md). + +--- + +## Survey Assistant + +When a job moves to the `survey` stage — an optional pre-stage that comes before `phone_screen` — via the "Survey" button on an applied job, the Survey Assistant page (page 7) becomes active for that job. It helps you complete culture-fit surveys by: + +- Accepting pasted survey text +- Accepting screenshot uploads (analysed by the Moondream2 vision service) +- Generating suggested answers via your configured LLM backend + +After completing the survey, move the job to `phone_screen` to continue the pipeline. + +!!! note "Tier requirement" + Survey assistant is a Paid feature. 
+ +--- + +## Rejection Tracking + +When you reject a job from the kanban (at any stage), Peregrine captures the `rejection_stage` — the stage at which the rejection occurred. This data is available for pipeline analytics. + +--- + +## Email-Driven Stage Updates + +If email sync is configured (see [Email Sync](email-sync.md)), Peregrine can automatically advance jobs based on incoming email: + +| Email classification | Stage action | +|---------------------|-------------| +| `interview_request` | Moves job toward `phone_screen` if still `applied` | +| `rejection` | Moves job to `rejected` (captures `rejection_stage`) | +| `offer` | Flags job for review; moves toward `offer` | +| `survey_received` | Moves job to `survey` stage | diff --git a/docs/user-guide/job-discovery.md b/docs/user-guide/job-discovery.md new file mode 100644 index 0000000..1a6fd89 --- /dev/null +++ b/docs/user-guide/job-discovery.md @@ -0,0 +1,123 @@ +# Job Discovery + +Peregrine discovers new job listings by running search profiles against multiple job boards in a single discovery run. Results are deduplicated by URL and stored in the local SQLite database (`staging.db`). + +--- + +## How Discovery Works + +1. **Search profiles** in `config/search_profiles.yaml` define what to search for +2. The Home page **Run Discovery** button triggers `scripts/discover.py` +3. `discover.py` calls each configured board (standard + custom) for each active profile +4. Results are inserted into the `jobs` table with status `pending` +5. Jobs with URLs already in the database are silently skipped (URL is the unique key) +6. After insertion, `scripts/match.py` runs keyword scoring on all new jobs + +--- + +## Search Profiles + +Profiles are defined in `config/search_profiles.yaml`. You can keep multiple profiles active at once; each discovery run processes every active profile. 
+ +### Profile fields + +```yaml +profiles: + - name: cs_leadership # unique identifier + titles: + - Customer Success Manager + - Director of Customer Success + locations: + - Remote + - San Francisco Bay Area, CA + boards: + - linkedin + - indeed + - glassdoor + - zip_recruiter + - google + custom_boards: + - adzuna + - theladders + - craigslist + exclude_keywords: # titles containing these words are dropped + - sales + - account executive + - SDR + results_per_board: 75 # max jobs per board per run + hours_old: 240 # only fetch jobs posted in last N hours + mission_tags: # optional — triggers mission-alignment cover letter hints + - music +``` + +### Adding a new profile + +Open `config/search_profiles.yaml` and add an entry under `profiles:`. The next discovery run picks it up automatically — no restart required. + +### Mission tags + +`mission_tags` links a profile to industries you care about. When cover letters are generated for jobs from a mission-tagged profile, the LLM prompt includes a personal alignment note (configured in `config/user.yaml` under `mission_preferences`). Supported tags: `music`, `animal_welfare`, `education`. + +--- + +## Standard Job Boards + +These boards are powered by the [JobSpy](https://github.com/Bunsly/JobSpy) library: + +| Board key | Source | +|-----------|--------| +| `linkedin` | LinkedIn Jobs | +| `indeed` | Indeed | +| `glassdoor` | Glassdoor | +| `zip_recruiter` | ZipRecruiter | +| `google` | Google Jobs | + +--- + +## Custom Job Board Scrapers + +Custom scrapers are in `scripts/custom_boards/`. They are registered in `discover.py` and activated per-profile via the `custom_boards` list. 
+ +| Key | Source | Notes | +|-----|--------|-------| +| `adzuna` | [Adzuna Jobs API](https://developer.adzuna.com/) | Requires `config/adzuna.yaml` with `app_id` and `app_key` | +| `theladders` | The Ladders | SSR scraper via `curl_cffi`; no credentials needed | +| `craigslist` | Craigslist | Requires `config/craigslist.yaml` with target city slugs | + +To add your own scraper, see [Adding a Scraper](../developer-guide/adding-scrapers.md). + +--- + +## Running Discovery + +### From the UI + +1. Open the **Home** page +2. Click **Run Discovery** +3. Peregrine runs all active search profiles in sequence +4. A progress bar shows board-by-board status +5. A summary shows how many new jobs were inserted vs. already known + +### From the command line + +```bash +conda run -n job-seeker python scripts/discover.py +``` + +--- + +## Filling Missing Descriptions + +Some boards (particularly Glassdoor) return only a short description snippet. Click **Fill Missing Descriptions** on the Home page to trigger the `enrich_descriptions` background task. + +The enricher visits each job URL and attempts to extract the full description from the page HTML. This runs as a background task so you can continue using the UI. + +You can also enrich a specific job from the Job Review page by clicking the refresh icon next to its description. + +--- + +## Keyword Matching + +After discovery, `scripts/match.py` scores each new job by comparing the job description against your resume keywords (from `config/resume_keywords.yaml`). The score is stored as `match_score` (0–100). Gaps are stored as `keyword_gaps` (comma-separated missing keywords). + +Both fields appear in the Job Review queue and can be used to sort and prioritise jobs. 
diff --git a/docs/user-guide/job-review.md b/docs/user-guide/job-review.md new file mode 100644 index 0000000..f58bcdb --- /dev/null +++ b/docs/user-guide/job-review.md @@ -0,0 +1,70 @@ +# Job Review + +The Job Review page is where you approve or reject newly discovered jobs before they enter the application pipeline. + +--- + +## The Pending Queue + +All jobs with status `pending` appear in the review queue. Jobs with email leads (matching email threads already in the `job_contacts` table) are sorted to the top of the queue automatically. + +--- + +## Sorting Options + +Use the sort control at the top of the page to order the queue: + +| Sort option | Description | +|-------------|-------------| +| **Match score (high to low)** | Jobs with the strongest keyword match appear first | +| **Match score (low to high)** | Useful for finding niche roles that scored low but are still interesting | +| **Date found (newest)** | Most recently discovered jobs first | +| **Date found (oldest)** | Oldest jobs first (useful for clearing a backlog) | +| **Company (A-Z)** | Alphabetical by company name | + +--- + +## Match Score and Keyword Gaps + +Each job card shows: + +- **Match score** (0–100) — percentage of your resume keywords found in the job description +- **Keyword gaps** — specific keywords from your profile that the job description is missing + +A high match score does not guarantee a good fit; use it as a signal to prioritise your review, not as a final filter. + +--- + +## Reviewing Jobs + +For each job in the queue you can: + +- **Approve** — moves the job to `approved` status, making it available in the Apply Workspace +- **Reject** — moves the job to `rejected` status and removes it from the queue +- **Skip** — leaves the job in `pending` for a later review session + +### Batch actions + +Use the checkboxes to select multiple jobs at once, then click **Approve selected** or **Reject selected** to process them in bulk. 
+ +--- + +## Job Detail View + +Click a job title to expand the full detail view, which shows: + +- Full job description +- Company name and location +- Source board and original URL +- Salary (if available) +- Remote/on-site status +- Match score and keyword gaps +- Any email threads already linked to this job + +--- + +## After Approval + +Approved jobs appear in the **Apply Workspace** (page 4). From there you can generate a cover letter, export a PDF, and mark the job as applied. + +If you decide not to apply after approving, you can reject the listing from within the Apply Workspace without losing your cover letter draft. diff --git a/docs/user-guide/settings.md b/docs/user-guide/settings.md new file mode 100644 index 0000000..23ab8eb --- /dev/null +++ b/docs/user-guide/settings.md @@ -0,0 +1,152 @@ +# Settings + +The Settings page is accessible from the sidebar. It contains all configuration for Peregrine, organised into tabs. + +--- + +## My Profile + +Personal information used in cover letters, research briefs, and interview prep. + +| Field | Description | +|-------|-------------| +| Name | Your full name | +| Email | Contact email address | +| Phone | Contact phone number | +| LinkedIn | LinkedIn profile URL | +| Career summary | 2–4 sentence professional summary | +| NDA companies | Companies you cannot mention in research briefs (previous employers under NDA) | +| Docs directory | Where PDFs and exported documents are saved (default: `~/Documents/JobSearch`) | + +### Mission Preferences + +Optional notes about industries you genuinely care about. When the cover letter generator detects alignment with one of these industries, it injects your note into paragraph 3 of the cover letter. 
+ +| Field | Tag | Example | +|-------|-----|---------| +| Music industry note | `music` | "I've played in bands for 15 years and care deeply about how artists get paid" | +| Animal welfare note | `animal_welfare` | "I volunteer at my local shelter every weekend" | +| Education note | `education` | "I tutored underserved kids and care deeply about literacy" | + +Leave a field blank to use a generic default when alignment is detected. + +### Research Brief Preferences + +Controls optional sections in company research briefs. Both are for personal decision-making only and are never included in applications. + +| Setting | Section added | +|---------|--------------| +| Candidate accessibility focus | Disability inclusion and accessibility signals (ADA, ERGs, WCAG) | +| Candidate LGBTQIA+ focus | LGBTQIA+ inclusion signals (ERGs, non-discrimination policies, culture) | + +--- + +## Search + +Manage search profiles. Equivalent to editing `config/search_profiles.yaml` directly, but with a form UI. + +- Add, edit, and delete profiles +- Configure titles, locations, boards, custom boards, exclude keywords, and mission tags +- Changes are saved to `config/search_profiles.yaml` + +--- + +## LLM Backends + +Configure which LLM backends Peregrine uses and in what order. 
+ +| Setting | Description | +|---------|-------------| +| Enabled toggle | Whether a backend is considered in the fallback chain | +| Base URL | API endpoint (for `openai_compat` backends) | +| Model | Model name or `__auto__` (vLLM auto-detects the loaded model) | +| API key | API key if required | +| Test button | Sends a short ping to verify the backend is reachable | + +### Fallback chains + +Three independent fallback chains are configured: + +| Chain | Used for | +|-------|---------| +| `fallback_order` | Cover letter generation and general tasks | +| `research_fallback_order` | Company research briefs | +| `vision_fallback_order` | Survey screenshot analysis | + +--- + +## Notion + +Configure Notion integration credentials. Requires: +- Notion integration token (from [notion.so/my-integrations](https://www.notion.so/my-integrations)) +- Database ID (from the Notion database URL) + +The field map controls which Notion properties correspond to which Peregrine fields. Edit `config/notion.yaml` directly for advanced field mapping. + +--- + +## Services + +Connection settings for local services: + +| Service | Default host:port | +|---------|-----------------| +| Ollama | localhost:11434 | +| vLLM | localhost:8000 | +| SearXNG | localhost:8888 | + +Each service has SSL and SSL-verify toggles for reverse-proxy setups. + +--- + +## Resume Profile + +Edit your parsed resume data (work experience, education, skills, certifications). This is the same data extracted during the first-run wizard Resume step. + +Changes here affect all future cover letter generations. + +--- + +## Email + +Configure IMAP email sync. See [Email Sync](email-sync.md) for full setup instructions. + +--- + +## Skills + +Manage your `config/resume_keywords.yaml` — the list of skills and keywords used for match scoring. + +Add or remove keywords. Higher-weighted keywords count more toward the match score. + +--- + +## Integrations + +Connection cards for all 13 integrations. 
See [Integrations](integrations.md) for per-service details. + +--- + +## Fine-Tune + +**Tier: Premium** + +Tools for fine-tuning a cover letter model on your personal writing style. + +- Export cover letter training data as JSONL +- Configure training parameters (rank, epochs, learning rate) +- Start a fine-tuning run (requires `ogma` conda environment with Unsloth) +- Register the output model with Ollama + +--- + +## Developer + +Developer and debugging tools. + +| Option | Description | +|--------|-------------| +| Reset wizard | Sets `wizard_complete: false` and `wizard_step: 0`; resumes at step 1 on next page load | +| Dev tier override | Set `dev_tier_override` to `paid` or `premium` to test tier-gated features locally | +| Clear stuck tasks | Manually sets any `running` or `queued` background tasks to `failed` (also runs on app startup) | +| View raw config | Shows the current `config/user.yaml` contents | diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..b908b75 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,67 @@ +site_name: Peregrine +site_description: AI-powered job search pipeline +site_author: Circuit Forge LLC +site_url: https://docs.circuitforge.io/peregrine +repo_url: https://git.circuitforge.io/circuitforge/peregrine +repo_name: circuitforge/peregrine + +theme: + name: material + palette: + - scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.tabbed: + alternate_style: true + - tables + - toc: + permalink: true + +nav: + - Home: index.md + - 
Getting Started: + - Installation: getting-started/installation.md + - First-Run Wizard: getting-started/first-run-wizard.md + - Docker Profiles: getting-started/docker-profiles.md + - User Guide: + - Job Discovery: user-guide/job-discovery.md + - Job Review: user-guide/job-review.md + - Apply Workspace: user-guide/apply-workspace.md + - Interviews: user-guide/interviews.md + - Email Sync: user-guide/email-sync.md + - Integrations: user-guide/integrations.md + - Settings: user-guide/settings.md + - Developer Guide: + - Contributing: developer-guide/contributing.md + - Architecture: developer-guide/architecture.md + - Adding a Scraper: developer-guide/adding-scrapers.md + - Adding an Integration: developer-guide/adding-integrations.md + - Testing: developer-guide/testing.md + - Reference: + - Tier System: reference/tier-system.md + - LLM Router: reference/llm-router.md + - Config Files: reference/config-files.md