Compare commits
No commits in common. "v0.2.2" and "main" have entirely different histories.
284 changed files with 42227 additions and 18705 deletions
16
.env.e2e.example
Normal file
16
.env.e2e.example
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Peregrine E2E test harness credentials
|
||||
# Copy to .env.e2e and fill in real values — .env.e2e is gitignored
|
||||
|
||||
HEIMDALL_ADMIN_TOKEN=changeme
|
||||
HEIMDALL_URL=http://localhost:8900
|
||||
|
||||
# Cloud auth — Strategy A (preferred): Directus user/pass → fresh JWT per run
|
||||
E2E_DIRECTUS_EMAIL=e2e@circuitforge.tech
|
||||
E2E_DIRECTUS_PASSWORD=changeme
|
||||
E2E_DIRECTUS_URL=http://172.31.0.2:8055
|
||||
|
||||
# Cloud auth — Strategy B (fallback): persistent JWT (uncomment to use)
|
||||
# E2E_DIRECTUS_JWT=changeme
|
||||
|
||||
E2E_HEADLESS=true
|
||||
E2E_SLOW_MO=0
|
||||
37
.env.example
37
.env.example
|
|
@ -12,11 +12,44 @@ VISION_REVISION=2025-01-09
|
|||
|
||||
DOCS_DIR=~/Documents/JobSearch
|
||||
OLLAMA_MODELS_DIR=~/models/ollama
|
||||
VLLM_MODELS_DIR=~/models/vllm
|
||||
VLLM_MODEL=Ouro-1.4B
|
||||
VLLM_MODELS_DIR=~/models/vllm # override with full path to your model dir
|
||||
VLLM_MODEL=Ouro-1.4B # cover letters — fast 1.4B model
|
||||
VLLM_RESEARCH_MODEL=Ouro-2.6B-Thinking # research — reasoning 2.6B model; restart vllm to switch
|
||||
VLLM_MAX_MODEL_LEN=4096 # increase to 8192 for Thinking models with long CoT
|
||||
VLLM_GPU_MEM_UTIL=0.75 # lower to 0.6 if sharing GPU with other services
|
||||
OLLAMA_DEFAULT_MODEL=llama3.2:3b
|
||||
|
||||
# ── LLM env-var auto-config (alternative to config/llm.yaml) ─────────────────
|
||||
# Set any of these to configure LLM backends without needing a config/llm.yaml.
|
||||
# Priority: Anthropic > OpenAI-compat > Ollama (always tried as local fallback).
|
||||
OLLAMA_HOST=http://localhost:11434 # Ollama host; override if on a different machine
|
||||
OLLAMA_MODEL=llama3.2:3b # model to request from Ollama
|
||||
OPENAI_MODEL=gpt-4o-mini # model override for OpenAI-compat backend
|
||||
ANTHROPIC_MODEL=claude-haiku-4-5-20251001 # model override for Anthropic backend
|
||||
|
||||
# API keys (required for remote profile)
|
||||
ANTHROPIC_API_KEY=
|
||||
OPENAI_COMPAT_URL=
|
||||
OPENAI_COMPAT_KEY=
|
||||
|
||||
# Feedback button — Forgejo issue filing
|
||||
FORGEJO_API_TOKEN=
|
||||
FORGEJO_REPO=pyr0ball/peregrine
|
||||
FORGEJO_API_URL=https://git.opensourcesolarpunk.com/api/v1
|
||||
# GITHUB_TOKEN= # future — enable when public mirror is active
|
||||
# GITHUB_REPO= # future
|
||||
|
||||
# ── CF-hosted coordinator (Paid+ tier) ───────────────────────────────────────
|
||||
# Set CF_LICENSE_KEY to authenticate with the hosted coordinator.
|
||||
# Leave both blank for local self-hosted cf-orch or bare-metal inference.
|
||||
CF_LICENSE_KEY=
|
||||
CF_ORCH_URL=https://orch.circuitforge.tech
|
||||
|
||||
# Cloud multi-tenancy (compose.cloud.yml only — do not set for local installs)
|
||||
CLOUD_MODE=false
|
||||
CLOUD_DATA_ROOT=/devl/menagerie-data
|
||||
DIRECTUS_JWT_SECRET= # must match website/.env DIRECTUS_SECRET value
|
||||
CF_SERVER_SECRET= # random 64-char hex — generate: openssl rand -hex 32
|
||||
PLATFORM_DB_URL=postgresql://cf_platform:<password>@host.docker.internal:5433/circuitforge_platform
|
||||
HEIMDALL_URL=http://cf-license:8000 # internal Docker URL; override for external access
|
||||
HEIMDALL_ADMIN_TOKEN= # must match ADMIN_TOKEN in circuitforge-license .env
|
||||
|
|
|
|||
30
.gitea/ISSUE_TEMPLATE/bug_report.md
Normal file
30
.gitea/ISSUE_TEMPLATE/bug_report.md
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
---
|
||||
name: Bug report
|
||||
about: Something isn't working correctly
|
||||
labels: bug
|
||||
---
|
||||
|
||||
## Describe the bug
|
||||
|
||||
<!-- A clear description of what went wrong. -->
|
||||
|
||||
## Steps to reproduce
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
## Expected behaviour
|
||||
|
||||
## Actual behaviour
|
||||
|
||||
<!-- Paste relevant log output below (redact any API keys or personal info): -->
|
||||
|
||||
```
|
||||
|
||||
## Environment
|
||||
|
||||
- Peregrine version: <!-- output of `./manage.sh status` or git tag -->
|
||||
- OS:
|
||||
- Runtime: Docker / conda-direct
|
||||
- GPU profile: remote / cpu / single-gpu / dual-gpu
|
||||
26
.gitea/ISSUE_TEMPLATE/feature_request.md
Normal file
26
.gitea/ISSUE_TEMPLATE/feature_request.md
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
---
|
||||
name: Feature request
|
||||
about: Suggest an improvement or new capability
|
||||
labels: enhancement
|
||||
---
|
||||
|
||||
## Problem statement
|
||||
|
||||
<!-- What are you trying to do that's currently hard or impossible? -->
|
||||
|
||||
## Proposed solution
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
## Which tier would this belong to?
|
||||
|
||||
- [ ] Free
|
||||
- [ ] Paid
|
||||
- [ ] Premium
|
||||
- [ ] Ultra (human-in-the-loop)
|
||||
- [ ] Not sure
|
||||
|
||||
## Would you be willing to contribute a PR?
|
||||
|
||||
- [ ] Yes
|
||||
- [ ] No
|
||||
32
.githooks/commit-msg
Executable file
32
.githooks/commit-msg
Executable file
|
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env bash
|
||||
# .githooks/commit-msg — enforces conventional commit format
|
||||
# Format: type: description OR type(scope): description
|
||||
set -euo pipefail
|
||||
|
||||
RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
|
||||
|
||||
VALID_TYPES="feat|fix|docs|chore|test|refactor|perf|ci|build"
|
||||
MSG_FILE="$1"
|
||||
MSG=$(head -1 "$MSG_FILE")
|
||||
|
||||
if [[ -z "${MSG// }" ]]; then
|
||||
echo -e "${RED}Commit rejected:${NC} Commit message is empty."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! echo "$MSG" | grep -qE "^($VALID_TYPES)(\(.+\))?: .+"; then
|
||||
echo -e "${RED}Commit rejected:${NC} Message does not follow conventional commit format."
|
||||
echo ""
|
||||
echo -e " Required: ${YELLOW}type: description${NC} or ${YELLOW}type(scope): description${NC}"
|
||||
echo -e " Valid types: ${YELLOW}$VALID_TYPES${NC}"
|
||||
echo ""
|
||||
echo -e " Your message: ${YELLOW}$MSG${NC}"
|
||||
echo ""
|
||||
echo -e " Examples:"
|
||||
echo -e " ${YELLOW}feat: add cover letter refinement${NC}"
|
||||
echo -e " ${YELLOW}fix(wizard): handle missing user.yaml gracefully${NC}"
|
||||
echo -e " ${YELLOW}docs: update tier system reference${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
84
.githooks/pre-commit
Executable file
84
.githooks/pre-commit
Executable file
|
|
@ -0,0 +1,84 @@
|
|||
#!/usr/bin/env bash
|
||||
# .githooks/pre-commit — blocks sensitive files and credential patterns from being committed
|
||||
set -euo pipefail
|
||||
|
||||
RED='\033[0;31m'; YELLOW='\033[1;33m'; BOLD='\033[1m'; NC='\033[0m'
|
||||
|
||||
BLOCKED=0
|
||||
STAGED=$(git diff --cached --name-only --diff-filter=ACM 2>/dev/null)
|
||||
|
||||
if [[ -z "$STAGED" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Blocked filenames ──────────────────────────────────────────────────────────
|
||||
BLOCKED_FILES=(
|
||||
".env"
|
||||
".env.local"
|
||||
".env.production"
|
||||
".env.staging"
|
||||
"*.pem"
|
||||
"*.key"
|
||||
"*.p12"
|
||||
"*.pfx"
|
||||
"id_rsa"
|
||||
"id_ecdsa"
|
||||
"id_ed25519"
|
||||
"id_dsa"
|
||||
"*.ppk"
|
||||
"secrets.yml"
|
||||
"secrets.yaml"
|
||||
"credentials.json"
|
||||
"service-account*.json"
|
||||
"*.keystore"
|
||||
"htpasswd"
|
||||
".htpasswd"
|
||||
)
|
||||
|
||||
while IFS= read -r file; do
|
||||
filename="$(basename "$file")"
|
||||
for pattern in "${BLOCKED_FILES[@]}"; do
|
||||
# shellcheck disable=SC2254
|
||||
case "$filename" in
|
||||
$pattern)
|
||||
echo -e "${RED}BLOCKED:${NC} ${BOLD}$file${NC} matches blocked filename pattern '${YELLOW}$pattern${NC}'"
|
||||
BLOCKED=1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
done <<< "$STAGED"
|
||||
|
||||
# ── Blocked content patterns ───────────────────────────────────────────────────
|
||||
declare -A CONTENT_PATTERNS=(
|
||||
["RSA/EC private key header"]="-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY"
|
||||
["AWS access key"]="AKIA[0-9A-Z]{16}"
|
||||
["GitHub token"]="ghp_[A-Za-z0-9]{36}"
|
||||
["Generic API key assignment"]="(api_key|API_KEY|secret_key|SECRET_KEY)\s*=\s*['\"][A-Za-z0-9_\-]{16,}"
|
||||
["Stripe secret key"]="sk_(live|test)_[A-Za-z0-9]{24,}"
|
||||
["Forgejo/Gitea token (40 hex chars)"]="[a-f0-9]{40}"
|
||||
)
|
||||
|
||||
while IFS= read -r file; do
|
||||
# Skip binary files
|
||||
if git diff --cached -- "$file" | grep -qP "^\+.*\x00"; then
|
||||
continue
|
||||
fi
|
||||
for label in "${!CONTENT_PATTERNS[@]}"; do
|
||||
pattern="${CONTENT_PATTERNS[$label]}"
|
||||
matches=$(git diff --cached -- "$file" | grep "^+" | grep -cP "$pattern" 2>/dev/null || true)
|
||||
if [[ "$matches" -gt 0 ]]; then
|
||||
echo -e "${RED}BLOCKED:${NC} ${BOLD}$file${NC} contains pattern matching '${YELLOW}$label${NC}'"
|
||||
BLOCKED=1
|
||||
fi
|
||||
done
|
||||
done <<< "$STAGED"
|
||||
|
||||
# ── Result ─────────────────────────────────────────────────────────────────────
|
||||
if [[ "$BLOCKED" -eq 1 ]]; then
|
||||
echo ""
|
||||
echo -e "${RED}Commit rejected.${NC} Remove sensitive files/content before committing."
|
||||
echo -e "To bypass in an emergency: ${YELLOW}git commit --no-verify${NC} (use with extreme caution)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
30
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
30
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
---
|
||||
name: Bug report
|
||||
about: Something isn't working correctly
|
||||
labels: bug
|
||||
---
|
||||
|
||||
## Describe the bug
|
||||
|
||||
<!-- A clear description of what went wrong. -->
|
||||
|
||||
## Steps to reproduce
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
## Expected behaviour
|
||||
|
||||
## Actual behaviour
|
||||
|
||||
<!-- Paste relevant log output below (redact any API keys or personal info): -->
|
||||
|
||||
```
|
||||
|
||||
## Environment
|
||||
|
||||
- Peregrine version: <!-- output of `./manage.sh status` or git tag -->
|
||||
- OS:
|
||||
- Runtime: Docker / conda-direct
|
||||
- GPU profile: remote / cpu / single-gpu / dual-gpu
|
||||
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Security vulnerability
|
||||
url: mailto:security@circuitforge.tech
|
||||
about: Do not open a public issue for security vulnerabilities. Email us instead.
|
||||
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
26
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
---
|
||||
name: Feature request
|
||||
about: Suggest an improvement or new capability
|
||||
labels: enhancement
|
||||
---
|
||||
|
||||
## Problem statement
|
||||
|
||||
<!-- What are you trying to do that's currently hard or impossible? -->
|
||||
|
||||
## Proposed solution
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
## Which tier would this belong to?
|
||||
|
||||
- [ ] Free
|
||||
- [ ] Paid
|
||||
- [ ] Premium
|
||||
- [ ] Ultra (human-in-the-loop)
|
||||
- [ ] Not sure
|
||||
|
||||
## Would you be willing to contribute a PR?
|
||||
|
||||
- [ ] Yes
|
||||
- [ ] No
|
||||
26
.github/ISSUE_TEMPLATE/support_request.md
vendored
Normal file
26
.github/ISSUE_TEMPLATE/support_request.md
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
---
|
||||
name: Support Request
|
||||
about: Ask a question or get help using Peregrine
|
||||
title: '[Support] '
|
||||
labels: question
|
||||
assignees: ''
|
||||
---
|
||||
|
||||
## What are you trying to do?
|
||||
|
||||
<!-- Describe what you're trying to accomplish -->
|
||||
|
||||
## What have you tried?
|
||||
|
||||
<!-- Steps you've already taken, docs you've read, etc. -->
|
||||
|
||||
## Environment
|
||||
|
||||
- OS: <!-- e.g. Ubuntu 22.04, macOS 14 -->
|
||||
- Install method: <!-- Docker / Podman / source -->
|
||||
- Peregrine version: <!-- run `./manage.sh status` or check the UI footer -->
|
||||
- LLM backend: <!-- Ollama / vLLM / OpenAI / other -->
|
||||
|
||||
## Logs or screenshots
|
||||
|
||||
<!-- Paste relevant output from `./manage.sh logs` or attach a screenshot -->
|
||||
27
.github/pull_request_template.md
vendored
Normal file
27
.github/pull_request_template.md
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
## Summary
|
||||
|
||||
<!-- What does this PR do? -->
|
||||
|
||||
## Related issue(s)
|
||||
|
||||
Closes #
|
||||
|
||||
## Type of change
|
||||
|
||||
- [ ] feat — new feature
|
||||
- [ ] fix — bug fix
|
||||
- [ ] docs — documentation only
|
||||
- [ ] chore — tooling, deps, refactor
|
||||
- [ ] test — test coverage
|
||||
|
||||
## Testing
|
||||
|
||||
<!-- What did you run to verify this works? -->
|
||||
|
||||
```bash
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
## CLA
|
||||
|
||||
- [ ] I agree that my contribution is licensed under the project's [BSL 1.1](./LICENSE-BSL) terms.
|
||||
35
.github/workflows/ci.yml
vendored
Normal file
35
.github/workflows/ci.yml
vendored
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install system dependencies
|
||||
run: sudo apt-get update -q && sudo apt-get install -y libsqlcipher-dev
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: pip
|
||||
|
||||
- name: Configure git credentials for Forgejo
|
||||
env:
|
||||
FORGEJO_TOKEN: ${{ secrets.FORGEJO_TOKEN }}
|
||||
run: |
|
||||
git config --global url."https://oauth2:${FORGEJO_TOKEN}@git.opensourcesolarpunk.com/".insteadOf "https://git.opensourcesolarpunk.com/"
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install -r requirements.txt
|
||||
|
||||
- name: Run tests
|
||||
run: pytest tests/ -v --tb=short
|
||||
22
.gitignore
vendored
22
.gitignore
vendored
|
|
@ -35,3 +35,25 @@ config/user.yaml.working
|
|||
|
||||
# Claude context files — kept out of version control
|
||||
CLAUDE.md
|
||||
.superpowers/
|
||||
pytest-output.txt
|
||||
docs/superpowers/
|
||||
|
||||
data/email_score.jsonl
|
||||
data/email_label_queue.jsonl
|
||||
data/email_compare_sample.jsonl
|
||||
|
||||
config/label_tool.yaml
|
||||
config/server.yaml
|
||||
|
||||
demo/data/*.db
|
||||
demo/seed_demo.py
|
||||
|
||||
# Git worktrees
|
||||
.worktrees/
|
||||
.env.e2e
|
||||
|
||||
# E2E test result artifacts
|
||||
tests/e2e/results/demo/
|
||||
tests/e2e/results/cloud/
|
||||
tests/e2e/results/local/
|
||||
|
|
|
|||
32
.gitleaks.toml
Normal file
32
.gitleaks.toml
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# peregrine/.gitleaks.toml — per-repo allowlists extending the shared base config
|
||||
[extend]
|
||||
path = "/Library/Development/CircuitForge/circuitforge-hooks/gitleaks.toml"
|
||||
|
||||
[allowlist]
|
||||
description = "Peregrine-specific allowlists"
|
||||
paths = [
|
||||
'docs/plans/.*', # plan docs contain example tokens and placeholders
|
||||
'docs/reference/.*', # reference docs (globally excluded in base config)
|
||||
'tests/.*', # test fixtures use fake phone numbers as job IDs
|
||||
'scripts/integrations/apple_calendar\.py', # you@icloud.com is a placeholder comment
|
||||
# Streamlit app files: key= params are widget identifiers, not secrets
|
||||
'app/feedback\.py',
|
||||
'app/pages/2_Settings\.py',
|
||||
'app/pages/7_Survey\.py',
|
||||
# SearXNG default config: change-me-in-production is a well-known public placeholder
|
||||
'docker/searxng/settings\.yml',
|
||||
]
|
||||
regexes = [
|
||||
# Job listing numeric IDs (look like phone numbers to the phone rule)
|
||||
'\d{10}\.html', # Craigslist listing IDs
|
||||
'\d{10}\/', # LinkedIn job IDs in URLs
|
||||
# Localhost port patterns (look like phone numbers)
|
||||
'localhost:\d{4,5}',
|
||||
# Unix epoch timestamps in the 2025–2026 range (10-digit, look like phone numbers)
|
||||
'174\d{7}',
|
||||
# Example / placeholder license key patterns
|
||||
'CFG-[A-Z]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}',
|
||||
# Phone number false positives: 555 area code variants not caught by base allowlist
|
||||
'555\) \d{3}-\d{4}',
|
||||
'555-\d{3}-\d{4}',
|
||||
]
|
||||
365
CHANGELOG.md
365
CHANGELOG.md
|
|
@ -7,8 +7,371 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
---
|
||||
|
||||
## [0.8.5] — 2026-04-02
|
||||
|
||||
### Added
|
||||
- Cover letter iterative refinement: "Refine with Feedback" expander in Apply Workspace; `generate()` accepts `previous_result`/`feedback`; task params passed through `submit_task`
|
||||
|
||||
- **Vue onboarding wizard** — 7-step first-run setup replaces the Streamlit wizard
|
||||
in the Vue SPA: Hardware detection → Tier → Resume upload/build → Identity →
|
||||
Inference & API keys → Search preferences → Integrations. Progress saves to
|
||||
`user.yaml` on every step; crash-recovery resumes from the last completed step.
|
||||
- **Wizard API endpoints** — `GET /api/wizard/status`, `POST /api/wizard/step`,
|
||||
`GET /api/wizard/hardware`, `POST /api/wizard/inference/test`,
|
||||
`POST /api/wizard/complete`. Inference test always soft-fails so Ollama being
|
||||
unreachable never blocks setup completion.
|
||||
- **Cloud auto-skip** — cloud instances automatically complete steps 1 (hardware),
|
||||
2 (tier), and 5 (inference) and drop the user directly on the Resume step.
|
||||
- **`wizardGuard` router gate** — all Vue routes require wizard completion; completed
|
||||
users are bounced away from `/setup` to `/`.
|
||||
- **Chip-input search step** — job titles and locations entered as press-Enter/comma
|
||||
chips; validates at least one title before advancing.
|
||||
- **Integrations tile grid** — optional step 7 shows Notion, Calendar, Slack, Discord,
|
||||
Drive with paid-tier badges; skippable on Finish.
|
||||
|
||||
### Fixed
|
||||
|
||||
- **User config isolation: dangerous fallback removed** — `_user_yaml_path()` fell
|
||||
back to `/devl/job-seeker/config/user.yaml` (legacy profile) when `user.yaml`
|
||||
didn't exist at the expected path; new users now get an empty dict instead of
|
||||
another user's data. Affects profile, resume, search, and all wizard endpoints.
|
||||
- **Resume path not user-isolated** — `RESUME_PATH = Path("config/plain_text_resume.yaml")`
|
||||
was a relative CWD path shared across all users. Replaced with `_resume_path()`
|
||||
derived from `_user_yaml_path()` / `STAGING_DB`.
|
||||
- **Resume upload silently returned empty data** — `upload_resume` was passing a
|
||||
file path string to `structure_resume()` which expects raw text; now reads bytes
|
||||
and dispatches to the correct extractor (`extract_text_from_pdf` / `_docx` / `_odt`).
|
||||
- **Wizard resume step read wrong envelope field** — `WizardResumeStep.vue` read
|
||||
`data.experience` but the upload response wraps parsed data under `data.data`.
|
||||
|
||||
---
|
||||
|
||||
## [0.8.4] — 2026-04-02
|
||||
|
||||
### Fixed
|
||||
|
||||
- **Cloud: cover letter used wrong user's profile** — `generate_cover_letter.generate()`
|
||||
loaded `_profile` from the global `config/user.yaml` at module import time, so all
|
||||
cloud users got the default user's name, voice, and mission preferences in their
|
||||
generated letters. `generate()` now accepts a `user_yaml_path` parameter; `task_runner`
|
||||
derives it from the per-user config directory (`db_path/../config/user.yaml`) and
|
||||
passes it through. `_build_system_context`, `_build_mission_notes`, `detect_mission_alignment`,
|
||||
`build_prompt`, and `_trim_to_letter_end` all accept a `profile` override so the
|
||||
per-call profile is used end-to-end without breaking CLI mode.
|
||||
- **Apply Workspace: hardcoded config paths in cloud mode** — `4_Apply.py` was loading
|
||||
`_USER_YAML` and `RESUME_YAML` from the repo-root `config/` before `resolve_session()`
|
||||
ran, so cloud users saw the global (Meg's) resume in the Apply tab. Both paths now
|
||||
derive from `get_config_dir()` after session resolution.
|
||||
|
||||
### Changed
|
||||
|
||||
- **Vue SPA open to all tiers** — Vue 3 frontend is no longer gated behind the beta
|
||||
flag; all tier users can switch to the Vue UI from Settings.
|
||||
- **LLM model candidates** — vllm backend now tries Qwen2.5-3B first, Phi-4-mini
|
||||
as fallback (was reversed). cf_orch allocation block added to vllm config.
|
||||
- **Preflight** — removed `vllm` from Docker adoption list; vllm is now managed
|
||||
entirely by cf-orch and should not be stubbed by preflight.
|
||||
|
||||
---
|
||||
|
||||
## [0.8.3] — 2026-04-01
|
||||
|
||||
### Fixed
|
||||
- **CI: Forgejo auth** — GitHub Actions `pip install` was failing to fetch
|
||||
`circuitforge-core` from the private Forgejo VCS URL. Added `FORGEJO_TOKEN`
|
||||
repository secret and a `git config insteadOf` step to inject credentials
|
||||
before `pip install`.
|
||||
- **CI: settings API tests** — 6 `test_dev_api_settings` PUT/POST tests were
|
||||
returning HTTP 500 in CI because `_user_yaml_path()` read the module-level
|
||||
`DB_PATH` constant (frozen at import time), so `monkeypatch.setenv("STAGING_DB")`
|
||||
had no effect. Fixed by reading `os.environ` at call time.
|
||||
|
||||
---
|
||||
|
||||
## [0.8.2] — 2026-04-01
|
||||
|
||||
### Fixed
|
||||
- **CI pipeline** — `pip install -r requirements.txt` was failing in GitHub Actions
|
||||
because `-e ../circuitforge-core` requires a sibling directory that doesn't exist
|
||||
in a single-repo checkout. Replaced with a `git+https://` VCS URL fallback;
|
||||
`Dockerfile.cfcore` still installs from the local `COPY` to avoid redundant
|
||||
network fetches during Docker builds.
|
||||
- **Vue-nav reload loop** — `sync_ui_cookie()` was calling
|
||||
`window.parent.location.reload()` on every render when `user.yaml` has
|
||||
`ui_preference: vue` but no Caddy proxy is in the traffic path (test instances,
|
||||
bare Docker). Gated the reload on `PEREGRINE_CADDY_PROXY=1`; instances without
|
||||
the env var set the cookie silently and skip the reload.
|
||||
|
||||
### Changed
|
||||
- **cfcore VRAM lease integration** — the task scheduler now acquires a VRAM lease
|
||||
from the cf-orch coordinator before running a batch of LLM tasks and releases it
|
||||
when the batch completes. Visible in the coordinator dashboard at `:7700`.
|
||||
- **`CF_ORCH_URL` env var** — scheduler reads coordinator address from
|
||||
`CF_ORCH_URL` (default `http://localhost:7700`); set to
|
||||
`http://host.docker.internal:7700` in Docker compose files so containers can
|
||||
reach the host coordinator.
|
||||
- **All compose files on `Dockerfile.cfcore`** — `compose.yml`, `compose.cloud.yml`,
|
||||
and `compose.test-cfcore.yml` all use the parent-context build. `build: .` is
|
||||
removed from `compose.yml`.
|
||||
|
||||
---
|
||||
|
||||
## [0.8.1] — 2026-04-01
|
||||
|
||||
### Fixed
|
||||
- **Job title suggester silent failure** — when the LLM returned empty arrays or
|
||||
non-JSON text, the spinner would complete with zero UI feedback. Now shows an
|
||||
explicit "No new suggestions found" info message with a resume-upload hint for
|
||||
new users who haven't uploaded a resume yet.
|
||||
- **Suggester exception handling** — catch `Exception` instead of only
|
||||
`RuntimeError` so connection errors and `FileNotFoundError` (missing llm.yaml)
|
||||
surface as error messages rather than crashing the page silently.
|
||||
|
||||
### Added
|
||||
- **`Dockerfile.cfcore`** — parent-context Dockerfile that copies
|
||||
`circuitforge-core/` alongside `peregrine/` before `pip install`, resolving
|
||||
the `-e ../circuitforge-core` editable requirement inside Docker.
|
||||
- **`compose.test-cfcore.yml`** — single-user test instance on port 8516 for
|
||||
smoke-testing cfcore shim integration before promoting to the cloud instance.
|
||||
|
||||
---
|
||||
|
||||
## [0.8.0] — 2026-04-01
|
||||
|
||||
### Added
|
||||
- **ATS Resume Optimizer** (gap report free; LLM rewrite paid+)
|
||||
- `scripts/resume_optimizer.py` — full pipeline: TF-IDF gap extraction →
|
||||
`prioritize_gaps` → `rewrite_for_ats` → hallucination guard (anchor-set
|
||||
diffing on employers, institutions, and dates)
|
||||
- `scripts/db.py` — `optimized_resume` + `ats_gap_report` columns;
|
||||
`save_optimized_resume` / `get_optimized_resume` helpers
|
||||
- `GET /api/jobs/{id}/resume_optimizer` — fetch gap report + rewrite
|
||||
- `POST /api/jobs/{id}/resume_optimizer/generate` — queue rewrite task
|
||||
- `GET /api/jobs/{id}/resume_optimizer/task` — poll task status
|
||||
- `web/src/components/ResumeOptimizerPanel.vue` — gap report (all tiers),
|
||||
LLM rewrite section (paid+), hallucination warning badge, `.txt` download
|
||||
- `ResumeOptimizerPanel` integrated into `ApplyWorkspace`
|
||||
|
||||
- **Vue SPA full merge** (closes #8) — `feature/vue-spa` merged to `main`
|
||||
- `dev-api.py` — full FastAPI backend (settings, jobs, interviews, prep,
|
||||
survey, digest, resume optimizer); cloud session middleware (JWT → per-user
|
||||
SQLite); BYOK credential store
|
||||
- `dev_api.py` — symlink → `dev-api.py` for importable module alias
|
||||
- `scripts/job_ranker.py` — two-stage ranking for `/api/jobs/stack`
|
||||
- `scripts/credential_store.py` — per-user BYOK API key management
|
||||
- `scripts/user_profile.py` — `load_user_profile` / `save_user_profile`
|
||||
- `web/src/components/TaskIndicator.vue` + `web/src/stores/tasks.ts` —
|
||||
live background task queue display
|
||||
- `web/public/` — peregrine logo assets (SVG + PNG)
|
||||
|
||||
- **API test suite** — 5 new test modules (622 tests total)
|
||||
- `tests/test_dev_api_settings.py` (38 tests)
|
||||
- `tests/test_dev_api_interviews.py`, `test_dev_api_prep.py`,
|
||||
`test_dev_api_survey.py`, `test_dev_api_digest.py`
|
||||
|
||||
### Fixed
|
||||
- **Cloud DB routing** — `app/pages/1_Job_Review.py`, `5_Interviews.py`,
|
||||
`6_Interview_Prep.py`, `7_Survey.py` were hardcoding `DEFAULT_DB`; now
|
||||
use `get_db_path()` for correct per-user routing in cloud mode (#24)
|
||||
- **Test isolation** — `importlib.reload(dev_api)` in digest/interviews
|
||||
fixtures reset all module globals, silently breaking `monkeypatch.setattr`
|
||||
in subsequent test files; replaced with targeted `monkeypatch.setattr(dev_api,
|
||||
"DB_PATH", tmp_db)` (#26)
|
||||
|
||||
---
|
||||
|
||||
## [0.7.0] — 2026-03-22
|
||||
|
||||
### Added
|
||||
- **Vue 3 SPA — beta access for paid tier** — The new Vue 3 frontend (built with
|
||||
Vite + UnoCSS) is now merged into `main` and available to paid-tier subscribers
|
||||
as an opt-in beta. The Streamlit UI remains the default and will continue to
|
||||
receive full support.
|
||||
- `web/` — full Vue 3 SPA source (components, stores, router, composables,
|
||||
views) from `feature/vue-spa`
|
||||
- `web/src/components/ClassicUIButton.vue` — one-click switch back to the
|
||||
Classic (Streamlit) UI; sets `prgn_ui=streamlit` cookie and appends
|
||||
`?prgn_switch=streamlit` so `user.yaml` stays in sync
|
||||
- `web/src/composables/useFeatureFlag.ts` — reads `prgn_demo_tier` cookie for
|
||||
demo toolbar visual consistency (display-only, not an authoritative gate)
|
||||
|
||||
- **UI switcher** — Reddit-style opt-in to the Vue SPA with durable preference
|
||||
persistence and graceful fallback.
|
||||
- `app/components/ui_switcher.py` — `sync_ui_cookie()`, `switch_ui()`,
|
||||
`render_banner()`, `render_settings_toggle()`
|
||||
- `scripts/user_profile.py` — `ui_preference` field (`streamlit` | `vue`,
|
||||
default: `streamlit`) with round-trip `save()`
|
||||
- `app/wizard/tiers.py` — `vue_ui_beta: "paid"` feature key; `demo_tier`
|
||||
keyword arg on `can_use()` for thread-safe demo mode simulation
|
||||
- Banner (dismissible, paid tier only) + Settings → System → Deployment toggle
|
||||
- Caddy cookie routing: `prgn_ui=vue` → nginx Vue SPA; absent/`streamlit` →
|
||||
Streamlit. 502 fallback clears cookie and redirects with `?ui_fallback=1`
|
||||
|
||||
- **Demo toolbar** — slim full-width tier-simulation bar for `DEMO_MODE`
|
||||
instances. Free / Paid / Premium pills let demo visitors explore all feature
|
||||
tiers without an account. Persists via `prgn_demo_tier` cookie. Default: Paid
|
||||
(most compelling first impression). `app/components/demo_toolbar.py`
|
||||
|
||||
- **Docker `web` service** — multi-stage nginx container serving the Vue SPA
|
||||
`dist/` build. Added to `compose.yml` (port 8506), `compose.demo.yml`
|
||||
(port 8507), `compose.cloud.yml` (port 8508). `manage.sh build` now includes
|
||||
the `web` service alongside `app`.
|
||||
|
||||
### Changed
|
||||
- **Caddy routing** — `menagerie.circuitforge.tech` and
|
||||
`demo.circuitforge.tech` peregrine blocks now inspect the `prgn_ui` cookie
|
||||
and fan-out to the Vue SPA service or Streamlit accordingly.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.2] — 2026-03-18
|
||||
|
||||
### Added
|
||||
- **Playwright E2E test harness** — smoke + interaction test suite covering all
|
||||
three Peregrine instances (demo / cloud / local). Navigates every page, checks
|
||||
for DOM errors on load, clicks every interactable element, diffs errors
|
||||
before/after each click, and XFAIL-marks expected demo-mode failures so
|
||||
neutering-guard regressions are surfaced as XPASSes. Screenshots on failure.
|
||||
- `tests/e2e/test_smoke.py` — page-load error detection
|
||||
- `tests/e2e/test_interactions.py` — full click-through with XFAIL/XPASS bucketing
|
||||
- `tests/e2e/conftest.py` — Streamlit-aware wait helpers, error scanner, fixtures
|
||||
- `tests/e2e/models.py` — `ErrorRecord`, `ModeConfig`, `diff_errors`
|
||||
- `tests/e2e/modes/` — per-mode configs (demo / cloud / local)
|
||||
- `tests/e2e/pages/` — page objects for all 7 pages including Settings tabs
|
||||
|
||||
### Fixed
|
||||
- **Demo: "Discovery failed" error on Home page load** — `task_runner.py` now
|
||||
checks `DEMO_MODE` before importing `discover.py`; returns a friendly error
|
||||
immediately instead of crashing on missing `search_profiles.yaml` (#21)
|
||||
- **Demo: silent `st.error()` in collapsed Practice Q&A expander** — Interview
|
||||
Prep no longer auto-triggers the LLM on page render in demo mode; shows an
|
||||
`st.info` placeholder instead, eliminating the hidden error element (#22)
|
||||
- **Cloud: auth wall shown to E2E test browser** — `cloud_session.py` now falls
|
||||
back to the `Cookie` header when `X-CF-Session` is absent (direct access
|
||||
without Caddy). Playwright's `set_extra_http_headers()` does not propagate to
|
||||
WebSocket handshakes; cookies do. Test harness uses `ctx.add_cookies()`.
|
||||
- **E2E error scanner returned empty text for collapsed expanders** — switched
|
||||
from `inner_text()` (respects CSS `display:none`) to `text_content()` so
|
||||
errors inside collapsed Streamlit expanders are captured with their full text.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.1] — 2026-03-16
|
||||
|
||||
### Fixed
|
||||
- **Keyword suggestions not visible on first render** — `✨ Suggest` in
|
||||
Settings → Search now calls `st.rerun()` after storing results; chips appear
|
||||
immediately without requiring a tab switch (#18)
|
||||
- **Wizard identity step required manual re-entry of resume data** — step 4
|
||||
(Identity) now prefills name, email, and phone from the parsed resume when
|
||||
those fields are blank; existing saved values are not overwritten (#17)
|
||||
- **"Send to Notion" hardcoded on Home dashboard** — sync section now shows the
|
||||
connected provider name, or a "Set up a sync integration" prompt with a
|
||||
Settings link when no integration is configured (#16)
|
||||
- **`test_generate_calls_llm_router` flaky in full suite** — resolved by queue
|
||||
optimizer merge; mock state pollution eliminated (#12)
|
||||
|
||||
---
|
||||
|
||||
## [0.6.0] — 2026-03-16
|
||||
|
||||
### Added
|
||||
- **Calendar integration** — push interview events to Apple Calendar (CalDAV) or
|
||||
Google Calendar directly from the Interviews kanban. Idempotent: a second push
|
||||
updates the existing event rather than creating a duplicate. Button shows
|
||||
"📅 Add to Calendar" on first push and "🔄 Update Calendar" thereafter.
|
||||
Event title: `{Stage}: {Job Title} @ {Company}`; 1hr duration at noon UTC;
|
||||
job URL and company research brief included in event description.
|
||||
- `scripts/calendar_push.py` — push/update orchestration
|
||||
- `scripts/integrations/apple_calendar.py` — `create_event()` / `update_event()`
|
||||
via `caldav` + `icalendar`
|
||||
- `scripts/integrations/google_calendar.py` — `create_event()` / `update_event()`
|
||||
via `google-api-python-client` (service account); `test()` now makes a real API call
|
||||
- `scripts/db.py` — `calendar_event_id TEXT` column (auto-migration) +
|
||||
`set_calendar_event_id()` helper
|
||||
- `environment.yml` — pin `caldav>=1.3`, `icalendar>=5.0`,
|
||||
`google-api-python-client>=2.0`, `google-auth>=2.0`
|
||||
|
||||
---
|
||||
|
||||
## [0.4.1] — 2026-03-13
|
||||
|
||||
### Added
|
||||
- **LinkedIn profile import** — one-click import from a public LinkedIn profile URL
|
||||
(Playwright headless Chrome, no login required) or from a LinkedIn data export zip.
|
||||
Staged to `linkedin_stage.json` so the profile is parsed once and reused across
|
||||
sessions without repeated network requests. Available on all tiers including Free.
|
||||
- `scripts/linkedin_utils.py` — HTML parser with ordered CSS selector fallbacks;
|
||||
extracts name, experience, education, skills, certifications, summary
|
||||
- `scripts/linkedin_scraper.py` — Playwright URL scraper + export zip CSV parser;
|
||||
atomic staging file write; URL validation; robust error handling
|
||||
- `scripts/linkedin_parser.py` — staging file reader; re-runs HTML parser on stored
|
||||
raw HTML so selector improvements apply without re-scraping
|
||||
- `app/components/linkedin_import.py` — shared Streamlit widget (status bar, preview,
|
||||
URL import, advanced zip upload) used by both wizard and Settings
|
||||
- Wizard step 3: new "🔗 LinkedIn" tab alongside Upload and Build Manually
|
||||
- Settings → Resume Profile: collapsible "Import from LinkedIn" expander
|
||||
- Dockerfile: Playwright Chromium install added to Docker image
|
||||
|
||||
### Fixed
|
||||
- **Cloud mode perpetual onboarding loop** — wizard gate in `app.py` now reads
|
||||
`get_config_dir()/user.yaml` (per-user in cloud, repo-level locally) instead of a
|
||||
hardcoded repo path; completing the wizard now correctly exits it in cloud mode
|
||||
- **Cloud resume YAML path** — wizard step 3 writes resume to per-user `CONFIG_DIR`
|
||||
instead of the shared repo `config/` (would have merged all cloud users' data)
|
||||
- **Cloud session redirect** — missing/invalid session token now JS-redirects to
|
||||
`circuitforge.tech/login` instead of showing a raw error message
|
||||
- Removed remaining AIHawk UI references (`Home.py`, `4_Apply.py`, `migrate.py`)
|
||||
|
||||
---
|
||||
|
||||
## [0.3.0] — 2026-03-06
|
||||
|
||||
### Added
|
||||
- **Feedback button** — in-app issue reporting with screenshot paste support; posts
|
||||
directly to Forgejo as structured issues; available from sidebar on all pages
|
||||
(`app/feedback.py`, `scripts/feedback_api.py`, `app/components/paste_image.py`)
|
||||
- **BYOK cloud backend detection** — `scripts/byok_guard.py`: pure Python detection
|
||||
engine with full unit test coverage (18 tests); classifies backends as cloud or local
|
||||
based on type, `base_url` heuristic, and opt-out `local: true` flag
|
||||
- **BYOK activation warning** — one-time acknowledgment required in Settings when a
|
||||
new cloud LLM backend is enabled; shows data inventory (what leaves your machine,
|
||||
what stays local), provider policy links; ack state persisted to `config/user.yaml`
|
||||
under `byok_acknowledged_backends`
|
||||
- **Sidebar cloud LLM indicator** — amber badge on every page when any cloud backend
|
||||
is active; links to Settings; disappears when reverted to local-only config
|
||||
- **LLM suggest: search terms** — three-angle analysis from resume (job titles,
|
||||
skills keywords, and exclude terms to filter irrelevant listings)
|
||||
- **LLM suggest: resume keywords** — skills gap analysis against job descriptions
|
||||
- **LLM Suggest button** in Settings → Search → Skills & Keywords section
|
||||
- **Backup/restore script** (`scripts/backup.py`) — multi-instance and legacy support
|
||||
- `PRIVACY.md` — short-form privacy notice linked from Settings
|
||||
|
||||
### Changed
|
||||
- Settings save button for LLM Backends now gates on cloud acknowledgment before
|
||||
writing `config/llm.yaml`
|
||||
|
||||
### Fixed
|
||||
- Settings widget crash on certain rerun paths
|
||||
- Docker service controls in Settings → System tab
|
||||
- `DEFAULT_DB` now respects `STAGING_DB` environment variable (was silently ignoring it)
|
||||
- `generate()` in cover letter refinement now correctly passes `max_tokens` kwarg
|
||||
|
||||
### Security / Privacy
|
||||
- Full test suite anonymized — fictional "Alex Rivera" replaces all real personal data
|
||||
in test fixtures (`tests/test_cover_letter.py`, `test_imap_sync.py`,
|
||||
`test_classifier_adapters.py`, `test_db.py`)
|
||||
- Complete PII scrub from git history: real name, email address, and phone number
|
||||
removed from all 161 commits across both branches via `git filter-repo`
|
||||
|
||||
---
|
||||
|
||||
## [0.2.0] — 2026-02-26
|
||||
|
||||
### Added
|
||||
- Cover letter iterative refinement: "Refine with Feedback" expander in Apply Workspace;
|
||||
`generate()` accepts `previous_result`/`feedback`; task params passed through `submit_task`
|
||||
- Expanded first-run wizard: 7-step onboarding with GPU detection, tier selection,
|
||||
resume upload/parsing, LLM inference test, search profile builder, integration cards
|
||||
- Tier system: free / paid / premium feature gates (`app/wizard/tiers.py`)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,83 @@
|
|||
# Contributing to Peregrine
|
||||
|
||||
See the full contributing guide in the documentation:
|
||||
https://docs.circuitforge.io/peregrine/developer-guide/contributing/
|
||||
Thanks for your interest. Peregrine is developed primarily at
|
||||
[git.opensourcesolarpunk.com](https://git.opensourcesolarpunk.com/pyr0ball/peregrine).
|
||||
GitHub and Codeberg are push mirrors — issues and PRs are welcome on either platform.
|
||||
|
||||
## Quick start
|
||||
---
|
||||
|
||||
1. Fork the repo and create a feature branch (`feat/my-feature`)
|
||||
2. Set up the dev environment: `conda env create -f environment.yml`
|
||||
3. Run tests: `conda run -n job-seeker python -m pytest tests/ -v`
|
||||
4. Open a pull request — all CI checks must pass
|
||||
## License
|
||||
|
||||
See the docs for: adding custom scrapers, adding integrations, code style, and PR checklist.
|
||||
Peregrine is licensed under **[BSL 1.1](./LICENSE-BSL)** — Business Source License.
|
||||
|
||||
What this means for you:
|
||||
|
||||
| Use case | Allowed? |
|
||||
|----------|----------|
|
||||
| Personal self-hosting, non-commercial | ✅ Free |
|
||||
| Contributing code, fixing bugs, writing docs | ✅ Free |
|
||||
| Commercial SaaS / hosted service | 🔒 Requires a paid license |
|
||||
| After 4 years from each release date | ✅ Converts to MIT |
|
||||
|
||||
**By submitting a pull request you agree that your contribution is licensed under the
|
||||
project's BSL 1.1 terms.** The PR template includes this as a checkbox.
|
||||
|
||||
---
|
||||
|
||||
## Dev Setup
|
||||
|
||||
See [`docs/getting-started/installation.md`](docs/getting-started/installation.md) for
|
||||
full instructions.
|
||||
|
||||
**Quick start (Docker — recommended):**
|
||||
|
||||
```bash
|
||||
git clone https://git.opensourcesolarpunk.com/pyr0ball/peregrine.git
|
||||
cd peregrine
|
||||
./setup.sh # installs deps, activates git hooks
|
||||
./manage.sh start
|
||||
```
|
||||
|
||||
**Conda (no Docker):**
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker pip install -r requirements.txt
|
||||
streamlit run app/app.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Commit Format
|
||||
|
||||
Hooks enforce [Conventional Commits](https://www.conventionalcommits.org/):
|
||||
|
||||
```
|
||||
type: short description
|
||||
type(scope): short description
|
||||
```
|
||||
|
||||
Valid types: `feat` `fix` `docs` `chore` `test` `refactor` `perf` `ci` `build`
|
||||
|
||||
The hook will tell you exactly what went wrong if your message is rejected.
|
||||
|
||||
---
|
||||
|
||||
## Pull Request Process
|
||||
|
||||
1. Fork and branch from `main`
|
||||
2. Write tests first (we use `pytest`)
|
||||
3. Run `pytest tests/ -v` — all tests must pass
|
||||
4. Open a PR on GitHub or Codeberg
|
||||
5. PRs are reviewed and cherry-picked to Forgejo (the canonical repo) — you don't need a Forgejo account
|
||||
|
||||
---
|
||||
|
||||
## Reporting Issues
|
||||
|
||||
Use the issue templates:
|
||||
|
||||
- **Bug** — steps to reproduce, version, OS, Docker or conda, logs
|
||||
- **Feature** — problem statement, proposed solution, which tier it belongs to
|
||||
|
||||
**Security issues:** Do **not** open a public issue. Email `security@circuitforge.tech`.
|
||||
See [SECURITY.md](./SECURITY.md).
|
||||
|
|
|
|||
|
|
@ -4,13 +4,19 @@ FROM python:3.11-slim
|
|||
WORKDIR /app
|
||||
|
||||
# System deps for companyScraper (beautifulsoup4, fake-useragent, lxml) and PDF gen
|
||||
# libsqlcipher-dev: required to build pysqlcipher3 (SQLCipher AES-256 encryption for cloud mode)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc libffi-dev curl \
|
||||
gcc libffi-dev curl libsqlcipher-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
# Install Python dependencies
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install Playwright browser (cached separately from Python deps so requirements
|
||||
# changes don't bust the ~600–900 MB Chromium layer and vice versa)
|
||||
RUN playwright install chromium && playwright install-deps chromium
|
||||
|
||||
# Bundle companyScraper (company research web scraper)
|
||||
COPY scrapers/ /app/scrapers/
|
||||
|
||||
|
|
|
|||
47
Dockerfile.cfcore
Normal file
47
Dockerfile.cfcore
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
# Dockerfile.cfcore — build context must be the PARENT directory of peregrine/
|
||||
#
|
||||
# Used when circuitforge-core is installed from source (not PyPI).
|
||||
# Both repos must be siblings on the build host:
|
||||
# /devl/peregrine/ → WORKDIR /app
|
||||
# /devl/circuitforge-core/ → installed to /circuitforge-core
|
||||
#
|
||||
# Build manually:
|
||||
# docker build -f peregrine/Dockerfile.cfcore -t peregrine-cfcore ..
|
||||
#
|
||||
# Via compose (compose.test-cfcore.yml sets context: ..):
|
||||
# docker compose -f compose.test-cfcore.yml build
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# System deps for companyScraper (beautifulsoup4, fake-useragent, lxml) and PDF gen
|
||||
# libsqlcipher-dev: required to build pysqlcipher3 (SQLCipher AES-256 encryption for cloud mode)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc libffi-dev curl libsqlcipher-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy circuitforge-core and install it from the local path before requirements.txt.
|
||||
# requirements.txt has a git+https:// fallback URL for CI (where circuitforge-core
|
||||
# is not a sibling directory), but Docker always has the local copy available here.
|
||||
COPY circuitforge-core/ /circuitforge-core/
|
||||
RUN pip install --no-cache-dir /circuitforge-core
|
||||
|
||||
COPY peregrine/requirements.txt .
|
||||
# Skip the cfcore line — already installed above from the local copy
|
||||
RUN grep -v 'circuitforge-core' requirements.txt | pip install --no-cache-dir -r /dev/stdin
|
||||
|
||||
# Install Playwright browser (cached separately from Python deps so requirements
|
||||
# changes don't bust the ~600–900 MB Chromium layer and vice versa)
|
||||
RUN playwright install chromium && playwright install-deps chromium
|
||||
|
||||
# Bundle companyScraper (company research web scraper)
|
||||
COPY peregrine/scrapers/ /app/scrapers/
|
||||
|
||||
COPY peregrine/ .
|
||||
|
||||
EXPOSE 8501
|
||||
|
||||
CMD ["streamlit", "run", "app/app.py", \
|
||||
"--server.port=8501", \
|
||||
"--server.headless=true", \
|
||||
"--server.fileWatcherType=none"]
|
||||
13
Makefile
13
Makefile
|
|
@ -23,6 +23,7 @@ COMPOSE ?= $(shell \
|
|||
# compose.override.yml. We must include it explicitly when present.
|
||||
OVERRIDE_FILE := $(wildcard compose.override.yml)
|
||||
COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,)
|
||||
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
|
||||
|
||||
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE)
|
||||
ifneq (,$(findstring podman,$(COMPOSE)))
|
||||
|
|
@ -34,6 +35,14 @@ else
|
|||
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
|
||||
endif
|
||||
endif
|
||||
ifeq ($(PROFILE),dual-gpu)
|
||||
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
|
||||
endif
|
||||
|
||||
# 'remote' means base services only — no services are tagged 'remote' in compose.yml,
|
||||
# so --profile remote is a no-op with Docker and a fatal error on old podman-compose.
|
||||
# Only pass --profile for profiles that actually activate optional services.
|
||||
PROFILE_ARG := $(if $(filter remote,$(PROFILE)),,--profile $(PROFILE))
|
||||
|
||||
setup: ## Install dependencies (Docker or Podman + NVIDIA toolkit)
|
||||
@bash setup.sh
|
||||
|
|
@ -42,7 +51,7 @@ preflight: ## Check ports + system resources; write .env
|
|||
@$(PYTHON) scripts/preflight.py
|
||||
|
||||
start: preflight ## Preflight check then start Peregrine (PROFILE=remote|cpu|single-gpu|dual-gpu)
|
||||
$(COMPOSE) $(COMPOSE_FILES) --profile $(PROFILE) up -d
|
||||
$(COMPOSE) $(COMPOSE_FILES) $(PROFILE_ARG) up -d
|
||||
|
||||
stop: ## Stop all Peregrine services
|
||||
$(COMPOSE) down
|
||||
|
|
@ -50,7 +59,7 @@ stop: ## Stop all Peregrine services
|
|||
restart: ## Stop services, re-run preflight (ports now free), then start
|
||||
$(COMPOSE) down
|
||||
@$(PYTHON) scripts/preflight.py
|
||||
$(COMPOSE) $(COMPOSE_FILES) --profile $(PROFILE) up -d
|
||||
$(COMPOSE) $(COMPOSE_FILES) $(PROFILE_ARG) up -d
|
||||
|
||||
logs: ## Tail app logs
|
||||
$(COMPOSE) logs -f app
|
||||
|
|
|
|||
7
PRIVACY.md
Normal file
7
PRIVACY.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Privacy Policy
|
||||
|
||||
CircuitForge LLC's privacy policy applies to this product and is published at:
|
||||
|
||||
**<https://circuitforge.tech/privacy>**
|
||||
|
||||
Last reviewed: March 2026.
|
||||
132
README.md
132
README.md
|
|
@ -1,11 +1,33 @@
|
|||
# Peregrine
|
||||
|
||||
**AI-powered job search pipeline — by [Circuit Forge LLC](https://circuitforge.io)**
|
||||
> **Primary development** happens at [git.opensourcesolarpunk.com](https://git.opensourcesolarpunk.com/Circuit-Forge/peregrine) — GitHub and Codeberg are push mirrors. Issues and PRs are welcome on either platform.
|
||||
|
||||
> *"Don't be evil, for real and forever."*
|
||||
[](./LICENSE-BSL)
|
||||
[](https://github.com/CircuitForge/peregrine/actions/workflows/ci.yml)
|
||||
|
||||
Automates the full job search lifecycle: discovery → matching → cover letters → applications → interview prep.
|
||||
Privacy-first, local-first. Your data never leaves your machine.
|
||||
**Job search pipeline — by [Circuit Forge LLC](https://circuitforge.tech)**
|
||||
|
||||
> *"Tools for the jobs that the system made hard on purpose."*
|
||||
|
||||
---
|
||||
|
||||
Job search is a second job nobody hired you for.
|
||||
|
||||
ATS filters designed to reject. Job boards that show the same listing eight times. Cover letter number forty-seven for a role that might already be filled. Hours of prep for a phone screen that lasts twelve minutes.
|
||||
|
||||
Peregrine handles the pipeline — discovery, matching, tracking, drafting, and prep — so you can spend your time doing the work you actually want to be doing.
|
||||
|
||||
**LLM support is optional.** The full discovery and tracking pipeline works without one. When you do configure a backend, the LLM drafts the parts that are genuinely miserable — cover letters, company research briefs, interview prep sheets — and waits for your approval before anything goes anywhere.
|
||||
|
||||
### What Peregrine does not do
|
||||
|
||||
Peregrine does **not** submit job applications for you. You still have to go to each employer's site and click apply yourself.
|
||||
|
||||
This is intentional. Automated mass-applying is a bad experience for everyone — it's also a trust violation with employers who took the time to post a real role. Peregrine is a preparation and organization tool, not a bot.
|
||||
|
||||
What it *does* cover is everything before and after that click: finding the jobs, matching them against your resume, generating cover letters and prep materials, and once you've applied — tracking where you stand, classifying the emails that come back, and surfacing company research when an interview lands on your calendar. The submit button is yours. The rest of the grind is ours.
|
||||
|
||||
> **Exception:** [AIHawk](https://github.com/nicolomantini/LinkedIn-Easy-Apply) is a separate, optional tool that handles LinkedIn Easy Apply automation. Peregrine integrates with it for AIHawk-compatible profiles, but it is not part of Peregrine's core pipeline.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -14,7 +36,7 @@ Privacy-first, local-first. Your data never leaves your machine.
|
|||
**1. Clone and install dependencies** (Docker, NVIDIA toolkit if needed):
|
||||
|
||||
```bash
|
||||
git clone https://git.opensourcesolarpunk.com/pyr0ball/peregrine
|
||||
git clone https://git.opensourcesolarpunk.com/Circuit-Forge/peregrine
|
||||
cd peregrine
|
||||
./manage.sh setup
|
||||
```
|
||||
|
|
@ -23,9 +45,9 @@ cd peregrine
|
|||
|
||||
```bash
|
||||
./manage.sh start # remote profile (API-only, no GPU)
|
||||
./manage.sh start --profile cpu # local Ollama on CPU
|
||||
./manage.sh start --profile single-gpu # Ollama + Vision on GPU 0
|
||||
./manage.sh start --profile dual-gpu # Ollama + Vision + vLLM (GPU 0 + 1)
|
||||
./manage.sh start --profile cpu # local Ollama (CPU, or Metal GPU on Apple Silicon — see below)
|
||||
./manage.sh start --profile single-gpu # Ollama + Vision on GPU 0 (NVIDIA only)
|
||||
./manage.sh start --profile dual-gpu # Ollama + Vision + vLLM (GPU 0 + 1) (NVIDIA only)
|
||||
```
|
||||
|
||||
Or use `make` directly:
|
||||
|
|
@ -37,9 +59,35 @@ make start PROFILE=single-gpu
|
|||
|
||||
**3.** Open http://localhost:8501 — the setup wizard guides you through the rest.
|
||||
|
||||
> **macOS:** Docker Desktop must be running before starting.
|
||||
> **macOS / Apple Silicon:** Docker Desktop must be running. For Metal GPU-accelerated inference, install Ollama natively before starting — `setup.sh` will prompt you to do this. See [Apple Silicon GPU](#apple-silicon-gpu) below.
|
||||
> **Windows:** Not supported — use WSL2 with Ubuntu.
|
||||
|
||||
### Installing to `/opt` or other system directories
|
||||
|
||||
If you clone into a root-owned directory (e.g. `sudo git clone ... /opt/peregrine`), two things need fixing:
|
||||
|
||||
**1. Git ownership warning** (`fatal: detected dubious ownership`) — `./manage.sh setup` fixes this automatically. If you need git to work *before* running setup:
|
||||
|
||||
```bash
|
||||
git config --global --add safe.directory /opt/peregrine
|
||||
```
|
||||
|
||||
**2. Preflight write access** — preflight writes `.env` and `compose.override.yml` into the repo directory. Fix ownership once:
|
||||
|
||||
```bash
|
||||
sudo chown -R $USER:$USER /opt/peregrine
|
||||
```
|
||||
|
||||
After that, run everything without `sudo`.
|
||||
|
||||
### Podman
|
||||
|
||||
Podman is rootless by default — **no `sudo` needed.** `./manage.sh setup` will configure `podman-compose` if it isn't already present.
|
||||
|
||||
### Docker
|
||||
|
||||
After `./manage.sh setup`, log out and back in for docker group membership to take effect. Until then, prefix commands with `sudo`. After re-login, `sudo` is no longer required.
|
||||
|
||||
---
|
||||
|
||||
## Inference Profiles
|
||||
|
|
@ -47,9 +95,25 @@ make start PROFILE=single-gpu
|
|||
| Profile | Services started | Use case |
|
||||
|---------|-----------------|----------|
|
||||
| `remote` | app + searxng | No GPU; LLM calls go to Anthropic / OpenAI |
|
||||
| `cpu` | app + ollama + searxng | No GPU; local models on CPU (slow) |
|
||||
| `single-gpu` | app + ollama + vision + searxng | One GPU: cover letters, research, vision |
|
||||
| `dual-gpu` | app + ollama + vllm + vision + searxng | GPU 0 = Ollama, GPU 1 = vLLM |
|
||||
| `cpu` | app + ollama + searxng | No GPU; local models on CPU. On Apple Silicon, use with native Ollama for Metal acceleration — see below. |
|
||||
| `single-gpu` | app + ollama + vision + searxng | One **NVIDIA** GPU: cover letters, research, vision |
|
||||
| `dual-gpu` | app + ollama + vllm + vision + searxng | Two **NVIDIA** GPUs: GPU 0 = Ollama, GPU 1 = vLLM |
|
||||
|
||||
### Apple Silicon GPU
|
||||
|
||||
Docker Desktop on macOS runs in a Linux VM — it cannot access the Apple GPU. Metal-accelerated inference requires Ollama to run **natively** on the host.
|
||||
|
||||
`setup.sh` handles this automatically: it offers to install Ollama via Homebrew, starts it as a background service, and explains what happens next. If Ollama is running on port 11434 when you start Peregrine, preflight detects it, stubs out the Docker Ollama container, and routes inference through the native process — which uses Metal automatically.
|
||||
|
||||
To do it manually:
|
||||
|
||||
```bash
|
||||
brew install ollama
|
||||
brew services start ollama # starts at login, uses Metal GPU
|
||||
./manage.sh start --profile cpu # preflight adopts native Ollama; Docker container is skipped
|
||||
```
|
||||
|
||||
The `cpu` profile label is a slight misnomer in this context — Ollama will be running on the GPU. `single-gpu` and `dual-gpu` profiles are NVIDIA-specific and not applicable on Mac.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -57,7 +121,7 @@ make start PROFILE=single-gpu
|
|||
|
||||
On first launch the setup wizard walks through seven steps:
|
||||
|
||||
1. **Hardware** — detects NVIDIA GPUs and recommends a profile
|
||||
1. **Hardware** — detects NVIDIA GPUs (Linux) or Apple Silicon GPU (macOS) and recommends a profile
|
||||
2. **Tier** — choose free, paid, or premium (or use `dev_tier_override` for local testing)
|
||||
3. **Identity** — name, email, phone, LinkedIn, career summary
|
||||
4. **Resume** — upload a PDF/DOCX for LLM parsing, or use the guided form builder
|
||||
|
|
@ -75,17 +139,33 @@ Re-enter the wizard any time via **Settings → Developer → Reset wizard**.
|
|||
| Feature | Tier |
|
||||
|---------|------|
|
||||
| Job discovery (JobSpy + custom boards) | Free |
|
||||
| Resume keyword matching | Free |
|
||||
| Cover letter generation | Paid |
|
||||
| Company research briefs | Paid |
|
||||
| Interview prep & practice Q&A | Paid |
|
||||
| Resume keyword matching & gap analysis | Free |
|
||||
| Document storage sync (Google Drive, Dropbox, OneDrive, MEGA, Nextcloud) | Free |
|
||||
| Webhook notifications (Discord, Home Assistant) | Free |
|
||||
| **Cover letter generation** | Free with LLM¹ |
|
||||
| **Company research briefs** | Free with LLM¹ |
|
||||
| **Interview prep & practice Q&A** | Free with LLM¹ |
|
||||
| **Survey assistant** (culture-fit Q&A, screenshot analysis) | Free with LLM¹ |
|
||||
| **Wizard helpers** (career summary, bullet expansion, skill suggestions, job title suggestions, mission notes) | Free with LLM¹ |
|
||||
| Managed cloud LLM (no API key needed) | Paid |
|
||||
| Email sync & auto-classification | Paid |
|
||||
| Survey assistant (culture-fit Q&A) | Paid |
|
||||
| Integration connectors (Notion, Airtable, Google Sheets, etc.) | Paid |
|
||||
| LLM-powered keyword blocklist | Paid |
|
||||
| Job tracking integrations (Notion, Airtable, Google Sheets) | Paid |
|
||||
| Calendar sync (Google, Apple) | Paid |
|
||||
| Cover letter model fine-tuning | Premium |
|
||||
| Slack notifications | Paid |
|
||||
| CircuitForge shared cover-letter model | Paid |
|
||||
| Vue 3 SPA — full UI with onboarding wizard, job board, apply workspace, sort/filter, research modal, draft cover letter | Free |
|
||||
| **Voice guidelines** (custom writing style & tone) | Premium with LLM¹ ² |
|
||||
| Cover letter model fine-tuning (your writing, your model) | Premium |
|
||||
| Multi-user support | Premium |
|
||||
|
||||
¹ **BYOK (bring your own key/backend) unlock:** configure any LLM backend — a local [Ollama](https://ollama.com) or vLLM instance,
|
||||
or your own API key (Anthropic, OpenAI-compatible) — and all features marked **Free with LLM** or **Premium with LLM**
|
||||
unlock at no charge. The paid tier earns its price by providing managed cloud inference so you
|
||||
don't need a key at all, plus integrations and email sync.
|
||||
|
||||
² **Voice guidelines** requires Premium tier without a configured LLM backend. With BYOK, it unlocks at any tier.
|
||||
|
||||
---
|
||||
|
||||
## Email Sync
|
||||
|
|
@ -131,18 +211,18 @@ Connect external services in **Settings → Integrations**:
|
|||
|
||||
## Developer Docs
|
||||
|
||||
Full documentation at: https://docs.circuitforge.io/peregrine
|
||||
Full documentation at: https://docs.circuitforge.tech/peregrine
|
||||
|
||||
- [Installation guide](https://docs.circuitforge.io/peregrine/getting-started/installation/)
|
||||
- [Adding a custom job board scraper](https://docs.circuitforge.io/peregrine/developer-guide/adding-scrapers/)
|
||||
- [Adding an integration](https://docs.circuitforge.io/peregrine/developer-guide/adding-integrations/)
|
||||
- [Contributing](https://docs.circuitforge.io/peregrine/developer-guide/contributing/)
|
||||
- [Installation guide](https://docs.circuitforge.tech/peregrine/getting-started/installation/)
|
||||
- [Adding a custom job board scraper](https://docs.circuitforge.tech/peregrine/developer-guide/adding-scrapers/)
|
||||
- [Adding an integration](https://docs.circuitforge.tech/peregrine/developer-guide/adding-integrations/)
|
||||
- [Contributing](https://docs.circuitforge.tech/peregrine/developer-guide/contributing/)
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
Core discovery pipeline: [MIT](LICENSE-MIT)
|
||||
AI features (cover letter generation, company research, interview prep, UI): [BSL 1.1](LICENSE-BSL)
|
||||
LLM features (cover letter generation, company research, interview prep, UI): [BSL 1.1](LICENSE-BSL)
|
||||
|
||||
© 2026 Circuit Forge LLC
|
||||
|
|
|
|||
26
SECURITY.md
Normal file
26
SECURITY.md
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
**Do not open a GitHub or Codeberg issue for security vulnerabilities.**
|
||||
|
||||
Email: `security@circuitforge.tech`
|
||||
|
||||
Include:
|
||||
- A description of the vulnerability
|
||||
- Steps to reproduce
|
||||
- Potential impact
|
||||
- Any suggested fix (optional)
|
||||
|
||||
**Response target:** 72 hours for acknowledgement, 14 days for triage.
|
||||
|
||||
We follow responsible disclosure — we will coordinate a fix and release before any
|
||||
public disclosure and will credit you in the release notes unless you prefer to remain
|
||||
anonymous.
|
||||
|
||||
## Supported Versions
|
||||
|
||||
| Version | Supported |
|
||||
|---------|-----------|
|
||||
| Latest release | ✅ |
|
||||
| Older releases | ❌ — please upgrade |
|
||||
411
app/Home.py
411
app/Home.py
|
|
@ -18,28 +18,61 @@ _USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
|
|||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Job Seeker"
|
||||
|
||||
from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \
|
||||
purge_non_remote, archive_jobs, kill_stuck_tasks, get_task_for_job, get_active_tasks, \
|
||||
insert_job, get_existing_urls
|
||||
from scripts.db import init_db, get_job_counts, purge_jobs, purge_email_data, \
|
||||
purge_non_remote, archive_jobs, kill_stuck_tasks, cancel_task, \
|
||||
get_task_for_job, get_active_tasks, insert_job, get_existing_urls
|
||||
from scripts.task_runner import submit_task
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
_CONFIG_DIR = Path(__file__).parent.parent / "config"
|
||||
_NOTION_CONNECTED = (_CONFIG_DIR / "integrations" / "notion.yaml").exists()
|
||||
|
||||
resolve_session("peregrine")
|
||||
init_db(get_db_path())
|
||||
|
||||
def _email_configured() -> bool:
|
||||
_e = Path(__file__).parent.parent / "config" / "email.yaml"
|
||||
if not _e.exists():
|
||||
return False
|
||||
import yaml as _yaml
|
||||
_cfg = _yaml.safe_load(_e.read_text()) or {}
|
||||
return bool(_cfg.get("username") or _cfg.get("user") or _cfg.get("imap_host"))
|
||||
|
||||
def _notion_configured() -> bool:
|
||||
_n = Path(__file__).parent.parent / "config" / "notion.yaml"
|
||||
if not _n.exists():
|
||||
return False
|
||||
import yaml as _yaml
|
||||
_cfg = _yaml.safe_load(_n.read_text()) or {}
|
||||
return bool(_cfg.get("token"))
|
||||
|
||||
def _keywords_configured() -> bool:
|
||||
_k = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||||
if not _k.exists():
|
||||
return False
|
||||
import yaml as _yaml
|
||||
_cfg = _yaml.safe_load(_k.read_text()) or {}
|
||||
return bool(_cfg.get("keywords") or _cfg.get("required") or _cfg.get("preferred"))
|
||||
|
||||
_SETUP_BANNERS = [
|
||||
{"key": "connect_cloud", "text": "Connect a cloud service for resume/cover letter storage",
|
||||
"link_label": "Settings → Integrations"},
|
||||
"link_label": "Settings → Integrations",
|
||||
"done": _notion_configured},
|
||||
{"key": "setup_email", "text": "Set up email sync to catch recruiter outreach",
|
||||
"link_label": "Settings → Email"},
|
||||
"link_label": "Settings → Email",
|
||||
"done": _email_configured},
|
||||
{"key": "setup_email_labels", "text": "Set up email label filters for auto-classification",
|
||||
"link_label": "Settings → Email (label guide)"},
|
||||
"link_label": "Settings → Email (label guide)",
|
||||
"done": _email_configured},
|
||||
{"key": "tune_mission", "text": "Tune your mission preferences for better cover letters",
|
||||
"link_label": "Settings → My Profile"},
|
||||
{"key": "configure_keywords", "text": "Configure keywords and blocklist for smarter search",
|
||||
"link_label": "Settings → Search"},
|
||||
"link_label": "Settings → Search",
|
||||
"done": _keywords_configured},
|
||||
{"key": "upload_corpus", "text": "Upload your cover letter corpus for voice fine-tuning",
|
||||
"link_label": "Settings → Fine-Tune"},
|
||||
{"key": "configure_linkedin", "text": "Configure LinkedIn Easy Apply automation",
|
||||
"link_label": "Settings → AIHawk"},
|
||||
"link_label": "Settings → Integrations"},
|
||||
{"key": "setup_searxng", "text": "Set up company research with SearXNG",
|
||||
"link_label": "Settings → Services"},
|
||||
{"key": "target_companies", "text": "Build a target company list for focused outreach",
|
||||
|
|
@ -108,7 +141,7 @@ st.divider()
|
|||
|
||||
@st.fragment(run_every=10)
|
||||
def _live_counts():
|
||||
counts = get_job_counts(DEFAULT_DB)
|
||||
counts = get_job_counts(get_db_path())
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
col1.metric("Pending Review", counts.get("pending", 0))
|
||||
col2.metric("Approved", counts.get("approved", 0))
|
||||
|
|
@ -127,18 +160,18 @@ with left:
|
|||
st.subheader("Find New Jobs")
|
||||
st.caption("Scrapes all configured boards and adds new listings to your review queue.")
|
||||
|
||||
_disc_task = get_task_for_job(DEFAULT_DB, "discovery", 0)
|
||||
_disc_task = get_task_for_job(get_db_path(), "discovery", 0)
|
||||
_disc_running = _disc_task and _disc_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("🚀 Run Discovery", use_container_width=True, type="primary",
|
||||
disabled=bool(_disc_running)):
|
||||
submit_task(DEFAULT_DB, "discovery", 0)
|
||||
submit_task(get_db_path(), "discovery", 0)
|
||||
st.rerun()
|
||||
|
||||
if _disc_running:
|
||||
@st.fragment(run_every=4)
|
||||
def _disc_status():
|
||||
t = get_task_for_job(DEFAULT_DB, "discovery", 0)
|
||||
t = get_task_for_job(get_db_path(), "discovery", 0)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
lbl = "Queued…" if t["status"] == "queued" else "Scraping job boards… this may take a minute"
|
||||
st.info(f"⏳ {lbl}")
|
||||
|
|
@ -156,18 +189,18 @@ with enrich_col:
|
|||
st.subheader("Enrich Descriptions")
|
||||
st.caption("Re-fetch missing descriptions for any listing (LinkedIn, Indeed, Glassdoor, Adzuna, The Ladders, generic).")
|
||||
|
||||
_enrich_task = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
|
||||
_enrich_task = get_task_for_job(get_db_path(), "enrich_descriptions", 0)
|
||||
_enrich_running = _enrich_task and _enrich_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("🔍 Fill Missing Descriptions", use_container_width=True, type="primary",
|
||||
disabled=bool(_enrich_running)):
|
||||
submit_task(DEFAULT_DB, "enrich_descriptions", 0)
|
||||
submit_task(get_db_path(), "enrich_descriptions", 0)
|
||||
st.rerun()
|
||||
|
||||
if _enrich_running:
|
||||
@st.fragment(run_every=4)
|
||||
def _enrich_status():
|
||||
t = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
|
||||
t = get_task_for_job(get_db_path(), "enrich_descriptions", 0)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
st.info("⏳ Fetching descriptions…")
|
||||
else:
|
||||
|
|
@ -182,7 +215,7 @@ with enrich_col:
|
|||
|
||||
with mid:
|
||||
unscored = sum(1 for j in __import__("scripts.db", fromlist=["get_jobs_by_status"])
|
||||
.get_jobs_by_status(DEFAULT_DB, "pending")
|
||||
.get_jobs_by_status(get_db_path(), "pending")
|
||||
if j.get("match_score") is None and j.get("description"))
|
||||
st.subheader("Score Listings")
|
||||
st.caption(f"Run TF-IDF match scoring against {_name}'s resume. {unscored} pending job{'s' if unscored != 1 else ''} unscored.")
|
||||
|
|
@ -190,7 +223,7 @@ with mid:
|
|||
disabled=unscored == 0):
|
||||
with st.spinner("Scoring…"):
|
||||
result = subprocess.run(
|
||||
["conda", "run", "-n", "job-seeker", "python", "scripts/match.py"],
|
||||
[sys.executable, "scripts/match.py"],
|
||||
capture_output=True, text=True,
|
||||
cwd=str(Path(__file__).parent.parent),
|
||||
)
|
||||
|
|
@ -203,21 +236,27 @@ with mid:
|
|||
st.rerun()
|
||||
|
||||
with right:
|
||||
approved_count = get_job_counts(DEFAULT_DB).get("approved", 0)
|
||||
st.subheader("Send to Notion")
|
||||
st.caption("Push all approved jobs to your Notion tracking database.")
|
||||
if approved_count == 0:
|
||||
st.info("No approved jobs yet. Review and approve some listings first.")
|
||||
approved_count = get_job_counts(get_db_path()).get("approved", 0)
|
||||
if _NOTION_CONNECTED:
|
||||
st.subheader("Send to Notion")
|
||||
st.caption("Push all approved jobs to your Notion tracking database.")
|
||||
if approved_count == 0:
|
||||
st.info("No approved jobs yet. Review and approve some listings first.")
|
||||
else:
|
||||
if st.button(
|
||||
f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion",
|
||||
use_container_width=True, type="primary",
|
||||
):
|
||||
with st.spinner("Syncing to Notion…"):
|
||||
from scripts.sync import sync_to_notion
|
||||
count = sync_to_notion(get_db_path())
|
||||
st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!")
|
||||
st.rerun()
|
||||
else:
|
||||
if st.button(
|
||||
f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion",
|
||||
use_container_width=True, type="primary",
|
||||
):
|
||||
with st.spinner("Syncing to Notion…"):
|
||||
from scripts.sync import sync_to_notion
|
||||
count = sync_to_notion(DEFAULT_DB)
|
||||
st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!")
|
||||
st.rerun()
|
||||
st.subheader("Set up a sync integration")
|
||||
st.caption("Connect an integration to push approved jobs to your tracking database.")
|
||||
if st.button("⚙️ Go to Integrations", use_container_width=True):
|
||||
st.switch_page("pages/2_Settings.py")
|
||||
|
||||
st.divider()
|
||||
|
||||
|
|
@ -230,18 +269,18 @@ with email_left:
|
|||
"New recruiter outreach is added to your Job Review queue.")
|
||||
|
||||
with email_right:
|
||||
_email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
|
||||
_email_task = get_task_for_job(get_db_path(), "email_sync", 0)
|
||||
_email_running = _email_task and _email_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("📧 Sync Emails", use_container_width=True, type="primary",
|
||||
disabled=bool(_email_running)):
|
||||
submit_task(DEFAULT_DB, "email_sync", 0)
|
||||
submit_task(get_db_path(), "email_sync", 0)
|
||||
st.rerun()
|
||||
|
||||
if _email_running:
|
||||
@st.fragment(run_every=4)
|
||||
def _email_status():
|
||||
t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
|
||||
t = get_task_for_job(get_db_path(), "email_sync", 0)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
st.info("⏳ Syncing emails…")
|
||||
else:
|
||||
|
|
@ -276,7 +315,7 @@ with url_tab:
|
|||
disabled=not (url_text or "").strip()):
|
||||
_urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")]
|
||||
if _urls:
|
||||
_n = _queue_url_imports(DEFAULT_DB, _urls)
|
||||
_n = _queue_url_imports(get_db_path(), _urls)
|
||||
if _n:
|
||||
st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. Check Job Review shortly.")
|
||||
else:
|
||||
|
|
@ -299,7 +338,7 @@ with csv_tab:
|
|||
if _csv_urls:
|
||||
st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.")
|
||||
if st.button("📥 Import CSV Jobs", key="add_csv_btn", use_container_width=True):
|
||||
_n = _queue_url_imports(DEFAULT_DB, _csv_urls)
|
||||
_n = _queue_url_imports(get_db_path(),_csv_urls)
|
||||
st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.")
|
||||
st.rerun()
|
||||
else:
|
||||
|
|
@ -309,7 +348,7 @@ with csv_tab:
|
|||
@st.fragment(run_every=3)
|
||||
def _scrape_status():
|
||||
import sqlite3 as _sq
|
||||
conn = _sq.connect(DEFAULT_DB)
|
||||
conn = _sq.connect(get_db_path())
|
||||
conn.row_factory = _sq.Row
|
||||
rows = conn.execute(
|
||||
"""SELECT bt.status, bt.error, j.title, j.company, j.url
|
||||
|
|
@ -337,190 +376,164 @@ _scrape_status()
|
|||
|
||||
st.divider()
|
||||
|
||||
# ── Danger zone: purge + re-scrape ────────────────────────────────────────────
|
||||
# ── Danger zone ───────────────────────────────────────────────────────────────
|
||||
with st.expander("⚠️ Danger Zone", expanded=False):
|
||||
|
||||
# ── Queue reset (the common case) ─────────────────────────────────────────
|
||||
st.markdown("**Queue reset**")
|
||||
st.caption(
|
||||
"**Purge** permanently deletes jobs from the local database. "
|
||||
"Applied and synced jobs are never touched."
|
||||
"Archive clears your review queue while keeping job URLs for dedup, "
|
||||
"so the same listings won't resurface on the next discovery run. "
|
||||
"Use hard purge only if you want a full clean slate including dedup history."
|
||||
)
|
||||
|
||||
purge_col, rescrape_col, email_col, tasks_col = st.columns(4)
|
||||
_scope = st.radio(
|
||||
"Clear scope",
|
||||
["Pending only", "Pending + approved (stale search)"],
|
||||
horizontal=True,
|
||||
label_visibility="collapsed",
|
||||
)
|
||||
_scope_statuses = (
|
||||
["pending"] if _scope == "Pending only" else ["pending", "approved"]
|
||||
)
|
||||
|
||||
with purge_col:
|
||||
st.markdown("**Purge pending & rejected**")
|
||||
st.caption("Removes all _pending_ and _rejected_ listings so the next discovery starts fresh.")
|
||||
if st.button("🗑 Purge Pending + Rejected", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "partial"
|
||||
_qc1, _qc2, _qc3 = st.columns([2, 2, 4])
|
||||
if _qc1.button("📦 Archive & reset", use_container_width=True, type="primary"):
|
||||
st.session_state["confirm_dz"] = "archive"
|
||||
if _qc2.button("🗑 Hard purge (delete)", use_container_width=True):
|
||||
st.session_state["confirm_dz"] = "purge"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "partial":
|
||||
st.warning("Are you sure? This cannot be undone.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, purge", type="primary", use_container_width=True):
|
||||
deleted = purge_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
|
||||
st.success(f"Purged {deleted} jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
with email_col:
|
||||
st.markdown("**Purge email data**")
|
||||
st.caption("Clears all email thread logs and email-sourced pending jobs so the next sync starts fresh.")
|
||||
if st.button("📧 Purge Email Data", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "email"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "email":
|
||||
st.warning("This deletes all email contacts and email-sourced jobs. Cannot be undone.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, purge emails", type="primary", use_container_width=True):
|
||||
contacts, jobs = purge_email_data(DEFAULT_DB)
|
||||
st.success(f"Purged {contacts} email contacts, {jobs} email jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
with tasks_col:
|
||||
_active = get_active_tasks(DEFAULT_DB)
|
||||
st.markdown("**Kill stuck tasks**")
|
||||
st.caption(f"Force-fail all queued/running background tasks. Currently **{len(_active)}** active.")
|
||||
if st.button("⏹ Kill All Tasks", use_container_width=True, disabled=len(_active) == 0):
|
||||
killed = kill_stuck_tasks(DEFAULT_DB)
|
||||
st.success(f"Killed {killed} task(s).")
|
||||
if st.session_state.get("confirm_dz") == "archive":
|
||||
st.info(
|
||||
f"Archive **{', '.join(_scope_statuses)}** jobs? "
|
||||
"URLs are kept for dedup — nothing is permanently deleted."
|
||||
)
|
||||
_dc1, _dc2 = st.columns(2)
|
||||
if _dc1.button("Yes, archive", type="primary", use_container_width=True, key="dz_archive_confirm"):
|
||||
n = archive_jobs(get_db_path(), statuses=_scope_statuses)
|
||||
st.success(f"Archived {n} jobs.")
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
if _dc2.button("Cancel", use_container_width=True, key="dz_archive_cancel"):
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
|
||||
with rescrape_col:
|
||||
st.markdown("**Purge all & re-scrape**")
|
||||
st.caption("Wipes _all_ non-applied, non-synced jobs then immediately runs a fresh discovery.")
|
||||
if st.button("🔄 Purge All + Re-scrape", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "full"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "full":
|
||||
st.warning("This will delete ALL pending, approved, and rejected jobs, then re-scrape. Applied and synced records are kept.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, wipe + scrape", type="primary", use_container_width=True):
|
||||
purge_jobs(DEFAULT_DB, statuses=["pending", "approved", "rejected"])
|
||||
submit_task(DEFAULT_DB, "discovery", 0)
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
st.divider()
|
||||
|
||||
pending_col, nonremote_col, approved_col, _ = st.columns(4)
|
||||
|
||||
with pending_col:
|
||||
st.markdown("**Purge pending review**")
|
||||
st.caption("Removes only _pending_ listings, keeping your rejected history intact.")
|
||||
if st.button("🗑 Purge Pending Only", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "pending_only"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "pending_only":
|
||||
st.warning("Deletes all pending jobs. Rejected jobs are kept. Cannot be undone.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, purge pending", type="primary", use_container_width=True):
|
||||
deleted = purge_jobs(DEFAULT_DB, statuses=["pending"])
|
||||
st.success(f"Purged {deleted} pending jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
with nonremote_col:
|
||||
st.markdown("**Purge non-remote**")
|
||||
st.caption("Removes pending/approved/rejected jobs where remote is not set. Keeps anything already in the pipeline.")
|
||||
if st.button("🏢 Purge On-site Jobs", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "non_remote"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "non_remote":
|
||||
st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, purge on-site", type="primary", use_container_width=True):
|
||||
deleted = purge_non_remote(DEFAULT_DB)
|
||||
st.success(f"Purged {deleted} non-remote jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
with approved_col:
|
||||
st.markdown("**Purge approved (unapplied)**")
|
||||
st.caption("Removes _approved_ jobs you haven't applied to yet — e.g. to reset after a review pass.")
|
||||
if st.button("🗑 Purge Approved", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "approved_only"
|
||||
|
||||
if st.session_state.get("confirm_purge") == "approved_only":
|
||||
st.warning("Deletes all approved-but-not-applied jobs. Cannot be undone.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, purge approved", type="primary", use_container_width=True):
|
||||
deleted = purge_jobs(DEFAULT_DB, statuses=["approved"])
|
||||
st.success(f"Purged {deleted} approved jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
|
||||
st.divider()
|
||||
|
||||
archive_col1, archive_col2, _, _ = st.columns(4)
|
||||
|
||||
with archive_col1:
|
||||
st.markdown("**Archive remaining**")
|
||||
st.caption(
|
||||
"Move all _pending_ and _rejected_ jobs to archived status. "
|
||||
"Archived jobs stay in the DB for dedup — they just won't appear in Job Review."
|
||||
if st.session_state.get("confirm_dz") == "purge":
|
||||
st.warning(
|
||||
f"Permanently delete **{', '.join(_scope_statuses)}** jobs? "
|
||||
"This removes the URLs from dedup history too. Cannot be undone."
|
||||
)
|
||||
if st.button("📦 Archive Pending + Rejected", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "archive_remaining"
|
||||
_dc1, _dc2 = st.columns(2)
|
||||
if _dc1.button("Yes, delete", type="primary", use_container_width=True, key="dz_purge_confirm"):
|
||||
n = purge_jobs(get_db_path(), statuses=_scope_statuses)
|
||||
st.success(f"Deleted {n} jobs.")
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
if _dc2.button("Cancel", use_container_width=True, key="dz_purge_cancel"):
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
|
||||
if st.session_state.get("confirm_purge") == "archive_remaining":
|
||||
st.info("Jobs will be archived (not deleted) — URLs are kept for dedup.")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, archive", type="primary", use_container_width=True):
|
||||
archived = archive_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
|
||||
st.success(f"Archived {archived} jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
st.divider()
|
||||
|
||||
with archive_col2:
|
||||
st.markdown("**Archive approved (unapplied)**")
|
||||
st.caption("Archive _approved_ listings you decided to skip — keeps history without cluttering the apply queue.")
|
||||
if st.button("📦 Archive Approved", use_container_width=True):
|
||||
st.session_state["confirm_purge"] = "archive_approved"
|
||||
# ── Background tasks ──────────────────────────────────────────────────────
|
||||
_active = get_active_tasks(get_db_path())
|
||||
st.markdown(f"**Background tasks** — {len(_active)} active")
|
||||
|
||||
if st.session_state.get("confirm_purge") == "archive_approved":
|
||||
st.info("Approved jobs will be archived (not deleted).")
|
||||
c1, c2 = st.columns(2)
|
||||
if c1.button("Yes, archive approved", type="primary", use_container_width=True):
|
||||
archived = archive_jobs(DEFAULT_DB, statuses=["approved"])
|
||||
st.success(f"Archived {archived} approved jobs.")
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
st.rerun()
|
||||
if c2.button("Cancel ", use_container_width=True):
|
||||
st.session_state.pop("confirm_purge", None)
|
||||
if _active:
|
||||
_task_icons = {"cover_letter": "✉️", "research": "🔍", "discovery": "🌐", "enrich_descriptions": "📝"}
|
||||
for _t in _active:
|
||||
_tc1, _tc2, _tc3 = st.columns([3, 4, 2])
|
||||
_icon = _task_icons.get(_t["task_type"], "⚙️")
|
||||
_tc1.caption(f"{_icon} `{_t['task_type']}`")
|
||||
_job_label = f"{_t['title']} @ {_t['company']}" if _t.get("title") else f"job #{_t['job_id']}"
|
||||
_tc2.caption(_job_label)
|
||||
_tc3.caption(f"_{_t['status']}_")
|
||||
if st.button("✕ Cancel", key=f"dz_cancel_task_{_t['id']}", use_container_width=True):
|
||||
cancel_task(get_db_path(), _t["id"])
|
||||
st.rerun()
|
||||
st.caption("")
|
||||
|
||||
_kill_col, _ = st.columns([2, 6])
|
||||
if _kill_col.button("⏹ Kill all stuck", use_container_width=True, disabled=len(_active) == 0):
|
||||
killed = kill_stuck_tasks(get_db_path())
|
||||
st.success(f"Killed {killed} task(s).")
|
||||
st.rerun()
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Rarely needed (collapsed) ─────────────────────────────────────────────
|
||||
with st.expander("More options", expanded=False):
|
||||
_rare1, _rare2, _rare3 = st.columns(3)
|
||||
|
||||
with _rare1:
|
||||
st.markdown("**Purge email data**")
|
||||
st.caption("Clears all email thread logs and email-sourced pending jobs.")
|
||||
if st.button("📧 Purge Email Data", use_container_width=True):
|
||||
st.session_state["confirm_dz"] = "email"
|
||||
if st.session_state.get("confirm_dz") == "email":
|
||||
st.warning("Deletes all email contacts and email-sourced jobs. Cannot be undone.")
|
||||
_ec1, _ec2 = st.columns(2)
|
||||
if _ec1.button("Yes, purge emails", type="primary", use_container_width=True, key="dz_email_confirm"):
|
||||
contacts, jobs = purge_email_data(get_db_path())
|
||||
st.success(f"Purged {contacts} email contacts, {jobs} email jobs.")
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
if _ec2.button("Cancel", use_container_width=True, key="dz_email_cancel"):
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
|
||||
with _rare2:
|
||||
st.markdown("**Purge non-remote**")
|
||||
st.caption("Removes pending/approved/rejected on-site listings from the DB.")
|
||||
if st.button("🏢 Purge On-site Jobs", use_container_width=True):
|
||||
st.session_state["confirm_dz"] = "non_remote"
|
||||
if st.session_state.get("confirm_dz") == "non_remote":
|
||||
st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.")
|
||||
_rc1, _rc2 = st.columns(2)
|
||||
if _rc1.button("Yes, purge on-site", type="primary", use_container_width=True, key="dz_nonremote_confirm"):
|
||||
deleted = purge_non_remote(get_db_path())
|
||||
st.success(f"Purged {deleted} non-remote jobs.")
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
if _rc2.button("Cancel", use_container_width=True, key="dz_nonremote_cancel"):
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
|
||||
with _rare3:
|
||||
st.markdown("**Wipe all + re-scrape**")
|
||||
st.caption("Deletes all non-applied jobs then immediately runs a fresh discovery.")
|
||||
if st.button("🔄 Wipe + Re-scrape", use_container_width=True):
|
||||
st.session_state["confirm_dz"] = "rescrape"
|
||||
if st.session_state.get("confirm_dz") == "rescrape":
|
||||
st.warning("Wipes ALL pending, approved, and rejected jobs, then re-scrapes. Applied and synced records are kept.")
|
||||
_wc1, _wc2 = st.columns(2)
|
||||
if _wc1.button("Yes, wipe + scrape", type="primary", use_container_width=True, key="dz_rescrape_confirm"):
|
||||
purge_jobs(get_db_path(), statuses=["pending", "approved", "rejected"])
|
||||
submit_task(get_db_path(), "discovery", 0)
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
if _wc2.button("Cancel", use_container_width=True, key="dz_rescrape_cancel"):
|
||||
st.session_state.pop("confirm_dz", None)
|
||||
st.rerun()
|
||||
|
||||
# ── Setup banners ─────────────────────────────────────────────────────────────
|
||||
if _profile and _profile.wizard_complete:
|
||||
_dismissed = set(_profile.dismissed_banners)
|
||||
_pending_banners = [b for b in _SETUP_BANNERS if b["key"] not in _dismissed]
|
||||
_pending_banners = [
|
||||
b for b in _SETUP_BANNERS
|
||||
if b["key"] not in _dismissed and not b.get("done", lambda: False)()
|
||||
]
|
||||
if _pending_banners:
|
||||
st.divider()
|
||||
st.markdown("#### Finish setting up Peregrine")
|
||||
for banner in _pending_banners:
|
||||
_bcol, _bdismiss = st.columns([10, 1])
|
||||
with _bcol:
|
||||
st.info(f"💡 {banner['text']} → _{banner['link_label']}_")
|
||||
_ic, _lc = st.columns([3, 1])
|
||||
_ic.info(f"💡 {banner['text']}")
|
||||
with _lc:
|
||||
st.write("")
|
||||
st.page_link("pages/2_Settings.py", label=banner['link_label'], icon="⚙️")
|
||||
with _bdismiss:
|
||||
st.write("")
|
||||
if st.button("✕", key=f"dismiss_banner_{banner['key']}", help="Dismiss"):
|
||||
|
|
|
|||
178
app/app.py
178
app/app.py
|
|
@ -8,6 +8,7 @@ Run: streamlit run app/app.py
|
|||
bash scripts/manage-ui.sh start
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
|
@ -16,17 +17,39 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
|||
|
||||
logging.basicConfig(level=logging.WARNING, format="%(name)s %(levelname)s: %(message)s")
|
||||
|
||||
# Load .env before any os.environ reads — safe to call inside Docker too
|
||||
# (uses setdefault, so Docker-injected vars take precedence over .env values)
|
||||
from circuitforge_core.config.settings import load_env as _load_env
|
||||
_load_env(Path(__file__).parent.parent / ".env")
|
||||
|
||||
IS_DEMO = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
||||
|
||||
import streamlit as st
|
||||
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
||||
from scripts.db_migrate import migrate_db
|
||||
from app.feedback import inject_feedback_button
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir, get_cloud_tier
|
||||
import sqlite3
|
||||
|
||||
_LOGO_CIRCLE = Path(__file__).parent / "static" / "peregrine_logo_circle.png"
|
||||
_LOGO_FULL = Path(__file__).parent / "static" / "peregrine_logo.png"
|
||||
|
||||
st.set_page_config(
|
||||
page_title="Job Seeker",
|
||||
page_icon="💼",
|
||||
page_title="Peregrine",
|
||||
page_icon=str(_LOGO_CIRCLE) if _LOGO_CIRCLE.exists() else "💼",
|
||||
layout="wide",
|
||||
)
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
resolve_session("peregrine")
|
||||
init_db(get_db_path())
|
||||
migrate_db(Path(get_db_path()))
|
||||
|
||||
# Demo tier — initialize once per session (cookie persistence handled client-side)
|
||||
if IS_DEMO and "simulated_tier" not in st.session_state:
|
||||
st.session_state["simulated_tier"] = "paid"
|
||||
|
||||
if _LOGO_CIRCLE.exists():
|
||||
st.logo(str(_LOGO_CIRCLE), icon_image=str(_LOGO_CIRCLE))
|
||||
|
||||
# ── Startup cleanup — runs once per server process via cache_resource ──────────
|
||||
@st.cache_resource
|
||||
|
|
@ -36,12 +59,12 @@ def _startup() -> None:
|
|||
2. Auto-queues re-runs for any research generated without SearXNG data,
|
||||
if SearXNG is now reachable.
|
||||
"""
|
||||
conn = sqlite3.connect(DEFAULT_DB)
|
||||
conn.execute(
|
||||
"UPDATE background_tasks SET status='failed', error='Interrupted by server restart',"
|
||||
" finished_at=datetime('now') WHERE status IN ('queued','running')"
|
||||
)
|
||||
conn.commit()
|
||||
# Reset only in-flight tasks — queued tasks survive for the scheduler to resume.
|
||||
# MUST run before any submit_task() call in this function.
|
||||
from scripts.db import reset_running_tasks
|
||||
reset_running_tasks(get_db_path())
|
||||
|
||||
conn = sqlite3.connect(get_db_path())
|
||||
|
||||
# Auto-recovery: re-run LLM-only research when SearXNG is available
|
||||
try:
|
||||
|
|
@ -57,7 +80,7 @@ def _startup() -> None:
|
|||
_ACTIVE_STAGES,
|
||||
).fetchall()
|
||||
for (job_id,) in rows:
|
||||
submit_task(str(DEFAULT_DB), "company_research", job_id)
|
||||
submit_task(str(get_db_path()), "company_research", job_id)
|
||||
except Exception:
|
||||
pass # never block startup
|
||||
|
||||
|
|
@ -74,15 +97,24 @@ except Exception:
|
|||
|
||||
# ── First-run wizard gate ───────────────────────────────────────────────────────
|
||||
from scripts.user_profile import UserProfile as _UserProfile
|
||||
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
|
||||
_USER_YAML = get_config_dir() / "user.yaml"
|
||||
|
||||
_show_wizard = (
|
||||
_show_wizard = not IS_DEMO and (
|
||||
not _UserProfile.exists(_USER_YAML)
|
||||
or not _UserProfile(_USER_YAML).wizard_complete
|
||||
)
|
||||
if _show_wizard:
|
||||
_setup_page = st.Page("pages/0_Setup.py", title="Setup", icon="👋")
|
||||
st.navigation({"": [_setup_page]}).run()
|
||||
# Sync UI cookie even during wizard so vue preference redirects correctly.
|
||||
# Tier not yet computed here — use cloud tier (or "free" fallback).
|
||||
try:
|
||||
from app.components.ui_switcher import sync_ui_cookie as _sync_wizard_cookie
|
||||
from app.cloud_session import get_cloud_tier as _gctr
|
||||
_wizard_tier = _gctr() if _gctr() != "local" else "free"
|
||||
_sync_wizard_cookie(_USER_YAML, _wizard_tier)
|
||||
except Exception:
|
||||
pass
|
||||
st.stop()
|
||||
|
||||
# ── Navigation ─────────────────────────────────────────────────────────────────
|
||||
|
|
@ -107,35 +139,53 @@ pg = st.navigation(pages)
|
|||
# ── Background task sidebar indicator ─────────────────────────────────────────
|
||||
# Fragment polls every 3s so stage labels update live without a full page reload.
|
||||
# The sidebar context WRAPS the fragment call — do not write to st.sidebar inside it.
|
||||
_TASK_LABELS = {
|
||||
"cover_letter": "Cover letter",
|
||||
"company_research": "Research",
|
||||
"email_sync": "Email sync",
|
||||
"discovery": "Discovery",
|
||||
"enrich_descriptions": "Enriching descriptions",
|
||||
"score": "Scoring matches",
|
||||
"scrape_url": "Scraping listing",
|
||||
"enrich_craigslist": "Enriching listing",
|
||||
"wizard_generate": "Wizard generation",
|
||||
"prepare_training": "Training data",
|
||||
}
|
||||
_DISCOVERY_PIPELINE = ["discovery", "enrich_descriptions", "score"]
|
||||
|
||||
|
||||
@st.fragment(run_every=3)
|
||||
def _task_indicator():
|
||||
tasks = get_active_tasks(DEFAULT_DB)
|
||||
tasks = get_active_tasks(get_db_path())
|
||||
if not tasks:
|
||||
return
|
||||
st.divider()
|
||||
st.markdown(f"**⏳ {len(tasks)} task(s) running**")
|
||||
for t in tasks:
|
||||
icon = "⏳" if t["status"] == "running" else "🕐"
|
||||
task_type = t["task_type"]
|
||||
if task_type == "cover_letter":
|
||||
label = "Cover letter"
|
||||
elif task_type == "company_research":
|
||||
label = "Research"
|
||||
elif task_type == "email_sync":
|
||||
label = "Email sync"
|
||||
elif task_type == "discovery":
|
||||
label = "Discovery"
|
||||
elif task_type == "enrich_descriptions":
|
||||
label = "Enriching"
|
||||
elif task_type == "scrape_url":
|
||||
label = "Scraping URL"
|
||||
elif task_type == "wizard_generate":
|
||||
label = "Wizard generation"
|
||||
elif task_type == "enrich_craigslist":
|
||||
label = "Enriching listing"
|
||||
else:
|
||||
label = task_type.replace("_", " ").title()
|
||||
stage = t.get("stage") or ""
|
||||
|
||||
pipeline_set = set(_DISCOVERY_PIPELINE)
|
||||
pipeline_tasks = [t for t in tasks if t["task_type"] in pipeline_set]
|
||||
other_tasks = [t for t in tasks if t["task_type"] not in pipeline_set]
|
||||
|
||||
# Discovery pipeline: render as ordered sub-queue with indented steps
|
||||
if pipeline_tasks:
|
||||
ordered = [
|
||||
next((t for t in pipeline_tasks if t["task_type"] == typ), None)
|
||||
for typ in _DISCOVERY_PIPELINE
|
||||
]
|
||||
ordered = [t for t in ordered if t is not None]
|
||||
for i, t in enumerate(ordered):
|
||||
icon = "⏳" if t["status"] == "running" else "🕐"
|
||||
label = _TASK_LABELS.get(t["task_type"], t["task_type"].replace("_", " ").title())
|
||||
stage = t.get("stage") or ""
|
||||
detail = f" · {stage}" if stage else ""
|
||||
prefix = "" if i == 0 else "↳ "
|
||||
st.caption(f"{prefix}{icon} {label}{detail}")
|
||||
|
||||
# All other tasks (cover letter, email sync, etc.) as individual rows
|
||||
for t in other_tasks:
|
||||
icon = "⏳" if t["status"] == "running" else "🕐"
|
||||
label = _TASK_LABELS.get(t["task_type"], t["task_type"].replace("_", " ").title())
|
||||
stage = t.get("stage") or ""
|
||||
detail = f" · {stage}" if stage else (f" — {t.get('company')}" if t.get("company") else "")
|
||||
st.caption(f"{icon} {label}{detail}")
|
||||
|
||||
|
|
@ -150,9 +200,67 @@ def _get_version() -> str:
|
|||
except Exception:
|
||||
return "dev"
|
||||
|
||||
# ── Effective tier (resolved before sidebar so switcher can use it) ──────────
|
||||
# get_cloud_tier() returns "local" in dev/self-hosted mode, real tier in cloud.
|
||||
_ui_profile = _UserProfile(_USER_YAML) if _UserProfile.exists(_USER_YAML) else None
|
||||
_ui_yaml_tier = _ui_profile.effective_tier if _ui_profile else "free"
|
||||
_ui_cloud_tier = get_cloud_tier()
|
||||
_ui_tier = _ui_cloud_tier if _ui_cloud_tier != "local" else _ui_yaml_tier
|
||||
|
||||
with st.sidebar:
|
||||
if IS_DEMO:
|
||||
st.info(
|
||||
"**Public demo** — read-only sample data. "
|
||||
"AI features and data saves are disabled.\n\n"
|
||||
"[Get your own instance →](https://circuitforge.tech/software/peregrine)",
|
||||
icon="🔒",
|
||||
)
|
||||
_task_indicator()
|
||||
|
||||
# Cloud LLM indicator — shown whenever any cloud backend is active
|
||||
_llm_cfg_path = Path(__file__).parent.parent / "config" / "llm.yaml"
|
||||
try:
|
||||
import yaml as _yaml
|
||||
from scripts.byok_guard import cloud_backends as _cloud_backends
|
||||
_active_cloud = _cloud_backends(_yaml.safe_load(_llm_cfg_path.read_text(encoding="utf-8")) or {})
|
||||
except Exception:
|
||||
_active_cloud = []
|
||||
if _active_cloud:
|
||||
_provider_names = ", ".join(b.replace("_", " ").title() for b in _active_cloud)
|
||||
st.warning(
|
||||
f"**Cloud LLM active**\n\n"
|
||||
f"{_provider_names}\n\n"
|
||||
"AI features send content to this provider. "
|
||||
"[Change in Settings](2_Settings)",
|
||||
icon="🔓",
|
||||
)
|
||||
|
||||
st.divider()
|
||||
try:
|
||||
from app.components.ui_switcher import render_sidebar_switcher
|
||||
render_sidebar_switcher(_USER_YAML, _ui_tier)
|
||||
except Exception:
|
||||
pass # never crash the app over the sidebar switcher
|
||||
st.caption(f"Peregrine {_get_version()}")
|
||||
inject_feedback_button(page=pg.title)
|
||||
|
||||
# ── Demo toolbar (DEMO_MODE only) ───────────────────────────────────────────
|
||||
if IS_DEMO:
|
||||
from app.components.demo_toolbar import render_demo_toolbar
|
||||
render_demo_toolbar()
|
||||
|
||||
# ── UI switcher banner (paid tier; or all visitors in demo mode) ─────────────
|
||||
try:
|
||||
from app.components.ui_switcher import render_banner
|
||||
render_banner(_USER_YAML, _ui_tier)
|
||||
except Exception:
|
||||
pass # never crash the app over the banner
|
||||
|
||||
pg.run()
|
||||
|
||||
# ── UI preference cookie sync (runs after page render) ──────────────────────
|
||||
try:
|
||||
from app.components.ui_switcher import sync_ui_cookie
|
||||
sync_ui_cookie(_USER_YAML, _ui_tier)
|
||||
except Exception:
|
||||
pass # never crash the app over cookie sync
|
||||
|
|
|
|||
219
app/cloud_session.py
Normal file
219
app/cloud_session.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
# peregrine/app/cloud_session.py
|
||||
"""
|
||||
Cloud session middleware for multi-tenant Peregrine deployment.
|
||||
|
||||
In local-first mode (CLOUD_MODE unset or false), all functions are no-ops.
|
||||
In cloud mode (CLOUD_MODE=true), resolves the Directus session JWT from the
|
||||
X-CF-Session header, validates it, and injects user_id + db_path into
|
||||
st.session_state.
|
||||
|
||||
All Peregrine pages call get_db_path() instead of DEFAULT_DB directly to
|
||||
transparently support both local and cloud deployments.
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import hmac
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import streamlit as st
|
||||
|
||||
from scripts.db import DEFAULT_DB
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
CLOUD_MODE: bool = os.environ.get("CLOUD_MODE", "").lower() in ("1", "true", "yes")
|
||||
CLOUD_DATA_ROOT: Path = Path(os.environ.get("CLOUD_DATA_ROOT", "/devl/menagerie-data"))
|
||||
DIRECTUS_JWT_SECRET: str = os.environ.get("DIRECTUS_JWT_SECRET", "")
|
||||
SERVER_SECRET: str = os.environ.get("CF_SERVER_SECRET", "")
|
||||
|
||||
# Heimdall license server — internal URL preferred when running on the same host
|
||||
HEIMDALL_URL: str = os.environ.get("HEIMDALL_URL", "https://license.circuitforge.tech")
|
||||
HEIMDALL_ADMIN_TOKEN: str = os.environ.get("HEIMDALL_ADMIN_TOKEN", "")
|
||||
|
||||
|
||||
def _extract_session_token(cookie_header: str) -> str:
|
||||
"""Extract cf_session value from a Cookie header string."""
|
||||
m = re.search(r'(?:^|;)\s*cf_session=([^;]+)', cookie_header)
|
||||
return m.group(1).strip() if m else ""
|
||||
|
||||
|
||||
def _ensure_provisioned(user_id: str, product: str) -> None:
    """Ask Heimdall to create a free key for this user if none exists yet.

    Idempotent — Heimdall is a no-op when a key already exists for the
    (user_id, product) pair. Invoked once per session start so new Google
    OAuth signups automatically receive a free-tier key.
    """
    if not HEIMDALL_ADMIN_TOKEN:
        # No admin credentials configured — provisioning is impossible.
        return
    payload = {"directus_user_id": user_id, "product": product, "tier": "free"}
    auth_headers = {"Authorization": f"Bearer {HEIMDALL_ADMIN_TOKEN}"}
    try:
        requests.post(
            f"{HEIMDALL_URL}/admin/provision",
            json=payload,
            headers=auth_headers,
            timeout=5,
        )
    except Exception as exc:
        # Best-effort: a failed provision must never block the session.
        log.warning("Heimdall provision failed for user %s: %s", user_id, exc)
|
||||
|
||||
|
||||
@st.cache_data(ttl=300, show_spinner=False)
def _fetch_cloud_tier(user_id: str, product: str) -> str:
    """Resolve this user's current cloud tier via Heimdall.

    Results are cached per (user_id, product) for 5 minutes so Streamlit's
    rerun-per-interaction model doesn't hammer Heimdall. Every failure mode
    degrades to "free" so a licensing-server hiccup never blocks the user.
    """
    if not HEIMDALL_ADMIN_TOKEN:
        log.warning("HEIMDALL_ADMIN_TOKEN not set — defaulting tier to free")
        return "free"
    try:
        response = requests.post(
            f"{HEIMDALL_URL}/admin/cloud/resolve",
            json={"user_id": user_id, "product": product},
            headers={"Authorization": f"Bearer {HEIMDALL_ADMIN_TOKEN}"},
            timeout=5,
        )
        # Keep the body parsing inside the try: a malformed JSON payload
        # should degrade to "free" exactly like a network error would.
        if response.status_code == 200:
            return response.json().get("tier", "free")
        if response.status_code == 404:
            # No cloud key yet — user signed up before provisioning ran.
            return "free"
        log.warning("Heimdall resolve returned %s — defaulting tier to free", response.status_code)
    except Exception as exc:
        log.warning("Heimdall tier resolve failed: %s — defaulting to free", exc)
    return "free"
|
||||
|
||||
|
||||
def validate_session_jwt(token: str) -> str:
    """Decode and verify a Directus session JWT; return the user UUID.

    Raises on any failure: PyJWT's exceptions for signature/format problems,
    ValueError when the token verifies but carries no user id claim.
    """
    import jwt  # PyJWT — imported lazily so local mode never needs it

    claims = jwt.decode(token, DIRECTUS_JWT_SECRET, algorithms=["HS256"])
    # Directus puts the user UUID in "id"; fall back to the standard "sub".
    user_id = claims.get("id") or claims.get("sub")
    if not user_id:
        raise ValueError("JWT missing user id claim")
    return user_id
|
||||
|
||||
|
||||
def _user_data_path(user_id: str, app: str) -> Path:
    """Per-tenant data directory: <CLOUD_DATA_ROOT>/<user_id>/<app>."""
    return CLOUD_DATA_ROOT.joinpath(user_id, app)
|
||||
|
||||
|
||||
def derive_db_key(user_id: str) -> str:
    """Derive a per-user SQLCipher key as HMAC-SHA256(server_secret, user_id).

    Deterministic: the same user always gets the same key, so no key storage
    is needed — only the server secret must be protected.
    """
    mac = hmac.new(SERVER_SECRET.encode(), user_id.encode(), hashlib.sha256)
    return mac.hexdigest()
|
||||
|
||||
|
||||
def _render_auth_wall(message: str = "Please sign in to continue.") -> None:
    """Show a branded sign-in prompt with the sidebar hidden.

    Does not halt the script itself — callers follow up with st.stop().
    """
    # Hide the sidebar and its collapse control so the wall is the only UI.
    st.markdown(
        """
        <style>
        [data-testid="stSidebar"] { display: none; }
        [data-testid="collapsedControl"] { display: none; }
        </style>
        """,
        unsafe_allow_html=True,
    )
    # Center the prompt in the middle third of the page.
    _, middle, _ = st.columns([1, 2, 1])
    with middle:
        st.markdown("## 🦅 Peregrine")
        st.info(message, icon="🔒")
        st.link_button(
            "Sign in to CircuitForge",
            url="https://circuitforge.tech/login?next=/peregrine",
            use_container_width=True,
        )
|
||||
|
||||
|
||||
def resolve_session(app: str = "peregrine") -> None:
    """
    Call at the top of each Streamlit page.
    In local mode: no-op.
    In cloud mode: reads X-CF-Session header, validates JWT, creates user
    data directory on first visit, and sets st.session_state keys:
    - user_id: str
    - db_path: Path
    - db_key: str (SQLCipher key for this user)
    - cloud_tier: str (free | paid | premium | ultra — resolved from Heimdall)
    Idempotent — skips if user_id already in session_state.
    """
    if not CLOUD_MODE:
        return  # local-first deployment: nothing to resolve
    if st.session_state.get("user_id"):
        return  # already resolved earlier in this session

    # Primary: Caddy injects X-CF-Session header in production.
    # Fallback: direct access (E2E tests, dev without Caddy) reads the cookie header.
    # NOTE(review): both sources go through _extract_session_token, which expects
    # "cf_session=<jwt>" cookie syntax — confirm Caddy injects the header in that
    # form rather than as a bare JWT.
    cookie_header = (
        st.context.headers.get("x-cf-session", "")
        or st.context.headers.get("cookie", "")
    )
    session_jwt = _extract_session_token(cookie_header)
    if not session_jwt:
        _render_auth_wall("Please sign in to access Peregrine.")
        st.stop()  # halts the script — nothing below runs unauthenticated

    try:
        user_id = validate_session_jwt(session_jwt)
    except Exception:
        # Any decode/verify failure: expired, bad signature, missing id claim.
        _render_auth_wall("Your session has expired. Please sign in again.")
        st.stop()

    # First visit: create the per-tenant directory tree (idempotent mkdirs).
    user_path = _user_data_path(user_id, app)
    user_path.mkdir(parents=True, exist_ok=True)
    config_path = user_path / "config"
    config_path.mkdir(exist_ok=True)
    (user_path / "data").mkdir(exist_ok=True)

    # Bootstrap config files that the UI requires to exist — never overwrite
    _kw = config_path / "resume_keywords.yaml"
    if not _kw.exists():
        _kw.write_text("skills: []\ndomains: []\nkeywords: []\n")

    st.session_state["user_id"] = user_id
    st.session_state["db_path"] = user_path / "staging.db"
    st.session_state["db_key"] = derive_db_key(user_id)
    _ensure_provisioned(user_id, app)  # idempotent free-key provision in Heimdall
    st.session_state["cloud_tier"] = _fetch_cloud_tier(user_id, app)
|
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Active staging-db path for this session.

    Cloud mode: the user-scoped path that resolve_session() stored in
    session_state. Local mode: DEFAULT_DB (STAGING_DB env var or repo default).
    """
    return st.session_state.get("db_path", DEFAULT_DB)
|
||||
|
||||
|
||||
def get_config_dir() -> Path:
    """Config directory for the current session.

    Cloud mode: per-user <data_root>/<user_id>/<app>/config/ so each tenant's
    YAML files (user.yaml, plain_text_resume.yaml, …) stay isolated.
    Local mode: the repo-level config/ directory next to this package.
    """
    db_path = st.session_state.get("db_path")
    if CLOUD_MODE and db_path:
        # db_path is <user dir>/staging.db — config lives beside it.
        return Path(db_path).parent / "config"
    return Path(__file__).parent.parent / "config"
|
||||
|
||||
|
||||
def get_cloud_tier() -> str:
    """Current user's cloud tier.

    Cloud mode: the Heimdall-resolved value cached at session start
    (defaults to "free" if resolution hasn't happened yet).
    Local mode: the sentinel "local" so pages can tell self-hosted from cloud.
    """
    if CLOUD_MODE:
        return st.session_state.get("cloud_tier", "free")
    return "local"
|
||||
1
app/components/__init__.py
Normal file
1
app/components/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# app/components/__init__.py
|
||||
72
app/components/demo_toolbar.py
Normal file
72
app/components/demo_toolbar.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""Demo toolbar — tier simulation for DEMO_MODE instances.
|
||||
|
||||
Renders a slim full-width bar above the Streamlit nav showing
|
||||
Free / Paid / Premium pills. Clicking a pill sets a prgn_demo_tier
|
||||
cookie (for persistence across reloads) and st.session_state.simulated_tier
|
||||
(for immediate use within the current render pass).
|
||||
|
||||
Only ever rendered when DEMO_MODE=true.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import streamlit as st
|
||||
import streamlit.components.v1 as components
|
||||
|
||||
_VALID_TIERS = ("free", "paid", "premium")
|
||||
_DEFAULT_TIER = "paid" # most compelling first impression
|
||||
|
||||
_DEMO_MODE = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
||||
|
||||
_COOKIE_JS = """
|
||||
<script>
|
||||
(function() {{
|
||||
document.cookie = 'prgn_demo_tier={tier}; path=/; SameSite=Lax';
|
||||
}})();
|
||||
</script>
|
||||
"""
|
||||
|
||||
|
||||
def get_simulated_tier() -> str:
    """Currently simulated demo tier; falls back to the default ('paid')."""
    return st.session_state.get("simulated_tier", _DEFAULT_TIER)
|
||||
|
||||
|
||||
def set_simulated_tier(tier: str) -> None:
    """Persist the simulated tier and rerun the page.

    Writes both session state (immediate effect this render pass) and the
    prgn_demo_tier cookie (survives full page reloads). Unknown tier names
    are ignored silently.
    """
    if tier not in _VALID_TIERS:
        return
    st.session_state["simulated_tier"] = tier
    # height=0: the cookie-setting iframe must not take up layout space.
    components.html(_COOKIE_JS.format(tier=tier), height=0)
    st.rerun()
|
||||
|
||||
|
||||
def render_demo_toolbar() -> None:
    """Render the demo mode toolbar.

    Shows a dismissible info bar with tier-selection pills.
    Call this at the TOP of app.py's render pass, before pg.run().
    """
    current = get_simulated_tier()

    # Active tier gets a check mark appended to its pill label.
    labels = {t: t.capitalize() + (" ✓" if t == current else "") for t in _VALID_TIERS}

    with st.container():
        # Layout: caption | Free | Paid | Premium | marketing link.
        cols = st.columns([3, 1, 1, 1, 2])
        with cols[0]:
            st.caption("🎭 **Demo mode** — exploring as:")
        for i, tier in enumerate(_VALID_TIERS):
            with cols[i + 1]:
                is_active = tier == current
                if st.button(
                    labels[tier],
                    key=f"_demo_tier_{tier}",
                    type="primary" if is_active else "secondary",
                    use_container_width=True,
                ):
                    # Clicking the already-active pill is a no-op; any other
                    # pill persists the choice and triggers a rerun.
                    if not is_active:
                        set_simulated_tier(tier)
        with cols[4]:
            st.caption("[Get your own →](https://circuitforge.tech/software/peregrine)")
    st.divider()
|
||||
192
app/components/linkedin_import.py
Normal file
192
app/components/linkedin_import.py
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
# app/components/linkedin_import.py
|
||||
"""
|
||||
Shared LinkedIn import widget.
|
||||
|
||||
Usage in a page:
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
|
||||
# At top of page render — check for pending import:
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
st.session_state["_parsed_resume"] = _li_data
|
||||
st.rerun()
|
||||
|
||||
# Inside the LinkedIn tab:
|
||||
with tab_linkedin:
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=tier)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
_LINKEDIN_PROFILE_RE = re.compile(r"https?://(www\.)?linkedin\.com/in/", re.I)
|
||||
|
||||
|
||||
def _stage_path(config_dir: Path) -> Path:
|
||||
return config_dir / "linkedin_stage.json"
|
||||
|
||||
|
||||
def _load_stage(config_dir: Path) -> dict | None:
    """Load the staged LinkedIn import from disk.

    Returns None when the stage file is missing or unreadable/corrupt —
    callers treat that as "nothing imported yet".
    """
    path = _stage_path(config_dir)
    if not path.exists():
        return None
    try:
        # Explicit UTF-8: JSON stage data may contain non-ASCII names; the
        # platform default codec (e.g. cp1252 on Windows) would mangle it.
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Corrupt/partial stage file — behave as if nothing was staged.
        return None
|
||||
|
||||
|
||||
def _days_ago(iso_ts: str) -> str:
|
||||
try:
|
||||
dt = datetime.fromisoformat(iso_ts)
|
||||
delta = datetime.now(timezone.utc) - dt
|
||||
days = delta.days
|
||||
if days == 0:
|
||||
return "today"
|
||||
if days == 1:
|
||||
return "yesterday"
|
||||
return f"{days} days ago"
|
||||
except Exception:
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _do_scrape(url: str, config_dir: Path) -> None:
    """Validate *url*, run the profile scrape, and surface the outcome in the UI.

    Error mapping (mirrors how scrape_profile's failures are presented):
    - ValueError   → st.error (bad input / rejected profile)
    - RuntimeError → st.warning (transient scraper trouble)
    - anything else → st.error with the exception text
    """
    if not _LINKEDIN_PROFILE_RE.match(url):
        st.error("Please enter a LinkedIn profile URL (linkedin.com/in/…)")
        return

    with st.spinner("Fetching LinkedIn profile… (10–20 seconds)"):
        try:
            from scripts.linkedin_scraper import scrape_profile
            scrape_profile(url, _stage_path(config_dir))
        except ValueError as e:
            st.error(str(e))
        except RuntimeError as e:
            st.warning(str(e))
        except Exception as e:
            st.error(f"Unexpected error: {e}")
        else:
            # Success path lives in `else` so st.rerun()'s control-flow
            # exception (Streamlit implements rerun by raising) is NOT
            # swallowed by the broad `except Exception` handler above —
            # previously the rerun was caught and shown as an error.
            st.success("Profile imported successfully.")
            st.rerun()
|
||||
|
||||
|
||||
def render_linkedin_tab(config_dir: Path, tier: str) -> None:
    """
    Render the LinkedIn import UI.

    When the user clicks "Use this data", writes the extracted dict to
    st.session_state["_linkedin_extracted"] and calls st.rerun().

    Caller reads: data = st.session_state.pop("_linkedin_extracted", None)
    """
    # NOTE(review): `tier` is never read in this body — confirm whether
    # tier-gating was intended here or the parameter is reserved for later.
    stage = _load_stage(config_dir)

    # ── Staged data status bar ────────────────────────────────────────────────
    if stage:
        scraped_at = stage.get("scraped_at", "")
        source_label = "LinkedIn export" if stage.get("source") == "export_zip" else "LinkedIn profile"
        col_info, col_refresh = st.columns([4, 1])
        col_info.caption(f"Last imported from {source_label}: {_days_ago(scraped_at)}")
        if col_refresh.button("🔄 Refresh", key="li_refresh"):
            # Re-scrape from the originally imported URL when we still have it.
            url = stage.get("url")
            if url:
                _do_scrape(url, config_dir)
            else:
                # e.g. stage came from a zip export — no URL to refresh from.
                st.info("Original URL not available — paste the URL below to re-import.")

    # ── URL import ────────────────────────────────────────────────────────────
    st.markdown("**Import from LinkedIn profile URL**")
    url_input = st.text_input(
        "LinkedIn profile URL",
        placeholder="https://linkedin.com/in/your-name",
        label_visibility="collapsed",
        key="li_url_input",
    )
    if st.button("🔗 Import from LinkedIn", key="li_import_btn", type="primary"):
        if not url_input.strip():
            st.warning("Please enter your LinkedIn profile URL.")
        else:
            _do_scrape(url_input.strip(), config_dir)

    st.caption(
        "Imports from your public LinkedIn profile. No login or credentials required. "
        "Scraping typically takes 10–20 seconds."
    )
    st.info(
        "**LinkedIn limits public profile data.** Without logging in, LinkedIn only "
        "exposes your name, About summary, current employer, and certifications — "
        "past roles, education, and skills are hidden behind their login wall. "
        "For your full career history use the **data export zip** option below.",
        icon="ℹ️",
    )

    # ── Section preview + use button ─────────────────────────────────────────
    if stage:
        from scripts.linkedin_parser import parse_stage
        # parse_stage returns (extracted_dict, error_message_or_None).
        extracted, err = parse_stage(_stage_path(config_dir))

        if err:
            st.warning(f"Could not read staged data: {err}")
        else:
            st.divider()
            st.markdown("**Preview**")
            col1, col2, col3 = st.columns(3)
            col1.metric("Experience entries", len(extracted.get("experience", [])))
            col2.metric("Skills", len(extracted.get("skills", [])))
            # LinkedIn certifications are stored under the "achievements" key.
            col3.metric("Certifications", len(extracted.get("achievements", [])))

            if extracted.get("career_summary"):
                with st.expander("Summary"):
                    st.write(extracted["career_summary"])

            if extracted.get("experience"):
                with st.expander(f"Experience ({len(extracted['experience'])} entries)"):
                    for exp in extracted["experience"]:
                        st.markdown(f"**{exp.get('title')}** @ {exp.get('company')} · {exp.get('date_range', '')}")

            if extracted.get("education"):
                with st.expander("Education"):
                    for edu in extracted["education"]:
                        st.markdown(f"**{edu.get('school')}** — {edu.get('degree')} {edu.get('field', '')}".strip())

            if extracted.get("skills"):
                with st.expander("Skills"):
                    st.write(", ".join(extracted["skills"]))

            st.divider()
            # Hand the parsed data back to the page via session state.
            if st.button("✅ Use this data", key="li_use_btn", type="primary"):
                st.session_state["_linkedin_extracted"] = extracted
                st.rerun()

    # ── Advanced: data export ─────────────────────────────────────────────────
    with st.expander("⬇️ Import from LinkedIn data export (advanced)", expanded=False):
        st.caption(
            "Download your LinkedIn data: **Settings & Privacy → Data Privacy → "
            "Get a copy of your data → Request archive → Fast file**. "
            "The Fast file is available immediately and contains your profile, "
            "experience, education, and skills."
        )
        zip_file = st.file_uploader(
            "Upload LinkedIn export zip", type=["zip"], key="li_zip_upload"
        )
        if zip_file is not None:
            if st.button("📦 Parse export", key="li_parse_zip"):
                with st.spinner("Parsing export archive…"):
                    try:
                        from scripts.linkedin_scraper import parse_export_zip
                        extracted = parse_export_zip(
                            zip_file.read(), _stage_path(config_dir)
                        )
                        st.success(
                            f"Imported {len(extracted.get('experience', []))} experience entries, "
                            f"{len(extracted.get('skills', []))} skills. "
                            "Click 'Use this data' above to apply."
                        )
                        # NOTE(review): st.rerun() raises a control-flow
                        # exception inside this try — the broad `except
                        # Exception` below likely swallows it and reports a
                        # spurious parse failure; confirm and consider an
                        # `else` clause as done in _do_scrape.
                        st.rerun()
                    except Exception as e:
                        st.error(f"Failed to parse export: {e}")
|
||||
31
app/components/paste_image.py
Normal file
31
app/components/paste_image.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
"""
|
||||
Paste-from-clipboard / drag-and-drop image component.
|
||||
|
||||
Uses st.components.v1.declare_component so JS can return image bytes to Python
|
||||
(st.components.v1.html() is one-way only). No build step required — the
|
||||
frontend is a single index.html file.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit.components.v1 as components
|
||||
|
||||
_FRONTEND = Path(__file__).parent / "paste_image_ui"
|
||||
|
||||
_paste_image = components.declare_component("paste_image", path=str(_FRONTEND))
|
||||
|
||||
|
||||
def paste_image_component(key: str | None = None) -> bytes | None:
    """
    Render the paste/drop zone.

    Returns the decoded PNG/JPEG bytes once the user pastes or drops an
    image; None while nothing has been submitted yet (or when the frontend
    payload is not valid base64).
    """
    encoded = _paste_image(key=key)
    if not encoded:
        return None
    try:
        return base64.b64decode(encoded)
    except Exception:
        # Malformed payload from the frontend — treat as "no image yet".
        return None
|
||||
142
app/components/paste_image_ui/index.html
Normal file
142
app/components/paste_image_ui/index.html
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
<!DOCTYPE html>
<!-- Paste/drag-and-drop image frontend for the paste_image Streamlit
     component. Single file, no build step: talks to Streamlit via the raw
     postMessage protocol (componentReady / setFrameHeight / render /
     setComponentValue) and returns the image as a base64 string. -->
<html>
<head>
<meta charset="utf-8">
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
  font-family: -apple-system, BlinkMacSystemFont, "Source Sans Pro", sans-serif;
  background: transparent;
}
/* The clickable paste/drop target. CSS variables are set from the
   Streamlit theme in the "streamlit:render" message handler below. */
.zone {
  width: 100%;
  min-height: 72px;
  border: 2px dashed var(--border, #ccc);
  border-radius: 8px;
  display: flex;
  align-items: center;
  justify-content: center;
  flex-direction: column;
  gap: 6px;
  padding: 12px 16px;
  cursor: pointer;
  outline: none;
  transition: border-color 0.15s, background 0.15s;
  color: var(--text-muted, #888);
  font-size: 13px;
  text-align: center;
  user-select: none;
}
.zone:focus { border-color: var(--primary, #ff4b4b); background: var(--primary-faint, rgba(255,75,75,0.06)); }
.zone.dragover { border-color: var(--primary, #ff4b4b); background: var(--primary-faint, rgba(255,75,75,0.06)); }
.zone.done { border-style: solid; border-color: #00c853; color: #00c853; }
.icon { font-size: 22px; line-height: 1; }
.hint { font-size: 11px; opacity: 0.7; }
.status { margin-top: 5px; font-size: 11px; text-align: center; color: var(--text-muted, #888); min-height: 16px; }
</style>
</head>
<body>
<div class="zone" id="zone" tabindex="0" role="button"
     aria-label="Click to focus, then paste with Ctrl+V, or drag and drop an image">
  <span class="icon">📋</span>
  <span id="mainMsg"><strong>Click here</strong>, then <strong>Ctrl+V</strong> to paste</span>
  <span class="hint" id="hint">or drag & drop an image file</span>
</div>
<div class="status" id="status"></div>

<script>
const zone = document.getElementById('zone');
const status = document.getElementById('status');
const mainMsg = document.getElementById('mainMsg');
const hint = document.getElementById('hint');

// ── Streamlit handshake ─────────────────────────────────────────────────
// Tell the host page the component is ready to receive render messages.
window.parent.postMessage({ type: "streamlit:componentReady", apiVersion: 1 }, "*");

// Keep the iframe sized to its content (+4px slack for borders).
function setHeight() {
  const h = document.body.scrollHeight + 4;
  window.parent.postMessage({ type: "streamlit:setFrameHeight", height: h }, "*");
}
setHeight();

// ── Theme ───────────────────────────────────────────────────────────────
// Mirror the host Streamlit theme into CSS variables on each render.
window.addEventListener("message", (e) => {
  if (e.data && e.data.type === "streamlit:render") {
    const t = e.data.args && e.data.args.theme;
    if (!t) return;
    const r = document.documentElement;
    r.style.setProperty("--primary", t.primaryColor || "#ff4b4b");
    r.style.setProperty("--primary-faint", (t.primaryColor || "#ff4b4b") + "10");
    r.style.setProperty("--text-muted", t.textColor ? t.textColor + "99" : "#888");
    r.style.setProperty("--border", t.textColor ? t.textColor + "33" : "#ccc");
    document.body.style.background = t.backgroundColor || "transparent";
  }
});

// ── Image handling ──────────────────────────────────────────────────────
// Swap the zone contents for a "done" confirmation once an image is sent.
function markDone() {
  zone.classList.add('done');
  // Clear children and rebuild with safe DOM methods
  while (zone.firstChild) zone.removeChild(zone.firstChild);
  const icon = document.createElement('span');
  icon.className = 'icon';
  icon.textContent = '\u2705';
  const msg = document.createElement('span');
  msg.textContent = 'Image ready \u2014 remove or replace below';
  zone.appendChild(icon);
  zone.appendChild(msg);
  setHeight();
}

// Read the blob as a data URL, strip the "data:...;base64," prefix, and
// hand the bare base64 payload back to Python as the component value.
function sendImage(blob) {
  const reader = new FileReader();
  reader.onload = function(ev) {
    const dataUrl = ev.target.result;
    const b64 = dataUrl.slice(dataUrl.indexOf(',') + 1);
    window.parent.postMessage({ type: "streamlit:setComponentValue", value: b64 }, "*");
    markDone();
  };
  reader.readAsDataURL(blob);
}

// First item in a DataTransferItemList whose MIME type is image/*, or null.
function findImageItem(items) {
  if (!items) return null;
  for (let i = 0; i < items.length; i++) {
    if (items[i].type && items[i].type.indexOf('image/') === 0) return items[i];
  }
  return null;
}

// Ctrl+V paste (works over HTTP — uses paste event, not Clipboard API)
document.addEventListener('paste', function(e) {
  const item = findImageItem(e.clipboardData && e.clipboardData.items);
  if (item) { sendImage(item.getAsFile()); e.preventDefault(); }
});

// Drag and drop
zone.addEventListener('dragover', function(e) {
  e.preventDefault();
  zone.classList.add('dragover');
});
zone.addEventListener('dragleave', function() {
  zone.classList.remove('dragover');
});
zone.addEventListener('drop', function(e) {
  e.preventDefault();
  zone.classList.remove('dragover');
  const files = e.dataTransfer && e.dataTransfer.files;
  if (files && files.length) {
    for (let i = 0; i < files.length; i++) {
      if (files[i].type.indexOf('image/') === 0) { sendImage(files[i]); return; }
    }
  }
  // Fallback: dataTransfer items (e.g. dragged from browser)
  const item = findImageItem(e.dataTransfer && e.dataTransfer.items);
  if (item) sendImage(item.getAsFile());
});

// Click to focus so Ctrl+V lands in this iframe
zone.addEventListener('click', function() { zone.focus(); });
</script>
</body>
</html>
|
||||
262
app/components/ui_switcher.py
Normal file
262
app/components/ui_switcher.py
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
"""UI switcher component for Peregrine.
|
||||
|
||||
Manages the prgn_ui cookie (Caddy routing signal) and user.yaml
|
||||
ui_preference (durability across browser clears).
|
||||
|
||||
Cookie mechanics
|
||||
----------------
|
||||
Streamlit cannot read HTTP cookies server-side. Instead:
|
||||
- sync_ui_cookie() injects a JS snippet that sets document.cookie.
|
||||
- Vue SPA switch-back appends ?prgn_switch=streamlit to the redirect URL.
|
||||
sync_ui_cookie() reads this param via st.query_params and uses it as
|
||||
an override signal, then writes user.yaml to match.
|
||||
|
||||
Call sync_ui_cookie() in the app.py render pass (after pg.run()).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
import streamlit.components.v1 as components
|
||||
|
||||
from scripts.user_profile import UserProfile
|
||||
from app.wizard.tiers import can_use
|
||||
|
||||
_DEMO_MODE = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
||||
|
||||
# When set, the app is running without a Caddy reverse proxy in front
|
||||
# (local dev, direct port exposure). Switch to Vue by navigating directly
|
||||
# to this URL instead of relying on cookie-based Caddy routing.
|
||||
# Example: PEREGRINE_VUE_URL=http://localhost:8506
|
||||
_VUE_URL = os.environ.get("PEREGRINE_VUE_URL", "").strip().rstrip("/")
|
||||
|
||||
# When True, a window.location.reload() after setting prgn_ui=vue will be
|
||||
# intercepted by Caddy and routed to the Vue SPA. When False (no Caddy in the
|
||||
# traffic path — e.g. test instances, direct Docker exposure), reloading just
|
||||
# comes back to Streamlit and creates an infinite loop. Only set this in
|
||||
# production/staging compose files where Caddy is actually in front.
|
||||
_CADDY_PROXY = os.environ.get("PEREGRINE_CADDY_PROXY", "").lower() in ("1", "true", "yes")
|
||||
|
||||
_COOKIE_JS = """
|
||||
<script>
|
||||
(function() {{
|
||||
document.cookie = 'prgn_ui={value}; path=/; SameSite=Lax';
|
||||
{navigate_js}
|
||||
}})();
|
||||
</script>
|
||||
"""
|
||||
|
||||
|
||||
def _set_cookie_js(value: str, navigate: bool = False) -> None:
    """Inject JS that writes the prgn_ui cookie, optionally navigating after.

    components.html() renders inside an iframe, so any navigation must go
    through window.parent. Strategy, in priority order:

    - Switching to vue with PEREGRINE_VUE_URL set (local dev, no Caddy):
      jump straight to the Vue container URL — a plain reload would just hit
      the same Streamlit port with no router to inspect the cookie.
    - PEREGRINE_CADDY_PROXY set (production/staging): full reload so Caddy
      sees the fresh cookie on the next request and routes accordingly.
    - Neither (test instances, bare Docker): write the cookie silently and
      skip navigation — reloading without a proxy bounces back here and loops.
    """
    nav_js = ""
    if navigate:
        if value == "vue" and _VUE_URL:
            nav_js = f"window.parent.location.href = '{_VUE_URL}';"
        elif _CADDY_PROXY:
            nav_js = "window.parent.location.reload();"
    components.html(_COOKIE_JS.format(value=value, navigate_js=nav_js), height=0)
|
||||
|
||||
|
||||
def sync_ui_cookie(yaml_path: Path, tier: str) -> None:
    """Sync the prgn_ui cookie to match user.yaml ui_preference.

    Args:
        yaml_path: Path to the user.yaml profile file read via UserProfile.
        tier: Resolved subscription tier, checked against the vue_ui_beta gate.

    Also handles:
    - ?prgn_switch=<value> param (Vue SPA switch-back signal): overrides yaml,
      writes yaml to match, clears the param.
    - Tier downgrade: resets vue preference to streamlit for ineligible users.
    - ?ui_fallback=1 param: Vue SPA was down — reinforce streamlit cookie and
      return early to avoid immediately navigating back to a broken Vue SPA.

    When the resolved preference is "vue", this function navigates (full page
    reload) rather than silently setting the cookie. Without navigate=True,
    Streamlit would set prgn_ui=vue mid-page-load; subsequent HTTP requests
    made by Streamlit's own frontend (lazy JS chunks, WebSocket upgrade) would
    carry the new cookie and Caddy would misroute them to the Vue nginx
    container, causing TypeError: error loading dynamically imported module.
    """
    # ── ?ui_fallback=1 — Vue SPA was down, Caddy bounced us back ──────────────
    # Return early: reinforce the streamlit cookie so we don't immediately
    # navigate back to a Vue SPA that may still be down.
    if st.query_params.get("ui_fallback"):
        st.toast("⚠️ New UI temporarily unavailable — switched back to Classic", icon="⚠️")
        st.query_params.pop("ui_fallback", None)
        _set_cookie_js("streamlit")
        return

    # ── ?prgn_switch param — Vue SPA sent us here to switch back ──────────────
    # Param wins over yaml: persist it so the two stay in agreement.
    switch_param = st.query_params.get("prgn_switch")
    if switch_param in ("streamlit", "vue"):
        try:
            profile = UserProfile(yaml_path)
            profile.ui_preference = switch_param
            profile.save()
        except Exception:
            # UI components must not crash the app — silent fallback
            pass
        st.query_params.pop("prgn_switch", None)
        _set_cookie_js(switch_param)
        return

    # ── Normal path: read yaml, enforce tier, inject cookie ───────────────────
    profile = None
    try:
        profile = UserProfile(yaml_path)
        pref = profile.ui_preference
    except Exception:
        # UI components must not crash the app — silent fallback to default
        pref = "streamlit"

    # Demo mode: Vue SPA has no demo data wiring — always serve Streamlit.
    # (The tier downgrade check below is skipped in demo mode, but we must
    # also block the Vue navigation itself so Caddy doesn't route to a blank SPA.)
    if pref == "vue" and _DEMO_MODE:
        pref = "streamlit"

    # Tier downgrade protection (skip in demo — demo bypasses tier gate)
    if pref == "vue" and not _DEMO_MODE and not can_use(tier, "vue_ui_beta"):
        if profile is not None:
            try:
                profile.ui_preference = "streamlit"
                profile.save()
            except Exception:
                # UI components must not crash the app — silent fallback
                pass
        pref = "streamlit"

    # Navigate (full reload) when switching to Vue so Caddy re-routes on the
    # next HTTP request before Streamlit serves any more content. Silent
    # cookie-only set is safe for streamlit since we're already on that origin.
    _set_cookie_js(pref, navigate=(pref == "vue"))
|
||||
|
||||
|
||||
def switch_ui(yaml_path: Path, to: str, tier: str) -> None:
    """Persist a UI preference change and move the browser to it.

    to: "vue" | "streamlit" — any other value is ignored.

    Vue requires a full page reload (navigate=True) so Caddy observes the
    fresh prgn_ui cookie on a real HTTP request and routes to the Vue SPA;
    st.rerun() alone happens over the WebSocket and produces no HTTP request.
    Streamlit needs no Caddy re-routing — we are already on that origin — so
    a cookie re-sync plus st.rerun() is enough.
    """
    if to not in ("vue", "streamlit"):
        return
    try:
        prefs = UserProfile(yaml_path)
        prefs.ui_preference = to
        prefs.save()
    except Exception:
        # UI components must not crash the app — silent fallback
        pass
    if to == "vue":
        # Cookie write + window.location.reload() in one injected snippet.
        _set_cookie_js("vue", navigate=True)
        return
    sync_ui_cookie(yaml_path, tier=tier)
    st.rerun()
|
||||
|
||||
|
||||
def render_banner(yaml_path: Path, tier: str) -> None:
    """Show the 'Try the new UI' banner until the user dismisses it.

    Args:
        yaml_path: Path to the user.yaml profile file.
        tier: Resolved subscription tier, checked against the vue_ui_beta gate.

    The dismissed flag is stored in the user.yaml dismissed_banners list so
    it persists across sessions (uses the existing dismissed_banners pattern).
    Eligible: paid+ tier, OR demo mode. Not shown if already on vue.
    """
    eligible = _DEMO_MODE or can_use(tier, "vue_ui_beta")
    if not eligible:
        return

    try:
        profile = UserProfile(yaml_path)
    except Exception:
        # UI components must not crash the app — silent fallback
        return

    if profile.ui_preference == "vue":
        return
    if "ui_switcher_beta" in (profile.dismissed_banners or []):
        return

    col1, col2, col3 = st.columns([8, 1, 1])
    with col1:
        st.info("✨ **New Peregrine UI available** — try the modern Vue interface (Beta)")
    with col2:
        if st.button("Try it", key="_ui_banner_try"):
            switch_ui(yaml_path, to="vue", tier=tier)
    with col3:
        if st.button("Dismiss", key="_ui_banner_dismiss"):
            # Guard the save like every other profile write in this module:
            # a yaml write failure must not crash the click handler. Worst
            # case the banner simply reappears next session.
            try:
                profile.dismissed_banners = list(profile.dismissed_banners or []) + ["ui_switcher_beta"]
                profile.save()
            except Exception:
                # UI components must not crash the app — silent fallback
                pass
            st.rerun()
|
||||
|
||||
|
||||
def render_sidebar_switcher(yaml_path: Path, tier: str) -> None:
    """Always-visible sidebar entry point into the Vue UI.

    Unlike the banner (which can be dismissed), this button persists for
    every eligible user (paid+ tier or demo mode) still on the Streamlit UI.
    """
    if not (_DEMO_MODE or can_use(tier, "vue_ui_beta")):
        return
    try:
        if UserProfile(yaml_path).ui_preference == "vue":
            return  # already on Vue — nothing to offer
    except Exception:
        pass  # unreadable profile: still show the button

    clicked = st.button(
        "✨ Switch to New UI", key="_sidebar_switch_vue", use_container_width=True
    )
    if clicked:
        switch_ui(yaml_path, to="vue", tier=tier)
|
||||
|
||||
|
||||
def render_settings_toggle(yaml_path: Path, tier: str) -> None:
    """Toggle in Settings → System → Deployment expander."""
    if not (_DEMO_MODE or can_use(tier, "vue_ui_beta")):
        return

    try:
        current = UserProfile(yaml_path).ui_preference
    except Exception:
        # UI components must not crash the app — silent fallback to default
        current = "streamlit"

    # Stable preference values ↔ human-readable labels, in display order.
    label_by_value = {
        "streamlit": "Classic (Streamlit)",
        "vue": "✨ New UI (Vue, Beta)",
    }
    value_by_label = {label: value for value, label in label_by_value.items()}
    values = list(label_by_value)
    selected_index = values.index(current) if current in label_by_value else 0

    st.markdown("**UI Version**")
    picked_label = st.radio(
        "UI Version",
        options=list(label_by_value.values()),
        index=selected_index,
        key="_ui_toggle_radio",
        label_visibility="collapsed",
    )
    picked = value_by_label[picked_label]

    if picked != current:
        switch_ui(yaml_path, to=picked, tier=tier)
|
||||
247
app/feedback.py
Normal file
247
app/feedback.py
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
"""
|
||||
Floating feedback button + dialog — thin Streamlit shell.
|
||||
All business logic lives in scripts/feedback_api.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import streamlit as st
|
||||
|
||||
# ── CSS: float the button to the bottom-right corner ─────────────────────────
# Targets the button by its aria-label (set via `help=` parameter).
# Injected by inject_feedback_button() via st.markdown(..., unsafe_allow_html=True);
# !important is needed so these rules win over Streamlit's own button styles.
_FLOAT_CSS = """
<style>
button[aria-label="Send feedback or report a bug"] {
    position: fixed !important;
    bottom: 2rem !important;
    right: 2rem !important;
    z-index: 9999 !important;
    border-radius: 25px !important;
    padding: 0.5rem 1.25rem !important;
    box-shadow: 0 4px 16px rgba(0,0,0,0.25) !important;
    font-size: 0.9rem !important;
}
</style>
"""
|
||||
|
||||
|
||||
@st.dialog("Send Feedback", width="large")
def _feedback_dialog(page: str) -> None:
    """Two-step feedback dialog: form → consent/attachments → submit.

    Args:
        page: Name of the page the user was on, passed through to the
              issue context collector.
    """
    # Imports are local so the Streamlit shell stays cheap to import and the
    # feedback backend is only loaded when the dialog actually opens.
    from scripts.feedback_api import (
        collect_context, collect_logs, collect_listings,
        build_issue_body, create_forgejo_issue, upload_attachment,
    )
    from scripts.db import DEFAULT_DB

    # ── Initialise step counter ───────────────────────────────────────────────
    if "fb_step" not in st.session_state:
        st.session_state.fb_step = 1

    # ═════════════════════════════════════════════════════════════════════════
    # STEP 1 — Form
    # ═════════════════════════════════════════════════════════════════════════
    if st.session_state.fb_step == 1:
        st.subheader("What's on your mind?")

        fb_type = st.selectbox(
            "Type", ["Bug", "Feature Request", "Other"], key="fb_type"
        )
        fb_title = st.text_input(
            "Title", placeholder="Short summary of the issue or idea", key="fb_title"
        )
        fb_desc = st.text_area(
            "Description",
            placeholder="Describe what happened or what you'd like to see...",
            key="fb_desc",
        )
        # Repro steps are only meaningful for bug reports; read back later
        # via st.session_state["fb_repro"].
        if fb_type == "Bug":
            st.text_area(
                "Reproduction steps",
                placeholder="1. Go to...\n2. Click...\n3. See error",
                key="fb_repro",
            )

        col_cancel, _, col_next = st.columns([1, 3, 1])
        with col_cancel:
            if st.button("Cancel"):
                _clear_feedback_state()
                st.rerun()  # intentionally closes the dialog
        with col_next:
            if st.button("Next →", type="primary"):
                # Read widget values NOW (same rerun as the click — values are
                # available here even on first click). Copy to non-widget keys
                # so they survive step 2's render (Streamlit removes widget
                # state for widgets that are no longer rendered).
                title = fb_title.strip()
                desc = fb_desc.strip()
                if not title or not desc:
                    st.error("Please fill in both Title and Description.")
                else:
                    st.session_state.fb_data_type = fb_type
                    st.session_state.fb_data_title = title
                    st.session_state.fb_data_desc = desc
                    st.session_state.fb_data_repro = st.session_state.get("fb_repro", "")
                    st.session_state.fb_step = 2

    # ═════════════════════════════════════════════════════════════════════════
    # STEP 2 — Consent + attachments
    # ═════════════════════════════════════════════════════════════════════════
    elif st.session_state.fb_step == 2:
        st.subheader("Optional: attach diagnostic data")

        # ── Diagnostic data toggle + preview ─────────────────────────────────
        # Preview shows exactly what would be attached so consent is informed.
        include_diag = st.toggle(
            "Include diagnostic data (logs + recent listings)", key="fb_diag"
        )
        if include_diag:
            with st.expander("Preview what will be sent", expanded=True):
                st.caption("**App logs (last 100 lines, PII masked):**")
                st.code(collect_logs(100), language=None)
                st.caption("**Recent listings (title / company / URL only):**")
                for j in collect_listings(DEFAULT_DB, 5):
                    st.write(f"- {j['title']} @ {j['company']} — {j['url']}")

        # ── Screenshot ────────────────────────────────────────────────────────
        st.divider()
        st.caption("**Screenshot** (optional)")

        from app.components.paste_image import paste_image_component

        # Keyed so we can reset the component when the user removes the image
        if "fb_paste_key" not in st.session_state:
            st.session_state.fb_paste_key = 0

        pasted = paste_image_component(key=f"fb_paste_{st.session_state.fb_paste_key}")
        if pasted:
            st.session_state.fb_screenshot = pasted

        st.caption("or upload a file:")
        uploaded = st.file_uploader(
            "Upload screenshot",
            type=["png", "jpg", "jpeg"],
            label_visibility="collapsed",
            key="fb_upload",
        )
        if uploaded:
            st.session_state.fb_screenshot = uploaded.read()

        if st.session_state.get("fb_screenshot"):
            st.image(
                st.session_state["fb_screenshot"],
                caption="Screenshot preview — this will be attached to the issue",
                use_container_width=True,
            )
            if st.button("🗑 Remove screenshot"):
                st.session_state.pop("fb_screenshot", None)
                # Bumping the key forces a fresh paste component instance.
                st.session_state.fb_paste_key = st.session_state.get("fb_paste_key", 0) + 1
                # no st.rerun() — button click already re-renders the dialog

        # ── Attribution consent ───────────────────────────────────────────────
        # Opt-in only: name/email go into the report solely when the user
        # ticks the checkbox. Any failure reading user.yaml just hides it.
        st.divider()
        submitter: str | None = None
        try:
            import yaml
            _ROOT = Path(__file__).parent.parent
            user = yaml.safe_load((_ROOT / "config" / "user.yaml").read_text()) or {}
            name = (user.get("name") or "").strip()
            email = (user.get("email") or "").strip()
            if name or email:
                label = f"Include my name & email in the report: **{name}** ({email})"
                if st.checkbox(label, key="fb_attr"):
                    submitter = f"{name} <{email}>"
        except Exception:
            pass

        # ── Navigation ────────────────────────────────────────────────────────
        col_back, _, col_submit = st.columns([1, 3, 2])
        with col_back:
            if st.button("← Back"):
                st.session_state.fb_step = 1
                # no st.rerun() — button click already re-renders the dialog

        with col_submit:
            if st.button("Submit Feedback", type="primary"):
                _submit(page, include_diag, submitter, collect_context,
                        collect_logs, collect_listings, build_issue_body,
                        create_forgejo_issue, upload_attachment, DEFAULT_DB)
|
||||
|
||||
|
||||
def _submit(page, include_diag, submitter, collect_context, collect_logs,
            collect_listings, build_issue_body, create_forgejo_issue,
            upload_attachment, db_path) -> None:
    """Handle form submission: build body, file issue, upload screenshot.

    Args:
        page: Page name the dialog was opened from (goes into issue context).
        include_diag: Whether the user consented to attach logs + listings.
        submitter: "Name <email>" attribution string, or None if not consented.
        collect_context .. upload_attachment: feedback_api callables, passed in
            by _feedback_dialog so this helper stays import-free.
        db_path: SQLite database path for collect_listings.
    """
    with st.spinner("Filing issue…"):
        context = collect_context(page)
        attachments: dict = {}
        if include_diag:
            attachments["logs"] = collect_logs(100)
            attachments["listings"] = collect_listings(db_path, 5)
        if submitter:
            attachments["submitter"] = submitter

        # Form values come from the non-widget fb_data_* keys saved in step 1
        # (widget keys do not survive the step-2 render).
        fb_type = st.session_state.get("fb_data_type", "Other")
        type_key = {"Bug": "bug", "Feature Request": "feature", "Other": "other"}.get(
            fb_type, "other"
        )
        labels = ["beta-feedback", "needs-triage"]
        labels.append(
            {"bug": "bug", "feature": "feature-request"}.get(type_key, "question")
        )

        form = {
            "type": type_key,
            "description": st.session_state.get("fb_data_desc", ""),
            # Repro steps are only attached for bug reports.
            "repro": st.session_state.get("fb_data_repro", "") if type_key == "bug" else "",
        }

        body = build_issue_body(form, context, attachments)

        try:
            result = create_forgejo_issue(
                st.session_state.get("fb_data_title", "Feedback"), body, labels
            )
            # Screenshot is uploaded after issue creation — it needs the
            # issue number returned by the API.
            screenshot = st.session_state.get("fb_screenshot")
            if screenshot:
                upload_attachment(result["number"], screenshot)

            # Only clear state on success so a failed submit keeps the
            # user's input for retry.
            _clear_feedback_state()
            st.success(f"Issue filed! [View on Forgejo]({result['url']})")
            st.balloons()

        except Exception as exc:
            st.error(f"Failed to file issue: {exc}")
|
||||
|
||||
|
||||
def _clear_feedback_state() -> None:
    """Drop every dialog-related session_state entry (step, widgets, saved data)."""
    step_and_widget_keys = ("fb_step", "fb_type", "fb_title", "fb_desc", "fb_repro")
    saved_data_keys = ("fb_data_type", "fb_data_title", "fb_data_desc", "fb_data_repro")
    attachment_keys = ("fb_diag", "fb_upload", "fb_attr", "fb_screenshot", "fb_paste_key")
    for state_key in step_and_widget_keys + saved_data_keys + attachment_keys:
        st.session_state.pop(state_key, None)
|
||||
|
||||
|
||||
def inject_feedback_button(page: str = "Unknown") -> None:
    """
    Render the floating feedback button. Call once per page render in app.py.

    Hidden automatically in DEMO_MODE, and silently skipped when no
    FORGEJO_API_TOKEN is configured (there is nowhere to file the issue).
    """
    if os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes"):
        return
    if not os.environ.get("FORGEJO_API_TOKEN"):
        return  # silently skip if not configured

    # The CSS floats the button; it matches on the aria-label that the
    # help= text below sets on the rendered button element.
    st.markdown(_FLOAT_CSS, unsafe_allow_html=True)
    clicked = st.button(
        "💬 Feedback",
        key="__feedback_floating_btn__",
        help="Send feedback or report a bug",
    )
    if clicked:
        _feedback_dialog(page)
|
||||
|
|
@ -15,11 +15,14 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|||
import streamlit as st
|
||||
import yaml
|
||||
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir
|
||||
resolve_session("peregrine")
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
CONFIG_DIR = _ROOT / "config"
|
||||
CONFIG_DIR = get_config_dir() # per-user dir in cloud; repo config/ locally
|
||||
USER_YAML = CONFIG_DIR / "user.yaml"
|
||||
STEPS = 6 # mandatory steps
|
||||
STEP_LABELS = ["Hardware", "Tier", "Identity", "Resume", "Inference", "Search"]
|
||||
STEP_LABELS = ["Hardware", "Tier", "Resume", "Identity", "Inference", "Search"]
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -74,18 +77,16 @@ def _suggest_profile(gpus: list[str]) -> str:
|
|||
|
||||
def _submit_wizard_task(section: str, input_data: dict) -> int:
|
||||
"""Submit a wizard_generate background task. Returns task_id."""
|
||||
from scripts.db import DEFAULT_DB
|
||||
from scripts.task_runner import submit_task
|
||||
params = json.dumps({"section": section, "input": input_data})
|
||||
task_id, _ = submit_task(DEFAULT_DB, "wizard_generate", 0, params=params)
|
||||
task_id, _ = submit_task(get_db_path(), "wizard_generate", 0, params=params)
|
||||
return task_id
|
||||
|
||||
|
||||
def _poll_wizard_task(section: str) -> dict | None:
|
||||
"""Return the most recent wizard_generate task row for a given section, or None."""
|
||||
import sqlite3
|
||||
from scripts.db import DEFAULT_DB
|
||||
conn = sqlite3.connect(DEFAULT_DB)
|
||||
conn = sqlite3.connect(get_db_path())
|
||||
conn.row_factory = sqlite3.Row
|
||||
row = conn.execute(
|
||||
"SELECT * FROM background_tasks "
|
||||
|
|
@ -105,10 +106,11 @@ def _generation_widget(section: str, label: str, tier: str,
|
|||
Call this inside a step to add LLM generation support.
|
||||
The caller decides whether to auto-populate a field with the result.
|
||||
"""
|
||||
from app.wizard.tiers import can_use, tier_label as tl
|
||||
from app.wizard.tiers import can_use, tier_label as tl, has_configured_llm
|
||||
|
||||
if not can_use(tier, feature_key):
|
||||
st.caption(f"{tl(feature_key)} {label}")
|
||||
_has_byok = has_configured_llm()
|
||||
if not can_use(tier, feature_key, has_byok=_has_byok):
|
||||
st.caption(f"{tl(feature_key, has_byok=_has_byok)} {label}")
|
||||
return None
|
||||
|
||||
col_btn, col_fb = st.columns([2, 5])
|
||||
|
|
@ -177,6 +179,13 @@ st.divider()
|
|||
|
||||
# ── Step 1: Hardware ───────────────────────────────────────────────────────────
|
||||
if step == 1:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud deployment: always single-gpu (Heimdall), skip hardware selection
|
||||
_save_yaml({"inference_profile": "single-gpu", "wizard_step": 1})
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_hardware import validate, PROFILES
|
||||
|
||||
st.subheader("Step 1 \u2014 Hardware Detection")
|
||||
|
|
@ -210,6 +219,14 @@ if step == 1:
|
|||
|
||||
# ── Step 2: Tier ───────────────────────────────────────────────────────────────
|
||||
elif step == 2:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud mode: tier already resolved from Heimdall at session init
|
||||
cloud_tier = st.session_state.get("cloud_tier", "free")
|
||||
_save_yaml({"tier": cloud_tier, "wizard_step": 2})
|
||||
st.session_state.wizard_step = 3
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_tier import validate
|
||||
|
||||
st.subheader("Step 2 \u2014 Choose Your Plan")
|
||||
|
|
@ -246,63 +263,21 @@ elif step == 2:
|
|||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 3: Identity ───────────────────────────────────────────────────────────
|
||||
# ── Step 3: Resume ─────────────────────────────────────────────────────────────
|
||||
elif step == 3:
|
||||
from app.wizard.step_identity import validate
|
||||
|
||||
st.subheader("Step 3 \u2014 Your Identity")
|
||||
st.caption("Used in cover letter PDFs, LLM prompts, and the app header.")
|
||||
|
||||
c1, c2 = st.columns(2)
|
||||
name = c1.text_input("Full Name *", saved_yaml.get("name", ""))
|
||||
email = c1.text_input("Email *", saved_yaml.get("email", ""))
|
||||
phone = c2.text_input("Phone", saved_yaml.get("phone", ""))
|
||||
linkedin = c2.text_input("LinkedIn URL", saved_yaml.get("linkedin", ""))
|
||||
|
||||
# Career summary with optional LLM generation
|
||||
summary_default = st.session_state.get("_gen_result_career_summary") or saved_yaml.get("career_summary", "")
|
||||
summary = st.text_area(
|
||||
"Career Summary *", value=summary_default, height=120,
|
||||
placeholder="Experienced professional with X years in [field]. Specialise in [skills].",
|
||||
help="Injected into cover letter and research prompts as your professional context.",
|
||||
)
|
||||
|
||||
gen_result = _generation_widget(
|
||||
section="career_summary",
|
||||
label="Generate from resume",
|
||||
tier=_tier,
|
||||
feature_key="llm_career_summary",
|
||||
input_data={"resume_text": saved_yaml.get("_raw_resume_text", "")},
|
||||
)
|
||||
if gen_result and gen_result != summary:
|
||||
st.info(f"\u2728 Suggested summary \u2014 paste it above if it looks good:\n\n{gen_result}")
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="ident_back"):
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="ident_next"):
|
||||
errs = validate({"name": name, "email": email, "career_summary": summary})
|
||||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
_save_yaml({
|
||||
"name": name, "email": email, "phone": phone,
|
||||
"linkedin": linkedin, "career_summary": summary,
|
||||
"wizard_complete": False, "wizard_step": 3,
|
||||
})
|
||||
st.session_state.wizard_step = 4
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 4: Resume ─────────────────────────────────────────────────────────────
|
||||
elif step == 4:
|
||||
from app.wizard.step_resume import validate
|
||||
|
||||
st.subheader("Step 4 \u2014 Resume")
|
||||
st.subheader("Step 3 \u2014 Resume")
|
||||
st.caption("Upload your resume for fast parsing, or build it section by section.")
|
||||
|
||||
tab_upload, tab_builder = st.tabs(["\U0001f4ce Upload", "\U0001f4dd Build manually"])
|
||||
# Read LinkedIn import result before tabs render (spec: "at step render time")
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
st.session_state["_parsed_resume"] = _li_data
|
||||
|
||||
tab_upload, tab_builder, tab_linkedin = st.tabs([
|
||||
"\U0001f4ce Upload", "\U0001f4dd Build Manually", "\U0001f517 LinkedIn"
|
||||
])
|
||||
|
||||
with tab_upload:
|
||||
uploaded = st.file_uploader("Upload PDF, DOCX, or ODT", type=["pdf", "docx", "odt"])
|
||||
|
|
@ -391,9 +366,13 @@ elif step == 4:
|
|||
input_data={"bullet_notes": all_bullets},
|
||||
)
|
||||
|
||||
with tab_linkedin:
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=_tier)
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="resume_back"):
|
||||
st.session_state.wizard_step = 3
|
||||
st.session_state.wizard_step = 2
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="resume_next"):
|
||||
parsed = st.session_state.get("_parsed_resume", {})
|
||||
|
|
@ -405,22 +384,84 @@ elif step == 4:
|
|||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
resume_yaml_path = _ROOT / "config" / "plain_text_resume.yaml"
|
||||
resume_yaml_path = CONFIG_DIR / "plain_text_resume.yaml"
|
||||
resume_yaml_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
resume_data = {**parsed, "experience": experience} if parsed else {"experience": experience}
|
||||
resume_yaml_path.write_text(
|
||||
yaml.dump(resume_data, default_flow_style=False, allow_unicode=True)
|
||||
)
|
||||
_save_yaml({"wizard_step": 4})
|
||||
_save_yaml({"wizard_step": 3})
|
||||
st.session_state.wizard_step = 4
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 4: Identity ───────────────────────────────────────────────────────────
|
||||
elif step == 4:
|
||||
from app.wizard.step_identity import validate
|
||||
|
||||
st.subheader("Step 4 \u2014 Your Identity")
|
||||
st.caption("Used in cover letter PDFs, LLM prompts, and the app header.")
|
||||
|
||||
c1, c2 = st.columns(2)
|
||||
_parsed = st.session_state.get("_parsed_resume", {})
|
||||
name = c1.text_input("Full Name *", saved_yaml.get("name") or _parsed.get("name", ""))
|
||||
email = c1.text_input("Email *", saved_yaml.get("email") or _parsed.get("email", ""))
|
||||
phone = c2.text_input("Phone", saved_yaml.get("phone") or _parsed.get("phone", ""))
|
||||
linkedin = c2.text_input("LinkedIn URL", saved_yaml.get("linkedin", ""))
|
||||
|
||||
# Career summary with optional LLM generation — resume text available now (step 3 ran first)
|
||||
summary_default = st.session_state.get("_gen_result_career_summary") or saved_yaml.get("career_summary", "")
|
||||
summary = st.text_area(
|
||||
"Career Summary *", value=summary_default, height=120,
|
||||
placeholder="Experienced professional with X years in [field]. Specialise in [skills].",
|
||||
help="Injected into cover letter and research prompts as your professional context.",
|
||||
)
|
||||
|
||||
gen_result = _generation_widget(
|
||||
section="career_summary",
|
||||
label="Generate from resume",
|
||||
tier=_tier,
|
||||
feature_key="llm_career_summary",
|
||||
input_data={"resume_text": saved_yaml.get("_raw_resume_text", "")},
|
||||
)
|
||||
if gen_result and gen_result != summary:
|
||||
st.info(f"\u2728 Suggested summary \u2014 paste it above if it looks good:\n\n{gen_result}")
|
||||
|
||||
col_back, col_next = st.columns([1, 4])
|
||||
if col_back.button("\u2190 Back", key="ident_back"):
|
||||
st.session_state.wizard_step = 3
|
||||
st.rerun()
|
||||
if col_next.button("Next \u2192", type="primary", key="ident_next"):
|
||||
errs = validate({"name": name, "email": email, "career_summary": summary})
|
||||
if errs:
|
||||
st.error("\n".join(errs))
|
||||
else:
|
||||
_save_yaml({
|
||||
"name": name, "email": email, "phone": phone,
|
||||
"linkedin": linkedin, "career_summary": summary,
|
||||
"wizard_complete": False, "wizard_step": 4,
|
||||
})
|
||||
st.session_state.wizard_step = 5
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ── Step 5: Inference ──────────────────────────────────────────────────────────
|
||||
elif step == 5:
|
||||
from app.cloud_session import CLOUD_MODE as _CLOUD_MODE
|
||||
if _CLOUD_MODE:
|
||||
# Cloud deployment: inference is managed server-side; skip this step
|
||||
_save_yaml({"wizard_step": 5})
|
||||
st.session_state.wizard_step = 6
|
||||
st.rerun()
|
||||
|
||||
from app.wizard.step_inference import validate
|
||||
|
||||
st.subheader("Step 5 \u2014 Inference & API Keys")
|
||||
st.info(
|
||||
"**Simplest setup:** set `OLLAMA_HOST` in your `.env` file — "
|
||||
"Peregrine auto-detects it, no config file needed. "
|
||||
"Or use the fields below to configure API keys and endpoints."
|
||||
)
|
||||
profile = saved_yaml.get("inference_profile", "remote")
|
||||
|
||||
if profile == "remote":
|
||||
|
|
@ -430,8 +471,18 @@ elif step == 5:
|
|||
placeholder="https://api.together.xyz/v1")
|
||||
openai_key = st.text_input("Endpoint API Key (optional)", type="password",
|
||||
key="oai_key") if openai_url else ""
|
||||
ollama_host = st.text_input("Ollama host (optional \u2014 local fallback)",
|
||||
placeholder="http://localhost:11434",
|
||||
key="ollama_host_input")
|
||||
ollama_model = st.text_input("Ollama model (optional)",
|
||||
value="llama3.2:3b",
|
||||
key="ollama_model_input")
|
||||
else:
|
||||
st.info(f"Local mode ({profile}): Ollama provides inference.")
|
||||
import os
|
||||
_ollama_host_env = os.environ.get("OLLAMA_HOST", "")
|
||||
if _ollama_host_env:
|
||||
st.caption(f"OLLAMA_HOST from .env: `{_ollama_host_env}`")
|
||||
anthropic_key = openai_url = openai_key = ""
|
||||
|
||||
with st.expander("Advanced \u2014 Service Ports & Hosts"):
|
||||
|
|
@ -510,6 +561,14 @@ elif step == 5:
|
|||
if anthropic_key or openai_url:
|
||||
env_path.write_text("\n".join(env_lines) + "\n")
|
||||
|
||||
if profile == "remote":
|
||||
if ollama_host:
|
||||
env_lines = _set_env(env_lines, "OLLAMA_HOST", ollama_host)
|
||||
if ollama_model:
|
||||
env_lines = _set_env(env_lines, "OLLAMA_MODEL", ollama_model)
|
||||
if ollama_host or ollama_model:
|
||||
env_path.write_text("\n".join(env_lines) + "\n")
|
||||
|
||||
_save_yaml({"services": svc, "wizard_step": 5})
|
||||
st.session_state.wizard_step = 6
|
||||
st.rerun()
|
||||
|
|
@ -595,7 +654,7 @@ elif step == 6:
|
|||
)
|
||||
default_profile = {
|
||||
"name": "default",
|
||||
"job_titles": titles,
|
||||
"titles": titles,
|
||||
"locations": locations,
|
||||
"remote_only": False,
|
||||
"boards": ["linkedin", "indeed", "glassdoor", "zip_recruiter"],
|
||||
|
|
|
|||
|
|
@ -12,12 +12,15 @@ from scripts.db import (
|
|||
DEFAULT_DB, init_db, get_jobs_by_status, update_job_status,
|
||||
update_cover_letter, mark_applied, get_email_leads,
|
||||
)
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
|
||||
resolve_session("peregrine")
|
||||
|
||||
st.title("📋 Job Review")
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
init_db(get_db_path())
|
||||
|
||||
_email_leads = get_email_leads(DEFAULT_DB)
|
||||
_email_leads = get_email_leads(get_db_path())
|
||||
|
||||
# ── Sidebar filters ────────────────────────────────────────────────────────────
|
||||
with st.sidebar:
|
||||
|
|
@ -37,7 +40,7 @@ with st.sidebar:
|
|||
index=0,
|
||||
)
|
||||
|
||||
jobs = get_jobs_by_status(DEFAULT_DB, show_status)
|
||||
jobs = get_jobs_by_status(get_db_path(), show_status)
|
||||
|
||||
if remote_only:
|
||||
jobs = [j for j in jobs if j.get("is_remote")]
|
||||
|
|
@ -86,11 +89,11 @@ if show_status == "pending" and _email_leads:
|
|||
with right_l:
|
||||
if st.button("✅ Approve", key=f"el_approve_{lead_id}",
|
||||
type="primary", use_container_width=True):
|
||||
update_job_status(DEFAULT_DB, [lead_id], "approved")
|
||||
update_job_status(get_db_path(), [lead_id], "approved")
|
||||
st.rerun()
|
||||
if st.button("❌ Reject", key=f"el_reject_{lead_id}",
|
||||
use_container_width=True):
|
||||
update_job_status(DEFAULT_DB, [lead_id], "rejected")
|
||||
update_job_status(get_db_path(), [lead_id], "rejected")
|
||||
st.rerun()
|
||||
st.divider()
|
||||
|
||||
|
|
@ -162,7 +165,7 @@ for job in jobs:
|
|||
)
|
||||
save_col, _ = st.columns([2, 5])
|
||||
if save_col.button("💾 Save draft", key=f"save_cl_{job_id}"):
|
||||
update_cover_letter(DEFAULT_DB, job_id, st.session_state[_cl_key])
|
||||
update_cover_letter(get_db_path(), job_id, st.session_state[_cl_key])
|
||||
st.success("Saved!")
|
||||
|
||||
# Applied date + cover letter preview (applied/synced)
|
||||
|
|
@ -182,11 +185,11 @@ for job in jobs:
|
|||
if show_status == "pending":
|
||||
if st.button("✅ Approve", key=f"approve_{job_id}",
|
||||
type="primary", use_container_width=True):
|
||||
update_job_status(DEFAULT_DB, [job_id], "approved")
|
||||
update_job_status(get_db_path(), [job_id], "approved")
|
||||
st.rerun()
|
||||
if st.button("❌ Reject", key=f"reject_{job_id}",
|
||||
use_container_width=True):
|
||||
update_job_status(DEFAULT_DB, [job_id], "rejected")
|
||||
update_job_status(get_db_path(), [job_id], "rejected")
|
||||
st.rerun()
|
||||
|
||||
elif show_status == "approved":
|
||||
|
|
@ -198,6 +201,6 @@ for job in jobs:
|
|||
use_container_width=True):
|
||||
cl_text = st.session_state.get(f"cl_{job_id}", "")
|
||||
if cl_text:
|
||||
update_cover_letter(DEFAULT_DB, job_id, cl_text)
|
||||
mark_applied(DEFAULT_DB, [job_id])
|
||||
update_cover_letter(get_db_path(), job_id, cl_text)
|
||||
mark_applied(get_db_path(), [job_id])
|
||||
st.rerun()
|
||||
|
|
|
|||
|
|
@ -12,21 +12,24 @@ import yaml
|
|||
import os as _os
|
||||
|
||||
from scripts.user_profile import UserProfile
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir, CLOUD_MODE
|
||||
|
||||
_USER_YAML = Path(__file__).parent.parent.parent / "config" / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Job Seeker"
|
||||
|
||||
resolve_session("peregrine")
|
||||
st.title("⚙️ Settings")
|
||||
|
||||
CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
|
||||
# Config paths — per-user directory in cloud mode, shared repo config/ locally
|
||||
CONFIG_DIR = get_config_dir()
|
||||
SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
|
||||
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
|
||||
LLM_CFG = CONFIG_DIR / "llm.yaml"
|
||||
NOTION_CFG = CONFIG_DIR / "notion.yaml"
|
||||
RESUME_PATH = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
|
||||
RESUME_PATH = CONFIG_DIR / "plain_text_resume.yaml"
|
||||
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
||||
|
||||
_USER_YAML = CONFIG_DIR / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Peregrine User"
|
||||
|
||||
def load_yaml(path: Path) -> dict:
|
||||
if path.exists():
|
||||
return yaml.safe_load(path.read_text()) or {}
|
||||
|
|
@ -36,51 +39,25 @@ def save_yaml(path: Path, data: dict) -> None:
|
|||
path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
||||
|
||||
|
||||
def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
|
||||
"""Call LLM to suggest additional job titles and exclude keywords."""
|
||||
import json
|
||||
import re
|
||||
from scripts.llm_router import LLMRouter
|
||||
from scripts.suggest_helpers import (
|
||||
suggest_search_terms as _suggest_search_terms_impl,
|
||||
suggest_resume_keywords as _suggest_resume_keywords,
|
||||
)
|
||||
|
||||
resume_context = ""
|
||||
if resume_path.exists():
|
||||
resume = load_yaml(resume_path)
|
||||
lines = []
|
||||
for exp in (resume.get("experience_details") or [])[:3]:
|
||||
pos = exp.get("position", "")
|
||||
co = exp.get("company", "")
|
||||
skills = ", ".join((exp.get("skills_acquired") or [])[:5])
|
||||
lines.append(f"- {pos} at {co}: {skills}")
|
||||
resume_context = "\n".join(lines)
|
||||
|
||||
titles_str = "\n".join(f"- {t}" for t in current_titles)
|
||||
prompt = f"""You are helping a job seeker optimize their search criteria.
|
||||
|
||||
Their background (from resume):
|
||||
{resume_context or "Customer success and technical account management leader"}
|
||||
|
||||
Current job titles being searched:
|
||||
{titles_str}
|
||||
|
||||
Suggest:
|
||||
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
|
||||
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)
|
||||
|
||||
Return ONLY valid JSON in this exact format:
|
||||
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""
|
||||
|
||||
result = LLMRouter().complete(prompt).strip()
|
||||
m = re.search(r"\{.*\}", result, re.DOTALL)
|
||||
if m:
|
||||
try:
|
||||
return json.loads(m.group())
|
||||
except Exception:
|
||||
pass
|
||||
return {"suggested_titles": [], "suggested_excludes": []}
|
||||
def _suggest_search_terms(current_titles, resume_path, blocklist=None, user_profile=None):
|
||||
return _suggest_search_terms_impl(
|
||||
current_titles,
|
||||
resume_path,
|
||||
blocklist or {},
|
||||
user_profile or {},
|
||||
)
|
||||
|
||||
_show_finetune = bool(_profile and _profile.inference_profile in ("single-gpu", "dual-gpu"))
|
||||
|
||||
USER_CFG = CONFIG_DIR / "user.yaml"
|
||||
# Server config is always repo-level — it controls the container, not the user
|
||||
SERVER_CFG = Path(__file__).parent.parent.parent / "config" / "server.yaml"
|
||||
SERVER_CFG_EXAMPLE = Path(__file__).parent.parent.parent / "config" / "server.yaml.example"
|
||||
|
||||
_dev_mode = _os.getenv("DEV_MODE", "").lower() in ("true", "1", "yes")
|
||||
_u_for_dev = yaml.safe_load(USER_CFG.read_text()) or {} if USER_CFG.exists() else {}
|
||||
|
|
@ -88,19 +65,23 @@ _show_dev_tab = _dev_mode or bool(_u_for_dev.get("dev_tier_override"))
|
|||
|
||||
_tab_names = [
|
||||
"👤 My Profile", "📝 Resume Profile", "🔎 Search",
|
||||
"⚙️ System", "🎯 Fine-Tune", "🔑 License"
|
||||
"⚙️ System", "🎯 Fine-Tune", "🔑 License", "💾 Data"
|
||||
]
|
||||
if CLOUD_MODE:
|
||||
_tab_names.append("🔒 Privacy")
|
||||
if _show_dev_tab:
|
||||
_tab_names.append("🛠️ Developer")
|
||||
_all_tabs = st.tabs(_tab_names)
|
||||
tab_profile, tab_resume, tab_search, tab_system, tab_finetune, tab_license = _all_tabs[:6]
|
||||
tab_profile, tab_resume, tab_search, tab_system, tab_finetune, tab_license, tab_data = _all_tabs[:7]
|
||||
tab_privacy = _all_tabs[7] if CLOUD_MODE else None
|
||||
|
||||
# ── Inline LLM generate buttons ───────────────────────────────────────────────
|
||||
# Paid-tier feature: ✨ Generate buttons sit directly below each injectable field.
|
||||
# Unlocked when user has a configured LLM backend (BYOK) OR a paid tier.
|
||||
# Writes into session state keyed to the widget's `key=` param, then reruns.
|
||||
from app.wizard.tiers import can_use as _cu
|
||||
from app.wizard.tiers import can_use as _cu, has_configured_llm as _has_llm
|
||||
_byok = _has_llm()
|
||||
_gen_panel_active = bool(_profile) and _cu(
|
||||
_profile.effective_tier if _profile else "free", "llm_career_summary"
|
||||
_profile.effective_tier if _profile else "free", "llm_career_summary", has_byok=_byok
|
||||
)
|
||||
|
||||
# Seed session state for LLM-injectable text fields on first load
|
||||
|
|
@ -249,7 +230,7 @@ with tab_profile:
|
|||
st.rerun()
|
||||
|
||||
if not _can_generate:
|
||||
st.caption("✨ AI generation requires a paid tier.")
|
||||
st.caption("✨ AI generation requires a paid tier or a configured LLM backend (BYOK).")
|
||||
|
||||
_mission_updated = {
|
||||
r["key"]: r["value"]
|
||||
|
|
@ -321,6 +302,18 @@ with tab_search:
|
|||
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
||||
st.session_state["_sp_hash"] = _sp_hash
|
||||
|
||||
# Apply any pending programmatic updates BEFORE widgets are instantiated.
|
||||
# Streamlit forbids writing to a widget's key after it renders on the same pass;
|
||||
# button handlers write to *_pending keys instead, consumed here on the next pass.
|
||||
for _pend, _wkey in [("_sp_titles_pending", "_sp_titles_multi"),
|
||||
("_sp_locs_pending", "_sp_locations_multi"),
|
||||
("_sp_new_title_pending", "_sp_new_title"),
|
||||
("_sp_paste_titles_pending", "_sp_paste_titles"),
|
||||
("_sp_new_loc_pending", "_sp_new_loc"),
|
||||
("_sp_paste_locs_pending", "_sp_paste_locs")]:
|
||||
if _pend in st.session_state:
|
||||
st.session_state[_wkey] = st.session_state.pop(_pend)
|
||||
|
||||
# ── Titles ────────────────────────────────────────────────────────────────
|
||||
_title_row, _suggest_btn_col = st.columns([4, 1])
|
||||
with _title_row:
|
||||
|
|
@ -328,7 +321,27 @@ with tab_search:
|
|||
with _suggest_btn_col:
|
||||
st.write("")
|
||||
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
||||
help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")
|
||||
help="Ask the LLM to suggest additional titles and smarter exclude keywords — using your blocklist, mission values, and career background.")
|
||||
|
||||
_title_sugg_count = len((st.session_state.get("_sp_suggestions") or {}).get("suggested_titles", []))
|
||||
if _title_sugg_count:
|
||||
st.markdown(f"""<style>
|
||||
@keyframes _pg_arrow_float {{
|
||||
0%, 100% {{
|
||||
transform: translateY(0px);
|
||||
filter: drop-shadow(0 0 2px #4fc3f7);
|
||||
}}
|
||||
50% {{
|
||||
transform: translateY(4px);
|
||||
filter: drop-shadow(0 0 8px #4fc3f7);
|
||||
}}
|
||||
}}
|
||||
/* Target the expand-arrow SVG inside the multiselect dropdown indicator */
|
||||
.stMultiSelect [data-baseweb="select"] > div + div svg {{
|
||||
animation: _pg_arrow_float 1.3s ease-in-out infinite;
|
||||
cursor: pointer;
|
||||
}}
|
||||
</style>""", unsafe_allow_html=True)
|
||||
|
||||
st.multiselect(
|
||||
"Job titles",
|
||||
|
|
@ -337,6 +350,14 @@ with tab_search:
|
|||
help="Select from known titles. Suggestions from ✨ Suggest appear here — pick the ones you want.",
|
||||
label_visibility="collapsed",
|
||||
)
|
||||
|
||||
if _title_sugg_count:
|
||||
st.markdown(
|
||||
f'<div style="font-size:0.8em; color:#4fc3f7; margin-top:-10px; margin-bottom:4px;">'
|
||||
f' ↑ {_title_sugg_count} new suggestion{"s" if _title_sugg_count != 1 else ""} '
|
||||
f'added — open the dropdown to browse</div>',
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
_add_t_col, _add_t_btn = st.columns([5, 1])
|
||||
with _add_t_col:
|
||||
st.text_input("Add a title", key="_sp_new_title", label_visibility="collapsed",
|
||||
|
|
@ -352,8 +373,8 @@ with tab_search:
|
|||
st.session_state["_sp_title_options"] = _opts
|
||||
if _t not in _sel:
|
||||
_sel.append(_t)
|
||||
st.session_state["_sp_titles_multi"] = _sel
|
||||
st.session_state["_sp_new_title"] = ""
|
||||
st.session_state["_sp_titles_pending"] = _sel
|
||||
st.session_state["_sp_new_title_pending"] = ""
|
||||
st.rerun()
|
||||
with st.expander("📋 Paste a list of titles"):
|
||||
st.text_area("One title per line", key="_sp_paste_titles", height=80, label_visibility="collapsed",
|
||||
|
|
@ -368,23 +389,44 @@ with tab_search:
|
|||
if _t not in _sel:
|
||||
_sel.append(_t)
|
||||
st.session_state["_sp_title_options"] = _opts
|
||||
st.session_state["_sp_titles_multi"] = _sel
|
||||
st.session_state["_sp_paste_titles"] = ""
|
||||
st.session_state["_sp_titles_pending"] = _sel
|
||||
st.session_state["_sp_paste_titles_pending"] = ""
|
||||
st.rerun()
|
||||
|
||||
# ── LLM suggestions panel ────────────────────────────────────────────────
|
||||
if _run_suggest:
|
||||
_current_titles = list(st.session_state.get("_sp_titles_multi", []))
|
||||
_blocklist = load_yaml(BLOCKLIST_CFG)
|
||||
_user_profile = load_yaml(USER_CFG)
|
||||
with st.spinner("Asking LLM for suggestions…"):
|
||||
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH)
|
||||
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
||||
_opts = list(st.session_state.get("_sp_title_options", []))
|
||||
for _t in suggestions.get("suggested_titles", []):
|
||||
if _t not in _opts:
|
||||
_opts.append(_t)
|
||||
st.session_state["_sp_title_options"] = _opts
|
||||
st.session_state["_sp_suggestions"] = suggestions
|
||||
st.rerun()
|
||||
try:
|
||||
suggestions = _suggest_search_terms(_current_titles, RESUME_PATH, _blocklist, _user_profile)
|
||||
except Exception as _e:
|
||||
_err_msg = str(_e)
|
||||
if "exhausted" in _err_msg.lower() or isinstance(_e, RuntimeError):
|
||||
st.warning(
|
||||
f"No LLM backend available: {_err_msg}. "
|
||||
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||
icon="⚠️",
|
||||
)
|
||||
else:
|
||||
st.error(f"Suggestion failed: {_err_msg}", icon="🚨")
|
||||
suggestions = None
|
||||
if suggestions is not None:
|
||||
# Add suggested titles to options list (not auto-selected — user picks from dropdown)
|
||||
_opts = list(st.session_state.get("_sp_title_options", []))
|
||||
_new_titles = [_t for _t in suggestions.get("suggested_titles", []) if _t not in _opts]
|
||||
_opts.extend(_new_titles)
|
||||
st.session_state["_sp_title_options"] = _opts
|
||||
st.session_state["_sp_suggestions"] = suggestions
|
||||
if not _new_titles and not suggestions.get("suggested_excludes"):
|
||||
_resume_hint = " Upload your resume in Settings → Resume Profile for better results." if not RESUME_PATH.exists() else ""
|
||||
st.info(
|
||||
f"No new suggestions found — the LLM didn't generate anything new for these titles.{_resume_hint}",
|
||||
icon="ℹ️",
|
||||
)
|
||||
else:
|
||||
st.rerun()
|
||||
|
||||
if st.session_state.get("_sp_suggestions"):
|
||||
sugg = st.session_state["_sp_suggestions"]
|
||||
|
|
@ -433,8 +475,8 @@ with tab_search:
|
|||
st.session_state["_sp_loc_options"] = _opts
|
||||
if _l not in _sel:
|
||||
_sel.append(_l)
|
||||
st.session_state["_sp_locations_multi"] = _sel
|
||||
st.session_state["_sp_new_loc"] = ""
|
||||
st.session_state["_sp_locs_pending"] = _sel
|
||||
st.session_state["_sp_new_loc_pending"] = ""
|
||||
st.rerun()
|
||||
with st.expander("📋 Paste a list of locations"):
|
||||
st.text_area("One location per line", key="_sp_paste_locs", height=80, label_visibility="collapsed",
|
||||
|
|
@ -449,8 +491,8 @@ with tab_search:
|
|||
if _l not in _sel:
|
||||
_sel.append(_l)
|
||||
st.session_state["_sp_loc_options"] = _opts
|
||||
st.session_state["_sp_locations_multi"] = _sel
|
||||
st.session_state["_sp_paste_locs"] = ""
|
||||
st.session_state["_sp_locs_pending"] = _sel
|
||||
st.session_state["_sp_paste_locs_pending"] = ""
|
||||
st.rerun()
|
||||
|
||||
st.subheader("Exclude Keywords")
|
||||
|
|
@ -585,6 +627,23 @@ def _upload_resume_widget(key_prefix: str) -> None:
|
|||
)
|
||||
|
||||
with tab_resume:
|
||||
# ── LinkedIn import ───────────────────────────────────────────────────────
|
||||
_li_data = st.session_state.pop("_linkedin_extracted", None)
|
||||
if _li_data:
|
||||
# Merge imported data into resume YAML — only bootstrap empty fields,
|
||||
# never overwrite existing detail with sparse LinkedIn data
|
||||
existing = load_yaml(RESUME_PATH)
|
||||
existing.update({k: v for k, v in _li_data.items() if v and not existing.get(k)})
|
||||
RESUME_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
save_yaml(RESUME_PATH, existing)
|
||||
st.success("LinkedIn data applied to resume profile.")
|
||||
st.rerun()
|
||||
|
||||
with st.expander("🔗 Import from LinkedIn", expanded=False):
|
||||
from app.components.linkedin_import import render_linkedin_tab
|
||||
_tab_tier = _profile.tier if _profile else "free"
|
||||
render_linkedin_tab(config_dir=CONFIG_DIR, tier=_tab_tier)
|
||||
|
||||
st.caption(
|
||||
f"Edit {_name}'s application profile. "
|
||||
"Bullets are used as paste-able shortcuts in the Apply Workspace."
|
||||
|
|
@ -744,11 +803,34 @@ with tab_resume:
|
|||
st.balloons()
|
||||
|
||||
st.divider()
|
||||
st.subheader("🏷️ Skills & Keywords")
|
||||
st.caption(
|
||||
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
||||
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
||||
)
|
||||
_kw_header_col, _kw_btn_col = st.columns([5, 1])
|
||||
with _kw_header_col:
|
||||
st.subheader("🏷️ Skills & Keywords")
|
||||
st.caption(
|
||||
f"Matched against job descriptions to surface {_name}'s most relevant experience "
|
||||
"and highlight keyword overlap in research briefs. Search the bundled list or add your own."
|
||||
)
|
||||
with _kw_btn_col:
|
||||
st.write("")
|
||||
st.write("")
|
||||
_run_kw_suggest = st.button(
|
||||
"✨ Suggest", key="kw_suggest_btn",
|
||||
help="Ask the LLM to suggest skills, domains, and keywords based on your resume.",
|
||||
)
|
||||
|
||||
if _run_kw_suggest:
|
||||
_kw_current = load_yaml(KEYWORDS_CFG) if KEYWORDS_CFG.exists() else {}
|
||||
with st.spinner("Asking LLM for keyword suggestions…"):
|
||||
try:
|
||||
_kw_sugg = _suggest_resume_keywords(RESUME_PATH, _kw_current)
|
||||
st.session_state["_kw_suggestions"] = _kw_sugg
|
||||
st.rerun()
|
||||
except RuntimeError as _e:
|
||||
st.warning(
|
||||
f"No LLM backend available: {_e}. "
|
||||
"Check that Ollama is running and has GPU access, or enable a cloud backend in Settings → System → LLM.",
|
||||
icon="⚠️",
|
||||
)
|
||||
|
||||
from scripts.skills_utils import load_suggestions as _load_sugg, filter_tag as _filter_tag
|
||||
|
||||
|
|
@ -769,6 +851,13 @@ with tab_resume:
|
|||
kw_current: list[str] = kw_data.get(kw_category, [])
|
||||
kw_suggestions = _load_sugg(kw_category)
|
||||
|
||||
# If a custom tag was added last render, clear the multiselect's session
|
||||
# state key NOW (before the widget is created) so Streamlit uses `default`
|
||||
# instead of the stale session state that lacks the new tag.
|
||||
_reset_key = f"_kw_reset_{kw_category}"
|
||||
if st.session_state.pop(_reset_key, False):
|
||||
st.session_state.pop(f"kw_ms_{kw_category}", None)
|
||||
|
||||
# Merge: suggestions first, then any custom tags not in suggestions
|
||||
kw_custom = [t for t in kw_current if t not in kw_suggestions]
|
||||
kw_options = kw_suggestions + kw_custom
|
||||
|
|
@ -789,6 +878,7 @@ with tab_resume:
|
|||
label_visibility="collapsed",
|
||||
placeholder=f"Custom: {kw_placeholder}",
|
||||
)
|
||||
_tag_just_added = False
|
||||
if kw_btn_col.button("+", key=f"kw_add_{kw_category}", help="Add custom tag"):
|
||||
cleaned = _filter_tag(kw_raw)
|
||||
if cleaned is None:
|
||||
|
|
@ -796,13 +886,19 @@ with tab_resume:
|
|||
elif cleaned in kw_options:
|
||||
st.info(f"'{cleaned}' is already in the list — select it above.")
|
||||
else:
|
||||
# Persist custom tag: add to YAML and session state so it appears in options
|
||||
# Save to YAML and set a reset flag so the multiselect session
|
||||
# state is cleared before the widget renders on the next rerun,
|
||||
# allowing `default` (which includes the new tag) to take effect.
|
||||
kw_new_list = kw_selected + [cleaned]
|
||||
st.session_state[_reset_key] = True
|
||||
kw_data[kw_category] = kw_new_list
|
||||
kw_changed = True
|
||||
_tag_just_added = True
|
||||
|
||||
# Detect multiselect changes
|
||||
if sorted(kw_selected) != sorted(kw_current):
|
||||
# Detect multiselect changes. Skip when a tag was just added — the change
|
||||
# detection would otherwise overwrite kw_data with the old kw_selected
|
||||
# (which doesn't include the new tag) in the same render.
|
||||
if not _tag_just_added and sorted(kw_selected) != sorted(kw_current):
|
||||
kw_data[kw_category] = kw_selected
|
||||
kw_changed = True
|
||||
|
||||
|
|
@ -812,10 +908,45 @@ with tab_resume:
|
|||
save_yaml(KEYWORDS_CFG, kw_data)
|
||||
st.rerun()
|
||||
|
||||
# ── LLM keyword suggestion chips ──────────────────────────────────────
|
||||
_kw_sugg_data = st.session_state.get("_kw_suggestions")
|
||||
if _kw_sugg_data:
|
||||
_KW_ICONS = {"skills": "🛠️", "domains": "🏢", "keywords": "🔑"}
|
||||
_any_shown = False
|
||||
for _cat, _icon in _KW_ICONS.items():
|
||||
_cat_sugg = [t for t in _kw_sugg_data.get(_cat, [])
|
||||
if t not in kw_data.get(_cat, [])]
|
||||
if not _cat_sugg:
|
||||
continue
|
||||
_any_shown = True
|
||||
st.caption(f"**{_icon} {_cat.capitalize()} suggestions** — click to add:")
|
||||
_chip_cols = st.columns(min(len(_cat_sugg), 4))
|
||||
for _i, _tag in enumerate(_cat_sugg):
|
||||
with _chip_cols[_i % 4]:
|
||||
if st.button(f"+ {_tag}", key=f"kw_sugg_{_cat}_{_i}"):
|
||||
_new_list = list(kw_data.get(_cat, [])) + [_tag]
|
||||
kw_data[_cat] = _new_list
|
||||
save_yaml(KEYWORDS_CFG, kw_data)
|
||||
_kw_sugg_data[_cat] = [t for t in _kw_sugg_data[_cat] if t != _tag]
|
||||
st.session_state["_kw_suggestions"] = _kw_sugg_data
|
||||
st.rerun()
|
||||
if _any_shown:
|
||||
if st.button("✕ Clear suggestions", key="kw_clear_sugg"):
|
||||
st.session_state.pop("_kw_suggestions", None)
|
||||
st.rerun()
|
||||
|
||||
# ── System tab ────────────────────────────────────────────────────────────────
|
||||
with tab_system:
|
||||
st.caption("Infrastructure, LLM backends, integrations, and service connections.")
|
||||
|
||||
if CLOUD_MODE:
|
||||
st.info(
|
||||
"**Your instance is managed by CircuitForge.**\n\n"
|
||||
"Infrastructure, LLM backends, and service settings are configured by the platform. "
|
||||
"To change your plan or billing, visit your [account page](https://circuitforge.tech/account)."
|
||||
)
|
||||
st.stop()
|
||||
|
||||
# ── File Paths & Inference ────────────────────────────────────────────────
|
||||
with st.expander("📁 File Paths & Inference Profile"):
|
||||
_su = _yaml_up.safe_load(USER_CFG.read_text()) or {} if USER_CFG.exists() else {}
|
||||
|
|
@ -873,6 +1004,60 @@ with tab_system:
|
|||
|
||||
st.divider()
|
||||
|
||||
# ── Deployment / Server ───────────────────────────────────────────────────
|
||||
with st.expander("🖥️ Deployment / Server", expanded=False):
|
||||
st.caption(
|
||||
"Settings that affect how Peregrine is served. "
|
||||
"Changes require a restart (`./manage.sh restart`) to take effect."
|
||||
)
|
||||
|
||||
_srv = _yaml_up.safe_load(SERVER_CFG.read_text()) if SERVER_CFG.exists() else {}
|
||||
_srv_example = _yaml_up.safe_load(SERVER_CFG_EXAMPLE.read_text()) if SERVER_CFG_EXAMPLE.exists() else {}
|
||||
_srv_defaults = {**_srv_example, **_srv}
|
||||
|
||||
_active_base_url = _os.environ.get("STREAMLIT_SERVER_BASE_URL_PATH", "")
|
||||
if _active_base_url:
|
||||
st.info(f"**Active base URL path:** `/{_active_base_url}` (set via environment)")
|
||||
else:
|
||||
st.info("**Active base URL path:** *(none — serving at root `/`)*")
|
||||
|
||||
s_base_url = st.text_input(
|
||||
"Base URL path",
|
||||
value=_srv_defaults.get("base_url_path", ""),
|
||||
placeholder="e.g. peregrine",
|
||||
help=(
|
||||
"URL prefix when serving behind a reverse proxy at a sub-path. "
|
||||
"Leave empty for direct access. "
|
||||
"Maps to STREAMLIT_BASE_URL_PATH in .env.\n\n"
|
||||
"Docs: https://docs.streamlit.io/develop/api-reference/configuration/config.toml#server.baseUrlPath"
|
||||
),
|
||||
)
|
||||
s_server_port = st.number_input(
|
||||
"Container port",
|
||||
value=int(_srv_defaults.get("server_port", 8501)),
|
||||
min_value=1024, max_value=65535, step=1,
|
||||
help="Port Streamlit listens on inside the container. The host port is set via STREAMLIT_PORT in .env.",
|
||||
)
|
||||
|
||||
if st.button("💾 Save Deployment Settings", key="save_server"):
|
||||
_new_srv = {"base_url_path": s_base_url.strip(), "server_port": int(s_server_port)}
|
||||
save_yaml(SERVER_CFG, _new_srv)
|
||||
# Mirror base_url_path into .env so compose picks it up on next restart
|
||||
_env_path = Path(__file__).parent.parent.parent / ".env"
|
||||
if _env_path.exists():
|
||||
_env_lines = [l for l in _env_path.read_text().splitlines()
|
||||
if not l.startswith("STREAMLIT_BASE_URL_PATH=")]
|
||||
_env_lines.append(f"STREAMLIT_BASE_URL_PATH={s_base_url.strip()}")
|
||||
_env_path.write_text("\n".join(_env_lines) + "\n")
|
||||
st.success("Deployment settings saved. Run `./manage.sh restart` to apply.")
|
||||
|
||||
st.divider()
|
||||
from app.components.ui_switcher import render_settings_toggle as _render_ui_toggle
|
||||
_ui_tier = _profile.tier if _profile else "free"
|
||||
_render_ui_toggle(yaml_path=_USER_YAML, tier=_ui_tier)
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── LLM Backends ─────────────────────────────────────────────────────────
|
||||
with st.expander("🤖 LLM Backends", expanded=False):
|
||||
import requests as _req
|
||||
|
|
@ -953,18 +1138,88 @@ with tab_system:
|
|||
f"{'✓' if llm_backends.get(n, {}).get('enabled', True) else '✗'} {n}"
|
||||
for n in llm_new_order
|
||||
))
|
||||
if st.button("💾 Save LLM settings", type="primary", key="sys_save_llm"):
|
||||
save_yaml(LLM_CFG, {**llm_cfg, "backends": llm_updated_backends, "fallback_order": llm_new_order})
|
||||
# ── Cloud backend warning + acknowledgment ─────────────────────────────
|
||||
from scripts.byok_guard import cloud_backends as _cloud_backends
|
||||
|
||||
_pending_cfg = {**llm_cfg, "backends": llm_updated_backends, "fallback_order": llm_new_order}
|
||||
_pending_cloud = set(_cloud_backends(_pending_cfg))
|
||||
|
||||
_user_cfg_for_ack = yaml.safe_load(USER_CFG.read_text(encoding="utf-8")) or {} if USER_CFG.exists() else {}
|
||||
_already_acked = set(_user_cfg_for_ack.get("byok_acknowledged_backends", []))
|
||||
# Intentional: once a backend is acknowledged, it stays acknowledged even if
|
||||
# temporarily disabled and re-enabled. This avoids nagging returning users.
|
||||
_unacknowledged = _pending_cloud - _already_acked
|
||||
|
||||
def _do_save_llm(ack_backends: set) -> None:
|
||||
"""Write llm.yaml and update acknowledgment in user.yaml."""
|
||||
save_yaml(LLM_CFG, _pending_cfg)
|
||||
st.session_state.pop("_llm_order", None)
|
||||
st.session_state.pop("_llm_order_cfg_key", None)
|
||||
if ack_backends:
|
||||
# Re-read user.yaml at save time (not at render time) to avoid
|
||||
# overwriting changes made by other processes between render and save.
|
||||
_uy = yaml.safe_load(USER_CFG.read_text(encoding="utf-8")) or {} if USER_CFG.exists() else {}
|
||||
_uy["byok_acknowledged_backends"] = sorted(_already_acked | ack_backends)
|
||||
save_yaml(USER_CFG, _uy)
|
||||
st.success("LLM settings saved!")
|
||||
|
||||
if _unacknowledged:
|
||||
_provider_labels = ", ".join(b.replace("_", " ").title() for b in sorted(_unacknowledged))
|
||||
_policy_links = []
|
||||
for _b in sorted(_unacknowledged):
|
||||
if _b in ("anthropic", "claude_code"):
|
||||
_policy_links.append("[Anthropic privacy policy](https://www.anthropic.com/privacy)")
|
||||
elif _b == "openai":
|
||||
_policy_links.append("[OpenAI privacy policy](https://openai.com/policies/privacy-policy)")
|
||||
_policy_str = " · ".join(_policy_links) if _policy_links else "Review your provider's documentation."
|
||||
|
||||
st.warning(
|
||||
f"**Cloud LLM active — your data will leave this machine**\n\n"
|
||||
f"Enabling **{_provider_labels}** means AI features will send content "
|
||||
f"directly to that provider. CircuitForge does not receive or log it, "
|
||||
f"but their privacy policy governs it — not ours.\n\n"
|
||||
f"**What leaves your machine:**\n"
|
||||
f"- Cover letter generation: your resume, job description, and profile\n"
|
||||
f"- Keyword suggestions: your skills list and resume summary\n"
|
||||
f"- Survey assistant: survey question text\n"
|
||||
f"- Company research / Interview prep: company name and role only\n\n"
|
||||
f"**What stays local always:** your jobs database, email credentials, "
|
||||
f"license key, and Notion token.\n\n"
|
||||
f"For sensitive data (disability, immigration, medical), a local model is "
|
||||
f"strongly recommended. These tools assist with paperwork — they don't "
|
||||
f"replace professional advice.\n\n"
|
||||
f"{_policy_str} · "
|
||||
f"[CircuitForge privacy policy](https://circuitforge.tech/privacy)",
|
||||
icon="⚠️",
|
||||
)
|
||||
|
||||
_ack = st.checkbox(
|
||||
f"I understand — content will be sent to **{_provider_labels}** when I use AI features",
|
||||
key="byok_ack_checkbox",
|
||||
)
|
||||
_col_cancel, _col_save = st.columns(2)
|
||||
if _col_cancel.button("Cancel", key="byok_cancel"):
|
||||
st.session_state.pop("byok_ack_checkbox", None)
|
||||
st.rerun()
|
||||
if _col_save.button(
|
||||
"💾 Save with cloud LLM",
|
||||
type="primary",
|
||||
key="sys_save_llm_cloud",
|
||||
disabled=not _ack,
|
||||
):
|
||||
_do_save_llm(_unacknowledged)
|
||||
else:
|
||||
if st.button("💾 Save LLM settings", type="primary", key="sys_save_llm"):
|
||||
_do_save_llm(set())
|
||||
|
||||
# ── Services ──────────────────────────────────────────────────────────────
|
||||
with st.expander("🔌 Services", expanded=True):
|
||||
import subprocess as _sp
|
||||
import shutil as _shutil
|
||||
import os as _os
|
||||
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
||||
COMPOSE_DIR = str(Path(__file__).parent.parent.parent)
|
||||
_compose_env = {**_os.environ, "COMPOSE_PROJECT_NAME": "peregrine"}
|
||||
_docker_available = bool(_shutil.which("docker"))
|
||||
_sys_profile_name = _profile.inference_profile if _profile else "remote"
|
||||
SYS_SERVICES = [
|
||||
|
|
@ -1056,7 +1311,7 @@ with tab_system:
|
|||
elif up:
|
||||
if st.button("⏹ Stop", key=f"sys_svc_stop_{svc['port']}", use_container_width=True):
|
||||
with st.spinner(f"Stopping {svc['name']}…"):
|
||||
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"])
|
||||
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||
st.success("Stopped.") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||
st.rerun()
|
||||
else:
|
||||
|
|
@ -1067,7 +1322,7 @@ with tab_system:
|
|||
_start_cmd.append(_sel)
|
||||
if st.button("▶ Start", key=f"sys_svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
||||
with st.spinner(f"Starting {svc['name']}…"):
|
||||
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"])
|
||||
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"], env=_compose_env)
|
||||
st.success("Started!") if r.returncode == 0 else st.error(r.stderr or r.stdout)
|
||||
st.rerun()
|
||||
|
||||
|
|
@ -1206,12 +1461,11 @@ with tab_finetune:
|
|||
st.markdown("**Step 2: Extract Training Pairs**")
|
||||
import json as _json
|
||||
import sqlite3 as _sqlite3
|
||||
from scripts.db import DEFAULT_DB as _FT_DB
|
||||
|
||||
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
|
||||
|
||||
# Show task status
|
||||
_ft_conn = _sqlite3.connect(_FT_DB)
|
||||
_ft_conn = _sqlite3.connect(get_db_path())
|
||||
_ft_conn.row_factory = _sqlite3.Row
|
||||
_ft_task = _ft_conn.execute(
|
||||
"SELECT * FROM background_tasks WHERE task_type='prepare_training' ORDER BY id DESC LIMIT 1"
|
||||
|
|
@ -1295,6 +1549,13 @@ with tab_finetune:
|
|||
with tab_license:
|
||||
st.subheader("🔑 License")
|
||||
|
||||
if CLOUD_MODE:
|
||||
_cloud_tier = st.session_state.get("cloud_tier", "free")
|
||||
st.success(f"**{_cloud_tier.title()} tier** — managed via your CircuitForge account")
|
||||
st.caption("Your plan is tied to your account and applied automatically.")
|
||||
st.page_link("https://circuitforge.tech/account", label="Manage plan →", icon="🔗")
|
||||
st.stop()
|
||||
|
||||
from scripts.license import (
|
||||
verify_local as _verify_local,
|
||||
activate as _activate,
|
||||
|
|
@ -1338,6 +1599,103 @@ with tab_license:
|
|||
except Exception as _e:
|
||||
st.error(f"Activation failed: {_e}")
|
||||
|
||||
# ── Data tab — Backup / Restore / Teleport ────────────────────────────────────
|
||||
with tab_data:
|
||||
st.subheader("💾 Backup / Restore / Teleport")
|
||||
st.caption(
|
||||
"Export all your personal configs and job data as a portable zip. "
|
||||
"Use to migrate between machines, back up before testing, or transfer to a new Docker volume."
|
||||
)
|
||||
|
||||
from scripts.backup import create_backup, list_backup_contents, restore_backup as _do_restore
|
||||
|
||||
# Cloud mode: per-user data lives at get_db_path().parent — not the app root.
|
||||
# db_key is used to transparently decrypt on export and re-encrypt on import.
|
||||
_db_key = st.session_state.get("db_key", "") if CLOUD_MODE else ""
|
||||
_base_dir = get_db_path().parent if (CLOUD_MODE and st.session_state.get("db_path")) else Path(__file__).parent.parent.parent
|
||||
|
||||
# ── Backup ────────────────────────────────────────────────────────────────
|
||||
st.markdown("### 📦 Create Backup")
|
||||
_incl_db = st.checkbox("Include staging.db (job data)", value=True, key="backup_incl_db")
|
||||
if st.button("Create Backup", key="backup_create"):
|
||||
with st.spinner("Creating backup…"):
|
||||
try:
|
||||
_zip_bytes = create_backup(_base_dir, include_db=_incl_db, db_key=_db_key)
|
||||
_info = list_backup_contents(_zip_bytes)
|
||||
from datetime import datetime as _dt
|
||||
_ts = _dt.now().strftime("%Y%m%d-%H%M%S")
|
||||
_fname = f"peregrine-backup-{_ts}.zip"
|
||||
st.success(
|
||||
f"Backup ready — {len(_info['files'])} files, "
|
||||
f"{_info['total_bytes'] / 1024:.0f} KB uncompressed"
|
||||
)
|
||||
st.download_button(
|
||||
label="⬇️ Download backup zip",
|
||||
data=_zip_bytes,
|
||||
file_name=_fname,
|
||||
mime="application/zip",
|
||||
key="backup_download",
|
||||
)
|
||||
with st.expander("Files included"):
|
||||
for _fn in _info["files"]:
|
||||
_sz = _info["sizes"].get(_fn, 0)
|
||||
st.caption(f"`{_fn}` — {_sz:,} bytes")
|
||||
except Exception as _e:
|
||||
st.error(f"Backup failed: {_e}")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Restore ───────────────────────────────────────────────────────────────
|
||||
st.markdown("### 📂 Restore from Backup")
|
||||
st.warning(
|
||||
"Restoring overwrites existing config files and (optionally) staging.db. "
|
||||
"Create a fresh backup first if you want to preserve current settings.",
|
||||
icon="⚠️",
|
||||
)
|
||||
_restore_file = st.file_uploader(
|
||||
"Upload backup zip", type=["zip"], key="restore_upload",
|
||||
help="Select a peregrine-backup-*.zip created by this tool."
|
||||
)
|
||||
_restore_db = st.checkbox("Restore staging.db (job data)", value=True, key="restore_incl_db")
|
||||
_restore_overwrite = st.checkbox("Overwrite existing files", value=True, key="restore_overwrite")
|
||||
|
||||
if _restore_file and st.button("Restore", type="primary", key="restore_go"):
|
||||
with st.spinner("Restoring…"):
|
||||
try:
|
||||
_zip_bytes = _restore_file.read()
|
||||
_result = _do_restore(
|
||||
_zip_bytes, _base_dir,
|
||||
include_db=_restore_db,
|
||||
overwrite=_restore_overwrite,
|
||||
db_key=_db_key,
|
||||
)
|
||||
st.success(f"Restored {len(_result['restored'])} files.")
|
||||
with st.expander("Details"):
|
||||
for _fn in _result["restored"]:
|
||||
st.caption(f"✓ `{_fn}`")
|
||||
for _fn in _result["skipped"]:
|
||||
st.caption(f"— `{_fn}` (skipped)")
|
||||
st.info("Restart the app for changes to take effect.", icon="ℹ️")
|
||||
except Exception as _e:
|
||||
st.error(f"Restore failed: {_e}")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Teleport ──────────────────────────────────────────────────────────────
|
||||
st.markdown("### 🚀 Teleport to Another Machine")
|
||||
st.markdown("""
|
||||
**How to move Peregrine to a new machine or Docker volume:**
|
||||
|
||||
1. **Here (source):** click **Create Backup** above and download the zip.
|
||||
2. **On the target machine:** clone the repo and run `./manage.sh start`.
|
||||
3. **In the target Peregrine UI:** go to Settings → 💾 Data → Restore from Backup and upload the zip.
|
||||
4. Restart the target app: `./manage.sh restart`.
|
||||
|
||||
The zip contains all gitignored configs (email credentials, Notion token, LLM settings, resume YAML)
|
||||
and optionally your staging database (all discovered/applied jobs, contacts, cover letters).
|
||||
""")
|
||||
|
||||
|
||||
# ── Developer tab ─────────────────────────────────────────────────────────────
|
||||
if _show_dev_tab:
|
||||
with _all_tabs[-1]:
|
||||
|
|
@ -1409,3 +1767,161 @@ if _show_dev_tab:
|
|||
st.error(f"Invalid token ({resp.status_code})")
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
|
||||
st.divider()
|
||||
st.markdown("**📊 Export Classifier Training Data**")
|
||||
st.caption(
|
||||
"Exports inbound emails from `job_contacts` (labeled by the IMAP sync classifier) "
|
||||
"to `data/email_score.jsonl` for use with `scripts/benchmark_classifier.py --score`. "
|
||||
"⚠️ Labels are generated by llama3.1:8b — review before using as ground truth."
|
||||
)
|
||||
_db_candidates = [
|
||||
Path(__file__).parent.parent.parent / "data" / "staging.db",
|
||||
Path(__file__).parent.parent.parent / "staging.db",
|
||||
]
|
||||
_db_path = next((p for p in _db_candidates if p.exists()), None)
|
||||
_score_out = Path(__file__).parent.parent.parent / "data" / "email_score.jsonl"
|
||||
|
||||
if _db_path is None:
|
||||
st.warning("No `staging.db` found — run discovery first to create the database.")
|
||||
else:
|
||||
st.caption(f"Database: `{_db_path.name}` · Output: `data/email_score.jsonl`")
|
||||
if st.button("📤 Export DB labels → email_score.jsonl", key="dev_export_db"):
|
||||
import sqlite3 as _sqlite3
|
||||
from scripts.benchmark_classifier import LABELS as _BC_LABELS
|
||||
_conn = _sqlite3.connect(_db_path)
|
||||
_cur = _conn.cursor()
|
||||
_cur.execute("""
|
||||
SELECT subject, body, stage_signal
|
||||
FROM job_contacts
|
||||
WHERE stage_signal IS NOT NULL
|
||||
AND stage_signal != ''
|
||||
AND direction = 'inbound'
|
||||
ORDER BY received_at
|
||||
""")
|
||||
_rows = _cur.fetchall()
|
||||
_conn.close()
|
||||
|
||||
if not _rows:
|
||||
st.warning("No labeled emails in `job_contacts`. Run IMAP sync first.")
|
||||
else:
|
||||
_score_out.parent.mkdir(parents=True, exist_ok=True)
|
||||
_written, _skipped = 0, 0
|
||||
_label_counts: dict = {}
|
||||
with _score_out.open("w") as _f:
|
||||
for _subj, _body, _label in _rows:
|
||||
if _label not in _BC_LABELS:
|
||||
_skipped += 1
|
||||
continue
|
||||
import json as _json_dev
|
||||
_f.write(_json_dev.dumps({
|
||||
"subject": _subj or "",
|
||||
"body": (_body or "")[:800],
|
||||
"label": _label,
|
||||
}) + "\n")
|
||||
_written += 1
|
||||
_label_counts[_label] = _label_counts.get(_label, 0) + 1
|
||||
st.success(f"Exported **{_written}** emails → `data/email_score.jsonl` ({_skipped} skipped — unknown labels)")
|
||||
st.caption("Label distribution:")
|
||||
for _lbl, _cnt in sorted(_label_counts.items(), key=lambda x: -x[1]):
|
||||
st.caption(f" `{_lbl}`: {_cnt}")
|
||||
|
||||
# ── Privacy & Telemetry (cloud mode only) ─────────────────────────────────────
|
||||
if CLOUD_MODE and tab_privacy is not None:
|
||||
with tab_privacy:
|
||||
from app.telemetry import get_consent as _get_consent, update_consent as _update_consent
|
||||
|
||||
st.subheader("🔒 Privacy & Telemetry")
|
||||
st.caption(
|
||||
"You have full, unconditional control over what data leaves your session. "
|
||||
"Changes take effect immediately."
|
||||
)
|
||||
|
||||
_uid = st.session_state.get("user_id", "")
|
||||
_consent = _get_consent(_uid) if _uid else {
|
||||
"all_disabled": False,
|
||||
"usage_events_enabled": True,
|
||||
"content_sharing_enabled": False,
|
||||
"support_access_enabled": False,
|
||||
}
|
||||
|
||||
with st.expander("📊 Usage & Telemetry", expanded=True):
|
||||
st.markdown(
|
||||
"CircuitForge is built by a tiny team. Anonymous usage data helps us fix the "
|
||||
"parts of the job search that are broken. You can opt out at any time."
|
||||
)
|
||||
|
||||
_all_off = st.toggle(
|
||||
"🚫 Disable ALL telemetry",
|
||||
value=bool(_consent.get("all_disabled", False)),
|
||||
key="privacy_all_disabled",
|
||||
help="Hard kill switch — overrides all options below. Nothing is written or transmitted.",
|
||||
)
|
||||
if _all_off != _consent.get("all_disabled", False) and _uid:
|
||||
_update_consent(_uid, all_disabled=_all_off)
|
||||
st.rerun()
|
||||
|
||||
st.divider()
|
||||
|
||||
_disabled = _all_off # grey out individual toggles when master switch is on
|
||||
|
||||
_usage_on = st.toggle(
|
||||
"📈 Share anonymous usage statistics",
|
||||
value=bool(_consent.get("usage_events_enabled", True)),
|
||||
disabled=_disabled,
|
||||
key="privacy_usage_events",
|
||||
help="Feature usage, error rates, completion counts — no content, no PII.",
|
||||
)
|
||||
if not _disabled and _usage_on != _consent.get("usage_events_enabled", True) and _uid:
|
||||
_update_consent(_uid, usage_events_enabled=_usage_on)
|
||||
st.rerun()
|
||||
|
||||
_content_on = st.toggle(
|
||||
"📝 Share de-identified content for model improvement",
|
||||
value=bool(_consent.get("content_sharing_enabled", False)),
|
||||
disabled=_disabled,
|
||||
key="privacy_content_sharing",
|
||||
help=(
|
||||
"Opt-in: anonymised cover letters (PII stripped) may be used to improve "
|
||||
"the CircuitForge fine-tuned model. Never shared with third parties."
|
||||
),
|
||||
)
|
||||
if not _disabled and _content_on != _consent.get("content_sharing_enabled", False) and _uid:
|
||||
_update_consent(_uid, content_sharing_enabled=_content_on)
|
||||
st.rerun()
|
||||
|
||||
st.divider()
|
||||
with st.expander("🎫 Temporary Support Access", expanded=False):
|
||||
st.caption(
|
||||
"Grant CircuitForge support read-only access to your session for a specific "
|
||||
"support ticket. Time-limited and revocable. You will be notified when access "
|
||||
"expires or is used."
|
||||
)
|
||||
from datetime import datetime as _dt, timedelta as _td
|
||||
_hours = st.selectbox(
|
||||
"Access duration", [4, 8, 24, 48, 72],
|
||||
format_func=lambda h: f"{h} hours",
|
||||
key="privacy_support_hours",
|
||||
)
|
||||
_ticket = st.text_input("Support ticket reference (optional)", key="privacy_ticket_ref")
|
||||
if st.button("Grant temporary support access", key="privacy_support_grant"):
|
||||
if _uid:
|
||||
try:
|
||||
from app.telemetry import get_platform_conn as _get_pc
|
||||
_pc = _get_pc()
|
||||
_expires = _dt.utcnow() + _td(hours=_hours)
|
||||
with _pc.cursor() as _cur:
|
||||
_cur.execute(
|
||||
"INSERT INTO support_access_grants "
|
||||
"(user_id, expires_at, ticket_ref) VALUES (%s, %s, %s)",
|
||||
(_uid, _expires, _ticket or None),
|
||||
)
|
||||
_pc.commit()
|
||||
st.success(
|
||||
f"Support access granted until {_expires.strftime('%Y-%m-%d %H:%M')} UTC. "
|
||||
"You can revoke it here at any time."
|
||||
)
|
||||
except Exception as _e:
|
||||
st.error(f"Could not save grant: {_e}")
|
||||
else:
|
||||
st.warning("Session not resolved — please reload the page.")
|
||||
|
|
|
|||
|
|
@ -15,24 +15,27 @@ import streamlit.components.v1 as components
|
|||
import yaml
|
||||
|
||||
from scripts.user_profile import UserProfile
|
||||
|
||||
_USER_YAML = Path(__file__).parent.parent.parent / "config" / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Job Seeker"
|
||||
|
||||
from scripts.db import (
|
||||
DEFAULT_DB, init_db, get_jobs_by_status,
|
||||
update_cover_letter, mark_applied, update_job_status,
|
||||
get_task_for_job,
|
||||
)
|
||||
from scripts.task_runner import submit_task
|
||||
|
||||
DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
|
||||
RESUME_YAML = Path(__file__).parent.parent.parent / "config" / "plain_text_resume.yaml"
|
||||
from app.cloud_session import resolve_session, get_db_path, get_config_dir
|
||||
from app.telemetry import log_usage_event
|
||||
|
||||
st.title("🚀 Apply Workspace")
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
resolve_session("peregrine")
|
||||
init_db(get_db_path())
|
||||
|
||||
_CONFIG_DIR = get_config_dir()
|
||||
_USER_YAML = _CONFIG_DIR / "user.yaml"
|
||||
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
|
||||
_name = _profile.name if _profile else "Job Seeker"
|
||||
|
||||
DOCS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
|
||||
RESUME_YAML = _CONFIG_DIR / "plain_text_resume.yaml"
|
||||
|
||||
# ── PDF generation ─────────────────────────────────────────────────────────────
|
||||
def _make_cover_letter_pdf(job: dict, cover_letter: str, output_dir: Path) -> Path:
|
||||
|
|
@ -156,7 +159,7 @@ def _copy_btn(text: str, label: str = "📋 Copy", done: str = "✅ Copied!", he
|
|||
)
|
||||
|
||||
# ── Job selection ──────────────────────────────────────────────────────────────
|
||||
approved = get_jobs_by_status(DEFAULT_DB, "approved")
|
||||
approved = get_jobs_by_status(get_db_path(), "approved")
|
||||
if not approved:
|
||||
st.info("No approved jobs — head to Job Review to approve some listings first.")
|
||||
st.stop()
|
||||
|
|
@ -219,17 +222,17 @@ with col_tools:
|
|||
if _cl_key not in st.session_state:
|
||||
st.session_state[_cl_key] = job.get("cover_letter") or ""
|
||||
|
||||
_cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||
_cl_task = get_task_for_job(get_db_path(), "cover_letter", selected_id)
|
||||
_cl_running = _cl_task and _cl_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)):
|
||||
submit_task(DEFAULT_DB, "cover_letter", selected_id)
|
||||
submit_task(get_db_path(), "cover_letter", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _cl_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _cl_status_fragment():
|
||||
t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||
t = get_task_for_job(get_db_path(), "cover_letter", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…"
|
||||
st.info(f"⏳ {lbl}")
|
||||
|
|
@ -272,7 +275,7 @@ with col_tools:
|
|||
key=f"cl_refine_{selected_id}"):
|
||||
import json as _json
|
||||
submit_task(
|
||||
DEFAULT_DB, "cover_letter", selected_id,
|
||||
get_db_path(), "cover_letter", selected_id,
|
||||
params=_json.dumps({
|
||||
"previous_result": cl_text,
|
||||
"feedback": feedback_text.strip(),
|
||||
|
|
@ -288,7 +291,7 @@ with col_tools:
|
|||
_copy_btn(cl_text, label="📋 Copy Letter")
|
||||
with c2:
|
||||
if st.button("💾 Save draft", use_container_width=True):
|
||||
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||
update_cover_letter(get_db_path(), selected_id, cl_text)
|
||||
st.success("Saved!")
|
||||
|
||||
# PDF generation
|
||||
|
|
@ -297,8 +300,10 @@ with col_tools:
|
|||
with st.spinner("Generating PDF…"):
|
||||
try:
|
||||
pdf_path = _make_cover_letter_pdf(job, cl_text, DOCS_DIR)
|
||||
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||
update_cover_letter(get_db_path(), selected_id, cl_text)
|
||||
st.success(f"Saved: `{pdf_path.name}`")
|
||||
if user_id := st.session_state.get("user_id"):
|
||||
log_usage_event(user_id, "peregrine", "cover_letter_generated")
|
||||
except Exception as e:
|
||||
st.error(f"PDF error: {e}")
|
||||
|
||||
|
|
@ -312,13 +317,15 @@ with col_tools:
|
|||
with c4:
|
||||
if st.button("✅ Mark as Applied", use_container_width=True, type="primary"):
|
||||
if cl_text:
|
||||
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||
mark_applied(DEFAULT_DB, [selected_id])
|
||||
update_cover_letter(get_db_path(), selected_id, cl_text)
|
||||
mark_applied(get_db_path(), [selected_id])
|
||||
st.success("Marked as applied!")
|
||||
if user_id := st.session_state.get("user_id"):
|
||||
log_usage_event(user_id, "peregrine", "job_applied")
|
||||
st.rerun()
|
||||
|
||||
if st.button("🚫 Reject listing", use_container_width=True):
|
||||
update_job_status(DEFAULT_DB, [selected_id], "rejected")
|
||||
update_job_status(get_db_path(), [selected_id], "rejected")
|
||||
# Advance selectbox to next job so list doesn't snap to first item
|
||||
current_idx = ids.index(selected_id) if selected_id in ids else 0
|
||||
if current_idx + 1 < len(ids):
|
||||
|
|
@ -382,7 +389,7 @@ with col_tools:
|
|||
|
||||
st.markdown("---")
|
||||
else:
|
||||
st.warning("Resume YAML not found — check that AIHawk is cloned.")
|
||||
st.warning("Resume profile not found — complete setup or upload a resume in Settings → Resume Profile.")
|
||||
|
||||
# ── Application Q&A ───────────────────────────────────────────────────────
|
||||
with st.expander("💬 Answer Application Questions"):
|
||||
|
|
|
|||
|
|
@ -31,31 +31,41 @@ _name = _profile.name if _profile else "Job Seeker"
|
|||
from scripts.db import (
|
||||
DEFAULT_DB, init_db,
|
||||
get_interview_jobs, advance_to_stage, reject_at_stage,
|
||||
set_interview_date, add_contact, get_contacts,
|
||||
set_interview_date, set_calendar_event_id, add_contact, get_contacts,
|
||||
get_research, get_task_for_job, get_job_by_id,
|
||||
get_unread_stage_signals, dismiss_stage_signal,
|
||||
)
|
||||
from scripts.task_runner import submit_task
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
|
||||
resolve_session("peregrine")
|
||||
|
||||
_CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
|
||||
_CALENDAR_INTEGRATIONS = ("apple_calendar", "google_calendar")
|
||||
_calendar_connected = any(
|
||||
(_CONFIG_DIR / "integrations" / f"{n}.yaml").exists()
|
||||
for n in _CALENDAR_INTEGRATIONS
|
||||
)
|
||||
|
||||
st.title("🎯 Interviews")
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
init_db(get_db_path())
|
||||
|
||||
# ── Sidebar: Email sync ────────────────────────────────────────────────────────
|
||||
with st.sidebar:
|
||||
st.markdown("### 📧 Email Sync")
|
||||
_email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
|
||||
_email_task = get_task_for_job(get_db_path(), "email_sync", 0)
|
||||
_email_running = _email_task and _email_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("🔄 Sync Emails", use_container_width=True, type="primary",
|
||||
disabled=bool(_email_running)):
|
||||
submit_task(DEFAULT_DB, "email_sync", 0)
|
||||
submit_task(get_db_path(), "email_sync", 0)
|
||||
st.rerun()
|
||||
|
||||
if _email_running:
|
||||
@st.fragment(run_every=4)
|
||||
def _email_sidebar_status():
|
||||
t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
|
||||
t = get_task_for_job(get_db_path(), "email_sync", 0)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
st.info("⏳ Syncing…")
|
||||
else:
|
||||
|
|
@ -92,7 +102,7 @@ STAGE_NEXT_LABEL = {
|
|||
}
|
||||
|
||||
# ── Data ──────────────────────────────────────────────────────────────────────
|
||||
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||
jobs_by_stage = get_interview_jobs(get_db_path())
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
def _days_ago(date_str: str | None) -> str:
|
||||
|
|
@ -113,8 +123,8 @@ def _days_ago(date_str: str | None) -> str:
|
|||
def _research_modal(job: dict) -> None:
|
||||
job_id = job["id"]
|
||||
st.caption(f"**{job.get('company')}** — {job.get('title')}")
|
||||
research = get_research(DEFAULT_DB, job_id=job_id)
|
||||
task = get_task_for_job(DEFAULT_DB, "company_research", job_id)
|
||||
research = get_research(get_db_path(), job_id=job_id)
|
||||
task = get_task_for_job(get_db_path(), "company_research", job_id)
|
||||
running = task and task["status"] in ("queued", "running")
|
||||
|
||||
if running:
|
||||
|
|
@ -137,7 +147,7 @@ def _research_modal(job: dict) -> None:
|
|||
"inaccuracies. SearXNG is now available — re-run to get verified facts."
|
||||
)
|
||||
if st.button("🔄 Re-run with live data", key=f"modal_rescrape_{job_id}", type="primary"):
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
st.rerun()
|
||||
st.divider()
|
||||
else:
|
||||
|
|
@ -153,14 +163,14 @@ def _research_modal(job: dict) -> None:
|
|||
)
|
||||
st.markdown(research["raw_output"])
|
||||
if st.button("🔄 Refresh", key=f"modal_regen_{job_id}", disabled=bool(running)):
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
st.rerun()
|
||||
else:
|
||||
st.info("No research brief yet.")
|
||||
if task and task["status"] == "failed":
|
||||
st.error(f"Last attempt failed: {task.get('error', '')}")
|
||||
if st.button("🔬 Generate now", key=f"modal_gen_{job_id}"):
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
st.rerun()
|
||||
|
||||
|
||||
|
|
@ -168,7 +178,7 @@ def _research_modal(job: dict) -> None:
|
|||
def _email_modal(job: dict) -> None:
|
||||
job_id = job["id"]
|
||||
st.caption(f"**{job.get('company')}** — {job.get('title')}")
|
||||
contacts = get_contacts(DEFAULT_DB, job_id=job_id)
|
||||
contacts = get_contacts(get_db_path(), job_id=job_id)
|
||||
|
||||
if not contacts:
|
||||
st.info("No emails logged yet. Use the form below to add one.")
|
||||
|
|
@ -239,7 +249,7 @@ def _email_modal(job: dict) -> None:
|
|||
body_text = st.text_area("Body / notes", height=80, key=f"body_modal_{job_id}")
|
||||
if st.form_submit_button("📧 Save contact"):
|
||||
add_contact(
|
||||
DEFAULT_DB, job_id=job_id,
|
||||
get_db_path(), job_id=job_id,
|
||||
direction=direction, subject=subject,
|
||||
from_addr=from_addr, body=body_text, received_at=recv_at,
|
||||
)
|
||||
|
|
@ -248,7 +258,7 @@ def _email_modal(job: dict) -> None:
|
|||
def _render_card(job: dict, stage: str, compact: bool = False) -> None:
|
||||
"""Render a single job card appropriate for the given stage."""
|
||||
job_id = job["id"]
|
||||
contacts = get_contacts(DEFAULT_DB, job_id=job_id)
|
||||
contacts = get_contacts(get_db_path(), job_id=job_id)
|
||||
last_contact = contacts[-1] if contacts else None
|
||||
|
||||
with st.container(border=True):
|
||||
|
|
@ -271,13 +281,26 @@ def _render_card(job: dict, stage: str, compact: bool = False) -> None:
|
|||
format="YYYY-MM-DD",
|
||||
)
|
||||
if st.form_submit_button("📅 Save date"):
|
||||
set_interview_date(DEFAULT_DB, job_id=job_id, date_str=str(new_date))
|
||||
set_interview_date(get_db_path(), job_id=job_id, date_str=str(new_date))
|
||||
st.success("Saved!")
|
||||
st.rerun()
|
||||
|
||||
# Calendar push — only shown when a date is saved and an integration is connected
|
||||
if current_idate and _calendar_connected:
|
||||
_has_event = bool(job.get("calendar_event_id"))
|
||||
_cal_label = "🔄 Update Calendar" if _has_event else "📅 Add to Calendar"
|
||||
if st.button(_cal_label, key=f"cal_push_{job_id}", use_container_width=True):
|
||||
from scripts.calendar_push import push_interview_event
|
||||
result = push_interview_event(get_db_path(), job_id=job_id, config_dir=_CONFIG_DIR)
|
||||
if result["ok"]:
|
||||
st.success(f"Event {'updated' if _has_event else 'added'} ({result['provider'].replace('_', ' ').title()})")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error(result["error"])
|
||||
|
||||
if not compact:
|
||||
if stage in ("applied", "phone_screen", "interviewing"):
|
||||
signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
|
||||
signals = get_unread_stage_signals(get_db_path(), job_id=job_id)
|
||||
if signals:
|
||||
sig = signals[-1]
|
||||
_SIGNAL_TO_STAGE = {
|
||||
|
|
@ -298,23 +321,23 @@ def _render_card(job: dict, stage: str, compact: bool = False) -> None:
|
|||
if sig["stage_signal"] == "rejected":
|
||||
if b1.button("✗ Reject", key=f"sig_rej_{sig['id']}",
|
||||
use_container_width=True):
|
||||
reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
|
||||
dismiss_stage_signal(DEFAULT_DB, sig["id"])
|
||||
reject_at_stage(get_db_path(), job_id=job_id, rejection_stage=stage)
|
||||
dismiss_stage_signal(get_db_path(), sig["id"])
|
||||
st.rerun(scope="app")
|
||||
elif target_stage and b1.button(
|
||||
f"→ {target_label}", key=f"sig_adv_{sig['id']}",
|
||||
use_container_width=True, type="primary",
|
||||
):
|
||||
if target_stage == "phone_screen" and stage == "applied":
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage="phone_screen")
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
elif target_stage:
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
|
||||
dismiss_stage_signal(DEFAULT_DB, sig["id"])
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage=target_stage)
|
||||
dismiss_stage_signal(get_db_path(), sig["id"])
|
||||
st.rerun(scope="app")
|
||||
if b2.button("Dismiss", key=f"sig_dis_{sig['id']}",
|
||||
use_container_width=True):
|
||||
dismiss_stage_signal(DEFAULT_DB, sig["id"])
|
||||
dismiss_stage_signal(get_db_path(), sig["id"])
|
||||
st.rerun()
|
||||
|
||||
# Advance / Reject buttons
|
||||
|
|
@ -326,16 +349,16 @@ def _render_card(job: dict, stage: str, compact: bool = False) -> None:
|
|||
f"→ {next_label}", key=f"adv_{job_id}",
|
||||
use_container_width=True, type="primary",
|
||||
):
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage=next_stage)
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage=next_stage)
|
||||
if next_stage == "phone_screen":
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
st.rerun(scope="app") # full rerun — card must appear in new column
|
||||
|
||||
if c2.button(
|
||||
"✗ Reject", key=f"rej_{job_id}",
|
||||
use_container_width=True,
|
||||
):
|
||||
reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
|
||||
reject_at_stage(get_db_path(), job_id=job_id, rejection_stage=stage)
|
||||
st.rerun() # fragment-scope rerun — card disappears without scroll-to-top
|
||||
|
||||
if job.get("url"):
|
||||
|
|
@ -365,7 +388,7 @@ def _render_card(job: dict, stage: str, compact: bool = False) -> None:
|
|||
@st.fragment
|
||||
def _card_fragment(job_id: int, stage: str) -> None:
|
||||
"""Re-fetches the job on each fragment rerun; renders nothing if moved/rejected."""
|
||||
job = get_job_by_id(DEFAULT_DB, job_id)
|
||||
job = get_job_by_id(get_db_path(), job_id)
|
||||
if job is None or job.get("status") != stage:
|
||||
return
|
||||
_render_card(job, stage)
|
||||
|
|
@ -374,11 +397,11 @@ def _card_fragment(job_id: int, stage: str) -> None:
|
|||
@st.fragment
|
||||
def _pre_kanban_row_fragment(job_id: int) -> None:
|
||||
"""Pre-kanban compact row for applied and survey-stage jobs."""
|
||||
job = get_job_by_id(DEFAULT_DB, job_id)
|
||||
job = get_job_by_id(get_db_path(), job_id)
|
||||
if job is None or job.get("status") not in ("applied", "survey"):
|
||||
return
|
||||
stage = job["status"]
|
||||
contacts = get_contacts(DEFAULT_DB, job_id=job_id)
|
||||
contacts = get_contacts(get_db_path(), job_id=job_id)
|
||||
last_contact = contacts[-1] if contacts else None
|
||||
|
||||
with st.container(border=True):
|
||||
|
|
@ -394,7 +417,7 @@ def _pre_kanban_row_fragment(job_id: int) -> None:
|
|||
_email_modal(job)
|
||||
|
||||
# Stage signal hint (email-detected next steps)
|
||||
signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
|
||||
signals = get_unread_stage_signals(get_db_path(), job_id=job_id)
|
||||
if signals:
|
||||
sig = signals[-1]
|
||||
_SIGNAL_TO_STAGE = {
|
||||
|
|
@ -417,15 +440,15 @@ def _pre_kanban_row_fragment(job_id: int) -> None:
|
|||
use_container_width=True, type="primary",
|
||||
):
|
||||
if target_stage == "phone_screen":
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage="phone_screen")
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
else:
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
|
||||
dismiss_stage_signal(DEFAULT_DB, sig["id"])
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage=target_stage)
|
||||
dismiss_stage_signal(get_db_path(), sig["id"])
|
||||
st.rerun(scope="app")
|
||||
if s2.button("Dismiss", key=f"sig_dis_pre_{sig['id']}",
|
||||
use_container_width=True):
|
||||
dismiss_stage_signal(DEFAULT_DB, sig["id"])
|
||||
dismiss_stage_signal(get_db_path(), sig["id"])
|
||||
st.rerun()
|
||||
|
||||
with right:
|
||||
|
|
@ -433,24 +456,24 @@ def _pre_kanban_row_fragment(job_id: int) -> None:
|
|||
"→ 📞 Phone Screen", key=f"adv_pre_{job_id}",
|
||||
use_container_width=True, type="primary",
|
||||
):
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
|
||||
submit_task(DEFAULT_DB, "company_research", job_id)
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage="phone_screen")
|
||||
submit_task(get_db_path(), "company_research", job_id)
|
||||
st.rerun(scope="app")
|
||||
col_a, col_b = st.columns(2)
|
||||
if stage == "applied" and col_a.button(
|
||||
"📋 Survey", key=f"to_survey_{job_id}", use_container_width=True,
|
||||
):
|
||||
advance_to_stage(DEFAULT_DB, job_id=job_id, stage="survey")
|
||||
advance_to_stage(get_db_path(), job_id=job_id, stage="survey")
|
||||
st.rerun(scope="app")
|
||||
if col_b.button("✗ Reject", key=f"rej_pre_{job_id}", use_container_width=True):
|
||||
reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
|
||||
reject_at_stage(get_db_path(), job_id=job_id, rejection_stage=stage)
|
||||
st.rerun()
|
||||
|
||||
|
||||
@st.fragment
|
||||
def _hired_card_fragment(job_id: int) -> None:
|
||||
"""Compact hired job card — shown in the Offer/Hired column."""
|
||||
job = get_job_by_id(DEFAULT_DB, job_id)
|
||||
job = get_job_by_id(get_db_path(), job_id)
|
||||
if job is None or job.get("status") != "hired":
|
||||
return
|
||||
with st.container(border=True):
|
||||
|
|
|
|||
|
|
@ -25,11 +25,14 @@ from scripts.db import (
|
|||
get_task_for_job,
|
||||
)
|
||||
from scripts.task_runner import submit_task
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
resolve_session("peregrine")
|
||||
|
||||
init_db(get_db_path())
|
||||
|
||||
# ── Job selection ─────────────────────────────────────────────────────────────
|
||||
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||
jobs_by_stage = get_interview_jobs(get_db_path())
|
||||
active_stages = ["phone_screen", "interviewing", "offer"]
|
||||
active_jobs = [
|
||||
j for stage in active_stages
|
||||
|
|
@ -100,10 +103,10 @@ col_prep, col_context = st.columns([2, 3])
|
|||
# ════════════════════════════════════════════════
|
||||
with col_prep:
|
||||
|
||||
research = get_research(DEFAULT_DB, job_id=selected_id)
|
||||
research = get_research(get_db_path(), job_id=selected_id)
|
||||
|
||||
# Refresh / generate research
|
||||
_res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
_res_task = get_task_for_job(get_db_path(), "company_research", selected_id)
|
||||
_res_running = _res_task and _res_task["status"] in ("queued", "running")
|
||||
|
||||
if not research:
|
||||
|
|
@ -112,13 +115,13 @@ with col_prep:
|
|||
if _res_task and _res_task["status"] == "failed":
|
||||
st.error(f"Last attempt failed: {_res_task.get('error', '')}")
|
||||
if st.button("🔬 Generate research brief", type="primary", use_container_width=True):
|
||||
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||
submit_task(get_db_path(), "company_research", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _res_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _res_status_initial():
|
||||
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
t = get_task_for_job(get_db_path(), "company_research", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
stage = t.get("stage") or ""
|
||||
lbl = "Queued…" if t["status"] == "queued" else (stage or "Generating… this may take 30–60 seconds")
|
||||
|
|
@ -133,13 +136,13 @@ with col_prep:
|
|||
col_ts, col_btn = st.columns([3, 1])
|
||||
col_ts.caption(f"Research generated: {generated_at}")
|
||||
if col_btn.button("🔄 Refresh", use_container_width=True, disabled=bool(_res_running)):
|
||||
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||
submit_task(get_db_path(), "company_research", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _res_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _res_status_refresh():
|
||||
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
t = get_task_for_job(get_db_path(), "company_research", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
stage = t.get("stage") or ""
|
||||
lbl = "Queued…" if t["status"] == "queued" else (stage or "Refreshing research…")
|
||||
|
|
@ -224,7 +227,11 @@ with col_prep:
|
|||
st.markdown(msg["content"])
|
||||
|
||||
# Initial question if session is empty
|
||||
if not st.session_state[qa_key]:
|
||||
import os as _os
|
||||
_is_demo = _os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
||||
if not st.session_state[qa_key] and _is_demo:
|
||||
st.info("AI features are disabled in the public demo. Run your own instance to use Practice Q&A.")
|
||||
elif not st.session_state[qa_key]:
|
||||
with st.spinner("Setting up your mock interview…"):
|
||||
try:
|
||||
from scripts.llm_router import complete
|
||||
|
|
@ -307,7 +314,7 @@ with col_context:
|
|||
st.markdown(job.get("description") or "_No description saved for this listing._")
|
||||
|
||||
with tab_emails:
|
||||
contacts = get_contacts(DEFAULT_DB, job_id=selected_id)
|
||||
contacts = get_contacts(get_db_path(), job_id=selected_id)
|
||||
if not contacts:
|
||||
st.info("No contacts logged yet. Use the Interviews page to log emails.")
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -22,10 +22,13 @@ from scripts.db import (
|
|||
insert_survey_response, get_survey_responses,
|
||||
)
|
||||
from scripts.llm_router import LLMRouter
|
||||
from app.cloud_session import resolve_session, get_db_path
|
||||
|
||||
resolve_session("peregrine")
|
||||
|
||||
st.title("📋 Survey Assistant")
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
init_db(get_db_path())
|
||||
|
||||
|
||||
# ── Vision service health check ────────────────────────────────────────────────
|
||||
|
|
@ -40,7 +43,7 @@ def _vision_available() -> bool:
|
|||
vision_up = _vision_available()
|
||||
|
||||
# ── Job selector ───────────────────────────────────────────────────────────────
|
||||
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||
jobs_by_stage = get_interview_jobs(get_db_path())
|
||||
survey_jobs = jobs_by_stage.get("survey", [])
|
||||
other_jobs = (
|
||||
jobs_by_stage.get("applied", []) +
|
||||
|
|
@ -61,7 +64,7 @@ selected_job_id = st.selectbox(
|
|||
format_func=lambda jid: job_labels[jid],
|
||||
index=0,
|
||||
)
|
||||
selected_job = get_job_by_id(DEFAULT_DB, selected_job_id)
|
||||
selected_job = get_job_by_id(get_db_path(), selected_job_id)
|
||||
|
||||
# ── LLM prompt builders ────────────────────────────────────────────────────────
|
||||
_SURVEY_SYSTEM = (
|
||||
|
|
@ -236,7 +239,7 @@ with right_col:
|
|||
image_path = str(img_file)
|
||||
|
||||
insert_survey_response(
|
||||
DEFAULT_DB,
|
||||
get_db_path(),
|
||||
job_id=selected_job_id,
|
||||
survey_name=survey_name,
|
||||
source=source,
|
||||
|
|
@ -256,7 +259,7 @@ with right_col:
|
|||
# ── History ────────────────────────────────────────────────────────────────────
|
||||
st.divider()
|
||||
st.subheader("📂 Response History")
|
||||
history = get_survey_responses(DEFAULT_DB, job_id=selected_job_id)
|
||||
history = get_survey_responses(get_db_path(), job_id=selected_job_id)
|
||||
|
||||
if not history:
|
||||
st.caption("No saved responses for this job yet.")
|
||||
|
|
|
|||
127
app/telemetry.py
Normal file
127
app/telemetry.py
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
# peregrine/app/telemetry.py
|
||||
"""
|
||||
Usage event telemetry for cloud-hosted Peregrine.
|
||||
|
||||
In local-first mode (CLOUD_MODE unset/false), all functions are no-ops —
|
||||
no network calls, no DB writes, no imports of psycopg2.
|
||||
|
||||
In cloud mode, events are written to the platform Postgres DB ONLY after
|
||||
confirming the user's telemetry consent.
|
||||
|
||||
THE HARD RULE: if telemetry_consent.all_disabled is True for a user,
|
||||
nothing is written, no exceptions. This function is the ONLY path to
|
||||
usage_events — no feature may write there directly.
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
CLOUD_MODE: bool = os.environ.get("CLOUD_MODE", "").lower() in ("1", "true", "yes")
|
||||
PLATFORM_DB_URL: str = os.environ.get("PLATFORM_DB_URL", "")
|
||||
|
||||
_platform_conn = None
|
||||
|
||||
|
||||
def get_platform_conn():
|
||||
"""Lazy psycopg2 connection to the platform Postgres DB. Reconnects if closed."""
|
||||
global _platform_conn
|
||||
if _platform_conn is None or _platform_conn.closed:
|
||||
import psycopg2
|
||||
_platform_conn = psycopg2.connect(PLATFORM_DB_URL)
|
||||
return _platform_conn
|
||||
|
||||
|
||||
def get_consent(user_id: str) -> dict:
|
||||
"""
|
||||
Fetch telemetry consent for the user.
|
||||
Returns safe defaults if record doesn't exist yet:
|
||||
- usage_events_enabled: True (new cloud users start opted-in, per onboarding disclosure)
|
||||
- all_disabled: False
|
||||
"""
|
||||
conn = get_platform_conn()
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT all_disabled, usage_events_enabled "
|
||||
"FROM telemetry_consent WHERE user_id = %s",
|
||||
(user_id,)
|
||||
)
|
||||
row = cur.fetchone()
|
||||
if row is None:
|
||||
return {"all_disabled": False, "usage_events_enabled": True}
|
||||
return {"all_disabled": row[0], "usage_events_enabled": row[1]}
|
||||
|
||||
|
||||
def log_usage_event(
|
||||
user_id: str,
|
||||
app: str,
|
||||
event_type: str,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Write a usage event to the platform DB if consent allows.
|
||||
|
||||
Silent no-op in local mode. Silent no-op if telemetry is disabled.
|
||||
Swallows all exceptions — telemetry must never crash the app.
|
||||
|
||||
Args:
|
||||
user_id: Directus user UUID (from st.session_state["user_id"])
|
||||
app: App slug ('peregrine', 'falcon', etc.)
|
||||
event_type: Snake_case event label ('cover_letter_generated', 'job_applied', etc.)
|
||||
metadata: Optional JSON-serialisable dict — NO PII
|
||||
"""
|
||||
if not CLOUD_MODE:
|
||||
return
|
||||
|
||||
try:
|
||||
consent = get_consent(user_id)
|
||||
if consent.get("all_disabled") or not consent.get("usage_events_enabled", True):
|
||||
return
|
||||
|
||||
conn = get_platform_conn()
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"INSERT INTO usage_events (user_id, app, event_type, metadata) "
|
||||
"VALUES (%s, %s, %s, %s)",
|
||||
(user_id, app, event_type, json.dumps(metadata) if metadata else None),
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
# Telemetry must never crash the app
|
||||
pass
|
||||
|
||||
|
||||
def update_consent(user_id: str, **fields) -> None:
|
||||
"""
|
||||
UPSERT telemetry consent for a user.
|
||||
|
||||
Accepted keyword args (all optional, any subset may be provided):
|
||||
all_disabled: bool
|
||||
usage_events_enabled: bool
|
||||
content_sharing_enabled: bool
|
||||
support_access_enabled: bool
|
||||
|
||||
Safe to call in cloud mode only — no-op in local mode.
|
||||
Swallows all exceptions so the Settings UI is never broken by a DB hiccup.
|
||||
"""
|
||||
if not CLOUD_MODE:
|
||||
return
|
||||
allowed = {"all_disabled", "usage_events_enabled", "content_sharing_enabled", "support_access_enabled"}
|
||||
cols = {k: v for k, v in fields.items() if k in allowed}
|
||||
if not cols:
|
||||
return
|
||||
try:
|
||||
conn = get_platform_conn()
|
||||
col_names = ", ".join(cols)
|
||||
placeholders = ", ".join(["%s"] * len(cols))
|
||||
set_clause = ", ".join(f"{k} = EXCLUDED.{k}" for k in cols)
|
||||
col_vals = list(cols.values())
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
f"INSERT INTO telemetry_consent (user_id, {col_names}) "
|
||||
f"VALUES (%s, {placeholders}) "
|
||||
f"ON CONFLICT (user_id) DO UPDATE SET {set_clause}, updated_at = NOW()",
|
||||
[user_id] + col_vals,
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -1,70 +1,155 @@
|
|||
"""
|
||||
Tier definitions and feature gates for Peregrine.
|
||||
|
||||
Tiers: free < paid < premium
|
||||
Tiers: free < paid < premium < ultra (ultra reserved; no Peregrine features use it yet)
|
||||
FEATURES maps feature key → minimum tier required.
|
||||
Features not in FEATURES are available to all tiers (free).
|
||||
|
||||
BYOK policy
|
||||
-----------
|
||||
Features in BYOK_UNLOCKABLE are gated only because CircuitForge would otherwise
|
||||
be providing the LLM compute. When a user has any configured LLM backend (local
|
||||
ollama/vllm or their own API key), those features unlock regardless of tier.
|
||||
Pass has_byok=has_configured_llm() to can_use() at call sites.
|
||||
|
||||
Features that stay gated even with BYOK:
|
||||
- Integrations (Notion sync, calendars, etc.) — infrastructure we run
|
||||
- llm_keywords_blocklist — orchestration pipeline over background keyword data
|
||||
- email_classifier — training pipeline, not a single LLM call
|
||||
- shared_cover_writer_model — our fine-tuned model weights
|
||||
- model_fine_tuning — GPU infrastructure
|
||||
- multi_user — account infrastructure
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
TIERS = ["free", "paid", "premium"]
|
||||
import os as _os
|
||||
from pathlib import Path
|
||||
|
||||
from circuitforge_core.tiers import (
|
||||
can_use as _core_can_use,
|
||||
TIERS,
|
||||
tier_label as _core_tier_label,
|
||||
)
|
||||
|
||||
# Maps feature key → minimum tier string required.
|
||||
# Features absent from this dict are free (available to all).
|
||||
FEATURES: dict[str, str] = {
|
||||
# Wizard LLM generation
|
||||
# Wizard LLM generation — BYOK-unlockable (pure LLM calls)
|
||||
"llm_career_summary": "paid",
|
||||
"llm_expand_bullets": "paid",
|
||||
"llm_suggest_skills": "paid",
|
||||
"llm_voice_guidelines": "premium",
|
||||
"llm_job_titles": "paid",
|
||||
"llm_keywords_blocklist": "paid",
|
||||
"llm_mission_notes": "paid",
|
||||
|
||||
# App features
|
||||
# Orchestration — stays gated (background data pipeline, not just an LLM call)
|
||||
"llm_keywords_blocklist": "paid",
|
||||
|
||||
# App features — BYOK-unlockable (pure LLM calls over job/profile data)
|
||||
"company_research": "paid",
|
||||
"interview_prep": "paid",
|
||||
"email_classifier": "paid",
|
||||
"survey_assistant": "paid",
|
||||
|
||||
# Orchestration / infrastructure — stays gated
|
||||
"email_classifier": "paid",
|
||||
"model_fine_tuning": "premium",
|
||||
"shared_cover_writer_model": "paid",
|
||||
"multi_user": "premium",
|
||||
|
||||
# Integrations (paid)
|
||||
# Integrations — stays gated (infrastructure CircuitForge operates)
|
||||
"notion_sync": "paid",
|
||||
"google_sheets_sync": "paid",
|
||||
"airtable_sync": "paid",
|
||||
"google_calendar_sync": "paid",
|
||||
"apple_calendar_sync": "paid",
|
||||
"slack_notifications": "paid",
|
||||
|
||||
# Beta UI access — open to all tiers (access management, not compute)
|
||||
"vue_ui_beta": "free",
|
||||
}
|
||||
|
||||
# Features that unlock when the user supplies any LLM backend (local or BYOK).
|
||||
# These are pure LLM-call features — the only reason they're behind a tier is
|
||||
# because CircuitForge would otherwise be providing the compute.
|
||||
BYOK_UNLOCKABLE: frozenset[str] = frozenset({
|
||||
"llm_career_summary",
|
||||
"llm_expand_bullets",
|
||||
"llm_suggest_skills",
|
||||
"llm_voice_guidelines",
|
||||
"llm_job_titles",
|
||||
"llm_mission_notes",
|
||||
"company_research",
|
||||
"interview_prep",
|
||||
"survey_assistant",
|
||||
})
|
||||
|
||||
# Demo mode flag — read from environment at module load time.
|
||||
# Allows demo toolbar to override tier without accessing st.session_state (thread-safe).
|
||||
# _DEMO_MODE is immutable after import for the process lifetime.
|
||||
# DEMO_MODE must be set in the environment before the process starts (e.g., via
|
||||
# Docker Compose environment:). Runtime toggling is not supported.
|
||||
_DEMO_MODE = _os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
|
||||
|
||||
# Free integrations (not in FEATURES):
|
||||
# google_drive_sync, dropbox_sync, onedrive_sync, mega_sync,
|
||||
# nextcloud_sync, discord_notifications, home_assistant
|
||||
|
||||
_LLM_CFG = Path(__file__).parent.parent.parent / "config" / "llm.yaml"
|
||||
|
||||
def can_use(tier: str, feature: str) -> bool:
|
||||
|
||||
def has_configured_llm(config_path: Path | None = None) -> bool:
|
||||
"""Return True if at least one non-vision LLM backend is enabled in llm.yaml.
|
||||
|
||||
Local backends (ollama, vllm) count — the policy is "you're providing the
|
||||
compute", whether that's your own hardware or your own API key.
|
||||
"""
|
||||
import yaml
|
||||
path = config_path or _LLM_CFG
|
||||
try:
|
||||
with open(path) as f:
|
||||
cfg = yaml.safe_load(f) or {}
|
||||
return any(
|
||||
b.get("enabled", True) and b.get("type") != "vision_service"
|
||||
for b in cfg.get("backends", {}).values()
|
||||
)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def can_use(
|
||||
tier: str,
|
||||
feature: str,
|
||||
has_byok: bool = False,
|
||||
*,
|
||||
demo_tier: str | None = None,
|
||||
) -> bool:
|
||||
"""Return True if the given tier has access to the feature.
|
||||
|
||||
has_byok: pass has_configured_llm() to unlock BYOK_UNLOCKABLE features
|
||||
for users who supply their own LLM backend regardless of tier.
|
||||
|
||||
demo_tier: when set AND _DEMO_MODE is True, substitutes for `tier`.
|
||||
Read from st.session_state by the *caller*, not here — keeps
|
||||
this function thread-safe for background tasks and tests.
|
||||
|
||||
Returns True for unknown features (not gated).
|
||||
Returns False for unknown/invalid tier strings.
|
||||
"""
|
||||
required = FEATURES.get(feature)
|
||||
if required is None:
|
||||
return True # not gated — available to all
|
||||
try:
|
||||
return TIERS.index(tier) >= TIERS.index(required)
|
||||
except ValueError:
|
||||
return False # invalid tier string
|
||||
effective_tier = demo_tier if (demo_tier is not None and _DEMO_MODE) else tier
|
||||
# Pass Peregrine's BYOK_UNLOCKABLE via has_byok collapse — core's frozenset is empty
|
||||
if has_byok and feature in BYOK_UNLOCKABLE:
|
||||
return True
|
||||
return _core_can_use(feature, effective_tier, _features=FEATURES)
|
||||
|
||||
|
||||
def tier_label(feature: str) -> str:
|
||||
"""Return a display label for a locked feature, or '' if free/unknown."""
|
||||
required = FEATURES.get(feature)
|
||||
if required is None:
|
||||
def tier_label(feature: str, has_byok: bool = False) -> str:
|
||||
"""Return a display label for a locked feature, or '' if free/unlocked."""
|
||||
if has_byok and feature in BYOK_UNLOCKABLE:
|
||||
return ""
|
||||
return "🔒 Paid" if required == "paid" else "⭐ Premium"
|
||||
raw = _core_tier_label(feature, _features=FEATURES)
|
||||
if not raw or raw == "free":
|
||||
return ""
|
||||
return "🔒 Paid" if raw == "paid" else "⭐ Premium"
|
||||
|
||||
|
||||
def effective_tier(
|
||||
|
|
|
|||
99
compose.cloud.yml
Normal file
99
compose.cloud.yml
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
# compose.cloud.yml — Multi-tenant cloud stack for menagerie.circuitforge.tech/peregrine
|
||||
#
|
||||
# Each authenticated user gets their own encrypted SQLite data tree at
|
||||
# /devl/menagerie-data/<user-id>/peregrine/
|
||||
#
|
||||
# Caddy injects the Directus session cookie as X-CF-Session header before forwarding.
|
||||
# cloud_session.py resolves user_id → per-user db_path at session init.
|
||||
#
|
||||
# Usage:
|
||||
# docker compose -f compose.cloud.yml --project-name peregrine-cloud up -d
|
||||
# docker compose -f compose.cloud.yml --project-name peregrine-cloud down
|
||||
# docker compose -f compose.cloud.yml --project-name peregrine-cloud logs app -f
|
||||
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: peregrine/Dockerfile.cfcore
|
||||
container_name: peregrine-cloud
|
||||
ports:
|
||||
- "8505:8501"
|
||||
volumes:
|
||||
- /devl/menagerie-data:/devl/menagerie-data # per-user data trees
|
||||
- ./config/llm.cloud.yaml:/app/config/llm.yaml:ro # cloud-safe backends only (no claude_code/copilot/anthropic)
|
||||
environment:
|
||||
- CLOUD_MODE=true
|
||||
- CLOUD_DATA_ROOT=/devl/menagerie-data
|
||||
- DIRECTUS_JWT_SECRET=${DIRECTUS_JWT_SECRET}
|
||||
- CF_SERVER_SECRET=${CF_SERVER_SECRET}
|
||||
- PLATFORM_DB_URL=${PLATFORM_DB_URL}
|
||||
- HEIMDALL_URL=${HEIMDALL_URL:-http://cf-license:8000}
|
||||
- HEIMDALL_ADMIN_TOKEN=${HEIMDALL_ADMIN_TOKEN}
|
||||
- STAGING_DB=/devl/menagerie-data/cloud-default.db # fallback only — never used
|
||||
- DOCS_DIR=/tmp/cloud-docs
|
||||
- STREAMLIT_SERVER_BASE_URL_PATH=peregrine
|
||||
- PYTHONUNBUFFERED=1
|
||||
- PEREGRINE_CADDY_PROXY=1
|
||||
- CF_ORCH_URL=http://host.docker.internal:7700
|
||||
- DEMO_MODE=false
|
||||
- FORGEJO_API_TOKEN=${FORGEJO_API_TOKEN:-}
|
||||
depends_on:
|
||||
searxng:
|
||||
condition: service_healthy
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: unless-stopped
|
||||
|
||||
api:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: peregrine/Dockerfile.cfcore
|
||||
command: >
|
||||
bash -c "uvicorn dev_api:app --host 0.0.0.0 --port 8601"
|
||||
volumes:
|
||||
- /devl/menagerie-data:/devl/menagerie-data
|
||||
- ./config/llm.cloud.yaml:/app/config/llm.yaml:ro
|
||||
environment:
|
||||
- CLOUD_MODE=true
|
||||
- CLOUD_DATA_ROOT=/devl/menagerie-data
|
||||
- STAGING_DB=/devl/menagerie-data/cloud-default.db
|
||||
- DIRECTUS_JWT_SECRET=${DIRECTUS_JWT_SECRET}
|
||||
- CF_SERVER_SECRET=${CF_SERVER_SECRET}
|
||||
- PLATFORM_DB_URL=${PLATFORM_DB_URL}
|
||||
- HEIMDALL_URL=${HEIMDALL_URL:-http://cf-license:8000}
|
||||
- HEIMDALL_ADMIN_TOKEN=${HEIMDALL_ADMIN_TOKEN}
|
||||
- PYTHONUNBUFFERED=1
|
||||
- FORGEJO_API_TOKEN=${FORGEJO_API_TOKEN:-}
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: unless-stopped
|
||||
|
||||
web:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/web/Dockerfile
|
||||
args:
|
||||
VITE_BASE_PATH: /peregrine/
|
||||
ports:
|
||||
- "8508:80"
|
||||
depends_on:
|
||||
- api
|
||||
restart: unless-stopped
|
||||
|
||||
searxng:
|
||||
image: searxng/searxng:latest
|
||||
volumes:
|
||||
- ./docker/searxng:/etc/searxng:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
restart: unless-stopped
|
||||
# No host port — internal only
|
||||
|
||||
networks:
|
||||
default:
|
||||
external: true
|
||||
name: caddy-proxy_caddy-internal
|
||||
62
compose.demo.yml
Normal file
62
compose.demo.yml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# compose.demo.yml — Public demo stack for demo.circuitforge.tech/peregrine
|
||||
#
|
||||
# Runs a fully isolated, neutered Peregrine instance:
|
||||
# - DEMO_MODE=true: blocks all LLM inference in llm_router.py
|
||||
# - demo/config/: pre-seeded demo user profile, all backends disabled
|
||||
# - demo/data/: isolated SQLite DB (no personal job data)
|
||||
# - No personal documents mounted
|
||||
# - Port 8504 (separate from the personal instance on 8502)
|
||||
#
|
||||
# Usage:
|
||||
# docker compose -f compose.demo.yml --project-name peregrine-demo up -d
|
||||
# docker compose -f compose.demo.yml --project-name peregrine-demo down
|
||||
#
|
||||
# Caddy demo.circuitforge.tech/peregrine* → host port 8504
|
||||
|
||||
services:
|
||||
|
||||
app:
|
||||
build: .
|
||||
ports:
|
||||
- "8504:8501"
|
||||
volumes:
|
||||
- ./demo/config:/app/config
|
||||
- ./demo/data:/app/data
|
||||
# No /docs mount — demo has no personal documents
|
||||
environment:
|
||||
- DEMO_MODE=true
|
||||
- STAGING_DB=/app/data/staging.db
|
||||
- DOCS_DIR=/tmp/demo-docs
|
||||
- STREAMLIT_SERVER_BASE_URL_PATH=peregrine
|
||||
- PYTHONUNBUFFERED=1
|
||||
- PYTHONLOGGING=WARNING
|
||||
# No API keys — inference is blocked by DEMO_MODE before any key is needed
|
||||
depends_on:
|
||||
searxng:
|
||||
condition: service_healthy
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: unless-stopped
|
||||
|
||||
web:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/web/Dockerfile
|
||||
args:
|
||||
VITE_BASE_PATH: /peregrine/
|
||||
ports:
|
||||
- "8507:80"
|
||||
restart: unless-stopped
|
||||
|
||||
searxng:
|
||||
image: searxng/searxng:latest
|
||||
volumes:
|
||||
- ./docker/searxng:/etc/searxng:ro
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
restart: unless-stopped
|
||||
# No host port published — internal only; demo app uses it for job description enrichment
|
||||
# (non-AI scraping is allowed; only LLM inference is blocked)
|
||||
|
|
@ -18,6 +18,15 @@ services:
|
|||
device_ids: ["0"]
|
||||
capabilities: [gpu]
|
||||
|
||||
ollama_research:
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
device_ids: ["1"]
|
||||
capabilities: [gpu]
|
||||
|
||||
vision:
|
||||
deploy:
|
||||
resources:
|
||||
|
|
|
|||
|
|
@ -18,6 +18,14 @@ services:
|
|||
reservations:
|
||||
devices: []
|
||||
|
||||
ollama_research:
|
||||
devices:
|
||||
- nvidia.com/gpu=1
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices: []
|
||||
|
||||
vision:
|
||||
devices:
|
||||
- nvidia.com/gpu=0
|
||||
|
|
|
|||
35
compose.test-cfcore.yml
Normal file
35
compose.test-cfcore.yml
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# compose.test-cfcore.yml — single-user test instance for circuitforge-core integration
|
||||
#
|
||||
# Run from the PARENT directory of peregrine/ (the build context must include
|
||||
# both peregrine/ and circuitforge-core/ as siblings):
|
||||
#
|
||||
# cd /devl (or /Library/Development/CircuitForge on dev)
|
||||
# docker compose -f peregrine/compose.test-cfcore.yml --project-name peregrine-test up -d
|
||||
# docker compose -f peregrine/compose.test-cfcore.yml --project-name peregrine-test logs -f
|
||||
# docker compose -f peregrine/compose.test-cfcore.yml --project-name peregrine-test down
|
||||
#
|
||||
# UI: http://localhost:8516
|
||||
# Purpose: smoke-test circuitforge-core shims (db, llm_router, tiers, task_scheduler)
|
||||
# before promoting cfcore integration to the production cloud instance.
|
||||
|
||||
services:
|
||||
app:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: peregrine/Dockerfile.cfcore
|
||||
container_name: peregrine-test-cfcore
|
||||
ports:
|
||||
- "8516:8501"
|
||||
volumes:
|
||||
- /devl/job-seeker:/devl/job-seeker
|
||||
- /devl/job-seeker/config:/app/config
|
||||
- /devl/job-seeker/config/llm.docker.yaml:/app/config/llm.yaml:ro
|
||||
- /devl/job-seeker/config/user.docker.yaml:/app/config/user.yaml:ro
|
||||
environment:
|
||||
- STAGING_DB=/devl/job-seeker/staging.db
|
||||
- PYTHONUNBUFFERED=1
|
||||
- STREAMLIT_SERVER_BASE_URL_PATH=
|
||||
- CF_ORCH_URL=http://host.docker.internal:7700
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: "no"
|
||||
97
compose.yml
97
compose.yml
|
|
@ -1,11 +1,54 @@
|
|||
# compose.yml — Peregrine by Circuit Forge LLC
|
||||
# Profiles: remote | cpu | single-gpu | dual-gpu
|
||||
# Profiles: remote | cpu | single-gpu | dual-gpu-ollama
|
||||
services:
|
||||
|
||||
app:
|
||||
build: .
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: peregrine/Dockerfile.cfcore
|
||||
command: >
|
||||
bash -c "streamlit run app/app.py
|
||||
--server.port=8501
|
||||
--server.headless=true
|
||||
--server.fileWatcherType=none
|
||||
2>&1 | tee /app/data/.streamlit.log"
|
||||
ports:
|
||||
- "${STREAMLIT_PORT:-8501}:8501"
|
||||
volumes:
|
||||
- ./config:/app/config
|
||||
- ./data:/app/data
|
||||
- ${DOCS_DIR:-~/Documents/JobSearch}:/docs
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /usr/bin/docker:/usr/bin/docker:ro
|
||||
environment:
|
||||
- STAGING_DB=/app/data/staging.db
|
||||
- DOCS_DIR=/docs
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||
- OPENAI_COMPAT_URL=${OPENAI_COMPAT_URL:-}
|
||||
- OPENAI_COMPAT_KEY=${OPENAI_COMPAT_KEY:-}
|
||||
- PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
|
||||
- PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
|
||||
- RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
|
||||
- STREAMLIT_SERVER_BASE_URL_PATH=${STREAMLIT_BASE_URL_PATH:-}
|
||||
- FORGEJO_API_TOKEN=${FORGEJO_API_TOKEN:-}
|
||||
- FORGEJO_REPO=${FORGEJO_REPO:-}
|
||||
- FORGEJO_API_URL=${FORGEJO_API_URL:-}
|
||||
- PYTHONUNBUFFERED=1
|
||||
- PYTHONLOGGING=WARNING
|
||||
- PEREGRINE_CADDY_PROXY=1
|
||||
depends_on:
|
||||
searxng:
|
||||
condition: service_healthy
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: unless-stopped
|
||||
|
||||
api:
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: peregrine/Dockerfile.cfcore
|
||||
command: >
|
||||
bash -c "uvicorn dev_api:app --host 0.0.0.0 --port 8601"
|
||||
volumes:
|
||||
- ./config:/app/config
|
||||
- ./data:/app/data
|
||||
|
|
@ -18,16 +61,21 @@ services:
|
|||
- OPENAI_COMPAT_KEY=${OPENAI_COMPAT_KEY:-}
|
||||
- PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
|
||||
- PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
|
||||
- RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
|
||||
- PYTHONUNBUFFERED=1
|
||||
- PYTHONLOGGING=WARNING
|
||||
depends_on:
|
||||
searxng:
|
||||
condition: service_healthy
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
restart: unless-stopped
|
||||
|
||||
web:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/web/Dockerfile
|
||||
ports:
|
||||
- "${VUE_PORT:-8506}:80"
|
||||
depends_on:
|
||||
- api
|
||||
restart: unless-stopped
|
||||
|
||||
searxng:
|
||||
image: searxng/searxng:latest
|
||||
ports:
|
||||
|
|
@ -52,7 +100,21 @@ services:
|
|||
- OLLAMA_MODELS=/root/.ollama
|
||||
- DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b}
|
||||
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||
profiles: [cpu, single-gpu, dual-gpu]
|
||||
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||
restart: unless-stopped
|
||||
|
||||
ollama_research:
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
|
||||
volumes:
|
||||
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
|
||||
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
|
||||
environment:
|
||||
- OLLAMA_MODELS=/root/.ollama
|
||||
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
|
||||
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||
profiles: [dual-gpu-ollama, dual-gpu-mixed]
|
||||
restart: unless-stopped
|
||||
|
||||
vision:
|
||||
|
|
@ -64,24 +126,7 @@ services:
|
|||
environment:
|
||||
- VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2}
|
||||
- VISION_REVISION=${VISION_REVISION:-2025-01-09}
|
||||
profiles: [single-gpu, dual-gpu]
|
||||
restart: unless-stopped
|
||||
|
||||
vllm:
|
||||
image: vllm/vllm-openai:latest
|
||||
ports:
|
||||
- "${VLLM_PORT:-8000}:8000"
|
||||
volumes:
|
||||
- ${VLLM_MODELS_DIR:-~/models/vllm}:/models
|
||||
command: >
|
||||
--model /models/${VLLM_MODEL:-Ouro-1.4B}
|
||||
--trust-remote-code
|
||||
--max-model-len 4096
|
||||
--gpu-memory-utilization 0.75
|
||||
--enforce-eager
|
||||
--max-num-seqs 8
|
||||
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
|
||||
profiles: [dual-gpu]
|
||||
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||
restart: unless-stopped
|
||||
|
||||
finetune:
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@
|
|||
|
||||
# Company name blocklist — partial case-insensitive match on the company field.
|
||||
# e.g. "Amazon" blocks any listing where company contains "amazon".
|
||||
companies: []
|
||||
companies:
|
||||
- jobgether
|
||||
|
||||
# Industry/content blocklist — blocked if company name OR job description contains any keyword.
|
||||
# Use this for industries you will never work in regardless of company.
|
||||
|
|
|
|||
62
config/llm.cloud.yaml
Normal file
62
config/llm.cloud.yaml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
backends:
|
||||
anthropic:
|
||||
api_key_env: ANTHROPIC_API_KEY
|
||||
enabled: false
|
||||
model: claude-sonnet-4-6
|
||||
supports_images: true
|
||||
type: anthropic
|
||||
claude_code:
|
||||
api_key: any
|
||||
base_url: http://localhost:3009/v1
|
||||
enabled: false
|
||||
model: claude-code-terminal
|
||||
supports_images: true
|
||||
type: openai_compat
|
||||
github_copilot:
|
||||
api_key: any
|
||||
base_url: http://localhost:3010/v1
|
||||
enabled: false
|
||||
model: gpt-4o
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
ollama:
|
||||
api_key: ollama
|
||||
base_url: http://host.docker.internal:11434/v1
|
||||
enabled: true
|
||||
model: llama3.1:8b # generic — no personal fine-tunes in cloud
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
ollama_research:
|
||||
api_key: ollama
|
||||
base_url: http://host.docker.internal:11434/v1
|
||||
enabled: true
|
||||
model: llama3.1:8b
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
vision_service:
|
||||
base_url: http://host.docker.internal:8002
|
||||
enabled: true
|
||||
supports_images: true
|
||||
type: vision_service
|
||||
vllm:
|
||||
api_key: ''
|
||||
base_url: http://host.docker.internal:8000/v1
|
||||
enabled: true
|
||||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
vllm_research:
|
||||
api_key: ''
|
||||
base_url: http://host.docker.internal:8000/v1
|
||||
enabled: true
|
||||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
fallback_order:
|
||||
- vllm
|
||||
- ollama
|
||||
research_fallback_order:
|
||||
- vllm_research
|
||||
- ollama_research
|
||||
vision_fallback_order:
|
||||
- vision_service
|
||||
|
|
@ -28,9 +28,9 @@ backends:
|
|||
type: openai_compat
|
||||
ollama_research:
|
||||
api_key: ollama
|
||||
base_url: http://host.docker.internal:11434/v1
|
||||
base_url: http://ollama_research:11434/v1
|
||||
enabled: true
|
||||
model: llama3.2:3b
|
||||
model: llama3.1:8b
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
vision_service:
|
||||
|
|
@ -45,6 +45,18 @@ backends:
|
|||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
cf_orch:
|
||||
service: vllm
|
||||
model_candidates:
|
||||
- Qwen2.5-3B-Instruct
|
||||
ttl_s: 300
|
||||
vllm_research:
|
||||
api_key: ''
|
||||
base_url: http://host.docker.internal:8000/v1
|
||||
enabled: true
|
||||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
fallback_order:
|
||||
- ollama
|
||||
- claude_code
|
||||
|
|
@ -53,7 +65,7 @@ fallback_order:
|
|||
- anthropic
|
||||
research_fallback_order:
|
||||
- claude_code
|
||||
- vllm
|
||||
- vllm_research
|
||||
- ollama_research
|
||||
- github_copilot
|
||||
- anthropic
|
||||
|
|
|
|||
|
|
@ -61,6 +61,17 @@ vision_fallback_order:
|
|||
- vision_service
|
||||
- claude_code
|
||||
- anthropic
|
||||
# Note: 'ollama' (meghan-cover-writer) intentionally excluded — research
|
||||
# Note: 'ollama' (alex-cover-writer) intentionally excluded — research
|
||||
# must never use the fine-tuned writer model, and this also avoids evicting
|
||||
# the writer from GPU memory while a cover letter task is in flight.
|
||||
|
||||
# ── Scheduler — LLM batch queue optimizer ─────────────────────────────────────
|
||||
# The scheduler batches LLM tasks by model type to avoid GPU model switching.
|
||||
# VRAM budgets are conservative peak estimates (GB) for each task type.
|
||||
# Increase if your models are larger; decrease if tasks share GPU memory well.
|
||||
scheduler:
|
||||
vram_budgets:
|
||||
cover_letter: 2.5 # alex-cover-writer:latest (~2GB GGUF + headroom)
|
||||
company_research: 5.0 # llama3.1:8b or vllm model
|
||||
wizard_generate: 2.5 # same model family as cover_letter
|
||||
max_queue_depth: 500 # max pending tasks per type before drops (with logged warning)
|
||||
|
|
|
|||
14
config/server.yaml.example
Normal file
14
config/server.yaml.example
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
# config/server.yaml — Peregrine deployment / server settings
|
||||
# Copy to config/server.yaml and edit. Gitignored — do not commit.
|
||||
# Changes require restarting Peregrine to take effect (./manage.sh restart).
|
||||
|
||||
# base_url_path: URL prefix when serving Peregrine behind a reverse proxy.
|
||||
# Leave empty ("") for direct access (http://localhost:8502).
|
||||
# Set to "peregrine" when proxied at https://example.com/peregrine.
|
||||
# Maps to STREAMLIT_BASE_URL_PATH in .env → STREAMLIT_SERVER_BASE_URL_PATH
|
||||
# in the container. See: https://docs.streamlit.io/develop/api-reference/configuration/config.toml#server
|
||||
base_url_path: ""
|
||||
|
||||
# server_port: Port Streamlit listens on inside the container (usually 8501).
|
||||
# The external/host port is set via STREAMLIT_PORT in .env.
|
||||
server_port: 8501
|
||||
|
|
@ -20,6 +20,14 @@ mission_preferences:
|
|||
music: "" # e.g. "I've played in bands for 15 years and care deeply about how artists get paid"
|
||||
animal_welfare: "" # e.g. "I volunteer at my local shelter every weekend"
|
||||
education: "" # e.g. "I tutored underserved kids for 3 years and care deeply about literacy"
|
||||
social_impact: "" # e.g. "I want my work to reach people who need help most"
|
||||
health: "" # e.g. "I care about people navigating rare or poorly-understood health conditions"
|
||||
# Note: if left empty, Para 3 defaults to focusing on the people the company
|
||||
# serves — not the industry. Fill in for a more personal connection.
|
||||
|
||||
# Optional: how you write and communicate. Used to shape cover letter voice.
|
||||
# e.g. "Warm and direct. Cares about people first. Finds rare and complex situations fascinating."
|
||||
candidate_voice: ""
|
||||
|
||||
# Set to true to include optional identity-related sections in research briefs.
|
||||
# Both are for your personal decision-making only — never included in applications.
|
||||
|
|
@ -35,6 +43,7 @@ dev_tier_override: null # overrides tier locally (for testing only)
|
|||
wizard_complete: false
|
||||
wizard_step: 0
|
||||
dismissed_banners: []
|
||||
ui_preference: streamlit # UI preference — "streamlit" (default) or "vue" (Beta: Paid tier)
|
||||
|
||||
docs_dir: "~/Documents/JobSearch"
|
||||
ollama_models_dir: "~/models/ollama"
|
||||
|
|
|
|||
8
data/email_score.jsonl.example
Normal file
8
data/email_score.jsonl.example
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
{"subject": "Interview Invitation — Senior Engineer", "body": "Hi Alex, we'd love to schedule a 30-min phone screen. Are you available Thursday at 2pm? Please reply to confirm.", "label": "interview_scheduled"}
|
||||
{"subject": "Your application to Acme Corp", "body": "Thank you for your interest in the Senior Engineer role. After careful consideration, we have decided to move forward with other candidates whose experience more closely matches our current needs.", "label": "rejected"}
|
||||
{"subject": "Offer Letter — Product Manager at Initech", "body": "Dear Alex, we are thrilled to extend an offer of employment for the Product Manager position. Please find the attached offer letter outlining compensation and start date.", "label": "offer_received"}
|
||||
{"subject": "Quick question about your background", "body": "Hi Alex, I came across your profile and would love to connect. We have a few roles that seem like a great match. Would you be open to a brief chat this week?", "label": "positive_response"}
|
||||
{"subject": "Company Culture Survey — Acme Corp", "body": "Alex, as part of our evaluation process, we invite all candidates to complete our culture fit assessment. The survey takes approximately 15 minutes. Please click the link below.", "label": "survey_received"}
|
||||
{"subject": "Application Received — DataCo", "body": "Thank you for submitting your application for the Data Engineer role at DataCo. We have received your materials and will be in touch if your qualifications match our needs.", "label": "neutral"}
|
||||
{"subject": "Following up on your application", "body": "Hi Alex, I wanted to follow up on your recent application. Your background looks interesting and we'd like to learn more. Can we set up a quick call?", "label": "positive_response"}
|
||||
{"subject": "We're moving forward with other candidates", "body": "Dear Alex, thank you for taking the time to interview with us. After thoughtful consideration, we have decided not to move forward with your candidacy at this time.", "label": "rejected"}
|
||||
68
demo/config/llm.yaml
Normal file
68
demo/config/llm.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# Demo LLM config — all backends disabled.
|
||||
# DEMO_MODE=true in the environment blocks the router before any backend is tried,
|
||||
# so these values are never actually used. Kept for schema completeness.
|
||||
backends:
|
||||
anthropic:
|
||||
api_key_env: ANTHROPIC_API_KEY
|
||||
enabled: false
|
||||
model: claude-sonnet-4-6
|
||||
supports_images: true
|
||||
type: anthropic
|
||||
claude_code:
|
||||
api_key: any
|
||||
base_url: http://localhost:3009/v1
|
||||
enabled: false
|
||||
model: claude-code-terminal
|
||||
supports_images: true
|
||||
type: openai_compat
|
||||
github_copilot:
|
||||
api_key: any
|
||||
base_url: http://localhost:3010/v1
|
||||
enabled: false
|
||||
model: gpt-4o
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
ollama:
|
||||
api_key: ollama
|
||||
base_url: http://localhost:11434/v1
|
||||
enabled: false
|
||||
model: llama3.2:3b
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
ollama_research:
|
||||
api_key: ollama
|
||||
base_url: http://localhost:11434/v1
|
||||
enabled: false
|
||||
model: llama3.2:3b
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
vision_service:
|
||||
base_url: http://localhost:8002
|
||||
enabled: false
|
||||
supports_images: true
|
||||
type: vision_service
|
||||
vllm:
|
||||
api_key: ''
|
||||
base_url: http://localhost:8000/v1
|
||||
enabled: false
|
||||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
vllm_research:
|
||||
api_key: ''
|
||||
base_url: http://localhost:8000/v1
|
||||
enabled: false
|
||||
model: __auto__
|
||||
supports_images: false
|
||||
type: openai_compat
|
||||
fallback_order:
|
||||
- ollama
|
||||
- vllm
|
||||
- anthropic
|
||||
research_fallback_order:
|
||||
- vllm_research
|
||||
- ollama_research
|
||||
- anthropic
|
||||
vision_fallback_order:
|
||||
- vision_service
|
||||
- anthropic
|
||||
45
demo/config/user.yaml
Normal file
45
demo/config/user.yaml
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
candidate_accessibility_focus: false
|
||||
candidate_lgbtq_focus: false
|
||||
candidate_voice: Clear, direct, and human. Focuses on impact over jargon.
|
||||
career_summary: 'Experienced software engineer with a background in full-stack development,
|
||||
cloud infrastructure, and data pipelines. Passionate about building tools that help
|
||||
people navigate complex systems.
|
||||
|
||||
'
|
||||
dev_tier_override: null
|
||||
dismissed_banners:
|
||||
- connect_cloud
|
||||
- setup_email
|
||||
docs_dir: /docs
|
||||
email: demo@circuitforge.tech
|
||||
inference_profile: remote
|
||||
linkedin: ''
|
||||
mission_preferences:
|
||||
animal_welfare: ''
|
||||
education: ''
|
||||
health: ''
|
||||
music: ''
|
||||
social_impact: Want my work to reach people who need it most.
|
||||
name: Demo User
|
||||
nda_companies: []
|
||||
ollama_models_dir: /root/models/ollama
|
||||
phone: ''
|
||||
services:
|
||||
ollama_host: localhost
|
||||
ollama_port: 11434
|
||||
ollama_ssl: false
|
||||
ollama_ssl_verify: true
|
||||
searxng_host: searxng
|
||||
searxng_port: 8080
|
||||
searxng_ssl: false
|
||||
searxng_ssl_verify: true
|
||||
streamlit_port: 8501
|
||||
vllm_host: localhost
|
||||
vllm_port: 8000
|
||||
vllm_ssl: false
|
||||
vllm_ssl_verify: true
|
||||
tier: free
|
||||
ui_preference: streamlit
|
||||
vllm_models_dir: /root/models/vllm
|
||||
wizard_complete: true
|
||||
wizard_step: 0
|
||||
0
demo/data/.gitkeep
Normal file
0
demo/data/.gitkeep
Normal file
2793
dev-api.py
Normal file
2793
dev-api.py
Normal file
File diff suppressed because it is too large
Load diff
1
dev_api.py
Symbolic link
1
dev_api.py
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
dev-api.py
|
||||
15
docker/web/Dockerfile
Normal file
15
docker/web/Dockerfile
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
# Stage 1: build
|
||||
FROM node:20-alpine AS build
|
||||
WORKDIR /app
|
||||
COPY web/package*.json ./
|
||||
RUN npm ci --prefer-offline
|
||||
COPY web/ ./
|
||||
ARG VITE_BASE_PATH=/
|
||||
ENV VITE_BASE_PATH=${VITE_BASE_PATH}
|
||||
RUN npm run build
|
||||
|
||||
# Stage 2: serve
|
||||
FROM nginx:alpine
|
||||
COPY docker/web/nginx.conf /etc/nginx/conf.d/default.conf
|
||||
COPY --from=build /app/dist /usr/share/nginx/html
|
||||
EXPOSE 80
|
||||
29
docker/web/nginx.conf
Normal file
29
docker/web/nginx.conf
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
client_max_body_size 20m;
|
||||
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
|
||||
# Proxy API calls to the FastAPI backend service
|
||||
location /api/ {
|
||||
proxy_pass http://api:8601;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_read_timeout 120s;
|
||||
}
|
||||
|
||||
# Cache static assets
|
||||
location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff2?)$ {
|
||||
expires 1y;
|
||||
add_header Cache-Control "public, immutable";
|
||||
}
|
||||
|
||||
# SPA fallback — must come after API and assets
|
||||
location / {
|
||||
try_files $uri $uri/ /index.html;
|
||||
}
|
||||
}
|
||||
0
docs/.gitkeep
Normal file
0
docs/.gitkeep
Normal file
101
docs/backlog.md
101
docs/backlog.md
|
|
@ -2,6 +2,82 @@
|
|||
|
||||
Unscheduled ideas and deferred features. Roughly grouped by area.
|
||||
|
||||
See also: `circuitforge-plans/shared/2026-03-07-launch-checklist.md` for pre-launch blockers
|
||||
(legal docs, Stripe live keys, website deployment, demo DB ownership fix).
|
||||
|
||||
---
|
||||
|
||||
## Launch Blockers (tracked in shared launch checklist)
|
||||
|
||||
- **ToS + Refund Policy** — required before live Stripe charges. Files go in `website/content/legal/`.
|
||||
- **Stripe live key rotation** — swap test keys to live in `website/.env` (zero code changes).
|
||||
- **Website deployment to bastion** — Caddy route for Nuxt frontend at `circuitforge.tech`.
|
||||
- **Demo DB ownership** — `demo/data/staging.db` is root-owned (Docker artifact); fix with `sudo chown alan:alan` then re-run `demo/seed_demo.py`.
|
||||
|
||||
---
|
||||
|
||||
## Post-Launch / Infrastructure
|
||||
|
||||
- **Accessibility Statement** — WCAG 2.1 conformance doc at `website/content/legal/accessibility.md`. High credibility value for ND audience.
|
||||
- **Data deletion request process** — published procedure at `website/content/legal/data-deletion.md` (GDPR/CCPA; references `privacy@circuitforge.tech`).
|
||||
- **Uptime Kuma monitors** — 6 monitors need to be added manually (website, Heimdall, demo, Directus, Forgejo, Peregrine container health).
|
||||
- **Directus admin password rotation** — change from `changeme-set-via-ui-on-first-run` before website goes public.
|
||||
|
||||
---
|
||||
|
||||
## Discovery — Community Scraper Plugin System
|
||||
|
||||
Design doc: `circuitforge-plans/peregrine/2026-03-07-community-scraper-plugin-design.md`
|
||||
|
||||
**Summary:** Add a `scripts/plugins/` directory with auto-discovery and a documented MIT-licensed
|
||||
plugin API. Separates CF-built custom scrapers (paid, BSL 1.1, in `scripts/custom_boards/`) from
|
||||
community-contributed and CF-freebie scrapers (free, MIT, in `scripts/plugins/`).
|
||||
|
||||
**Implementation tasks:**
|
||||
- [ ] Add `scripts/plugins/` with `__init__.py`, `README.md`, and `example_plugin.py`
|
||||
- [ ] Add `config/plugins/` directory with `.gitkeep`; gitignore `config/plugins/*.yaml` (not `.example`)
|
||||
- [ ] Update `discover.py`: `load_plugins()` auto-discovery + tier gate (`custom_boards` = paid, `plugins` = free)
|
||||
- [ ] Update `search_profiles.yaml` schema: add `plugins:` list + `plugin_config:` block
|
||||
- [ ] Migrate `scripts/custom_boards/craigslist.py` → `scripts/plugins/craigslist.py` (CF freebie)
|
||||
- [ ] Settings UI: render `CONFIG_SCHEMA` fields for installed plugins (Settings → Search)
|
||||
- [ ] Rewrite `docs/developer-guide/adding-scrapers.md` to document the plugin API
|
||||
- [ ] Add `scripts/plugins/LICENSE` (MIT) to make the dual-license split explicit
|
||||
|
||||
**CF freebie candidates** (future, after plugin system ships):
|
||||
- Dice.com (tech-focused, no API key)
|
||||
- We Work Remotely (remote-only, clean HTML)
|
||||
- Wellfound / AngelList (startup roles)
|
||||
|
||||
---
|
||||
|
||||
## Discovery — Jobgether Non-Headless Scraper
|
||||
|
||||
Design doc: `peregrine/docs/superpowers/specs/2026-03-15-jobgether-integration-design.md`
|
||||
|
||||
**Background:** Headless Playwright is blocked by Cloudflare Turnstile on all `jobgether.com` pages.
|
||||
A non-headless Playwright instance backed by `Xvfb` (virtual framebuffer) renders as a real browser and
|
||||
bypasses Turnstile. Heimdall already has Xvfb available.
|
||||
|
||||
**Live-inspection findings (2026-03-15):**
|
||||
- Search URL: `https://jobgether.com/search-offers?keyword=<query>`
|
||||
- Job cards: `div.new-opportunity` — one per listing
|
||||
- Card URL: `div.new-opportunity > a[href*="/offer/"]` (`href` attr)
|
||||
- Title: `#offer-body h3`
|
||||
- Company: `#offer-body p.font-medium`
|
||||
- Dedup: existing URL-based dedup in `discover.py` covers Jobgether↔other-board overlap
|
||||
|
||||
**Implementation tasks (blocked until Xvfb-Playwright integration is in place):**
|
||||
- [ ] Add `Xvfb` launch helper to `scripts/custom_boards/` (shared util, or inline in scraper)
|
||||
- [ ] Implement `scripts/custom_boards/jobgether.py` using `p.chromium.launch(headless=False)` with `DISPLAY=:99`
|
||||
- [ ] Pre-launch `Xvfb :99 -screen 0 1280x720x24` (or assert `DISPLAY` is already set)
|
||||
- [ ] Register `jobgether` in `discover.py` `CUSTOM_SCRAPERS` (currently omitted — no viable scraper)
|
||||
- [ ] Add `jobgether` to `custom_boards` in remote-eligible profiles in `config/search_profiles.yaml`
|
||||
- [ ] Remove or update the "Jobgether discovery scraper — decided against" note in the design spec
|
||||
|
||||
**Pre-condition:** Validate Xvfb approach manually (headless=False + `DISPLAY=:99`) before implementing.
|
||||
The `filter-api.jobgether.com` endpoint still requires auth and `robots.txt` still blocks bots —
|
||||
confirm Turnstile acceptance is the only remaining blocker before beginning.
|
||||
|
||||
---
|
||||
|
||||
## Settings / Data Management
|
||||
|
|
@ -17,6 +93,31 @@ Unscheduled ideas and deferred features. Roughly grouped by area.
|
|||
|
||||
---
|
||||
|
||||
## LinkedIn Import
|
||||
|
||||
Shipped in v0.4.0. Ongoing maintenance and known decisions:
|
||||
|
||||
- **Selector maintenance** — LinkedIn changes their DOM periodically. When import stops working, update
|
||||
CSS selectors in `scripts/linkedin_utils.py` only (all other files import from there). Real `data-section`
|
||||
attribute values (as of 2025 DOM): `summary`, `currentPositionsDetails`, `educationsDetails`,
|
||||
`certifications`, `posts`, `volunteering`, `publications`, `projects`.
|
||||
|
||||
- **Data export zip is the recommended path for full history** — LinkedIn's unauthenticated public profile
|
||||
page is server-side degraded: experience titles, past roles, education, and skills are blurred/omitted.
|
||||
Only available without login: name, About summary (truncated), current employer name, certifications.
|
||||
The "Import from LinkedIn data export zip" expander (Settings → Resume Profile and Wizard step 3) is the
|
||||
correct path for full career history. UI already shows an `ℹ️` callout explaining this.
|
||||
|
||||
- **LinkedIn OAuth — decided: not viable** — LinkedIn's OAuth API is restricted to approved partner
|
||||
programs. Even if approved, it only grants name + email (not career history, experience, or skills).
|
||||
This is a deliberate LinkedIn platform restriction, not a technical gap. Do not pursue this path.
|
||||
|
||||
- **Selector test harness** (future) — A lightweight test that fetches a known-public LinkedIn profile
|
||||
and asserts at least N fields non-empty would catch DOM breakage before users report it. Low priority
|
||||
until selector breakage becomes a recurring support issue.
|
||||
|
||||
---
|
||||
|
||||
## Cover Letter / Resume Generation
|
||||
|
||||
- ~~**Iterative refinement feedback loop**~~ — ✅ Done (`94225c9`): `generate()` accepts `previous_result`/`feedback`; task_runner parses params JSON; Apply Workspace has "Refine with Feedback" expander. Same pattern available for wizard `expand_bullets` via `_run_wizard_generate`.
|
||||
|
|
|
|||
|
|
@ -6,87 +6,179 @@ This page describes Peregrine's system structure, layer boundaries, and key desi
|
|||
|
||||
## System Overview
|
||||
|
||||
### Pipeline
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
sources["JobSpy\nCustom Boards"]
|
||||
discover["discover.py"]
|
||||
db[("staging.db\nSQLite")]
|
||||
match["match.py\nScoring"]
|
||||
review["Job Review\nApprove / Reject"]
|
||||
apply["Apply Workspace\nCover letter + PDF"]
|
||||
kanban["Interviews\nphone_screen → hired"]
|
||||
sync["sync.py"]
|
||||
notion["Notion DB"]
|
||||
|
||||
sources --> discover --> db --> match --> review --> apply --> kanban
|
||||
db --> sync --> notion
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Docker Compose │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌───────┐ ┌───────────────┐ │
|
||||
│ │ app │ │ ollama │ │ vllm │ │ vision │ │
|
||||
│ │ :8501 │ │ :11434 │ │ :8000 │ │ :8002 │ │
|
||||
│ │Streamlit │ │ Local LLM│ │ vLLM │ │ Moondream2 │ │
|
||||
│ └────┬─────┘ └──────────┘ └───────┘ └───────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────┴───────┐ ┌─────────────┐ │
|
||||
│ │ searxng │ │ staging.db │ │
|
||||
│ │ :8888 │ │ (SQLite) │ │
|
||||
│ └────────────┘ └─────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Streamlit App Layer │
|
||||
│ │
|
||||
│ app/app.py (entry point, navigation, sidebar task badge) │
|
||||
│ │
|
||||
│ app/pages/ │
|
||||
│ 0_Setup.py First-run wizard (gates everything) │
|
||||
│ 1_Job_Review.py Approve / reject queue │
|
||||
│ 2_Settings.py All user configuration │
|
||||
│ 4_Apply.py Cover letter gen + PDF export │
|
||||
│ 5_Interviews.py Kanban: phone_screen → hired │
|
||||
│ 6_Interview_Prep.py Research brief + practice Q&A │
|
||||
│ 7_Survey.py Culture-fit survey assistant │
|
||||
│ │
|
||||
│ app/wizard/ │
|
||||
│ step_hardware.py ... step_integrations.py │
|
||||
│ tiers.py Feature gate definitions │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
### Docker Compose Services
|
||||
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Scripts Layer │
|
||||
│ (framework-independent — could be called by FastAPI) │
|
||||
│ │
|
||||
│ discover.py JobSpy + custom board orchestration │
|
||||
│ match.py Resume keyword scoring │
|
||||
│ db.py All SQLite helpers (single source) │
|
||||
│ llm_router.py LLM fallback chain │
|
||||
│ generate_cover_letter.py Cover letter generation │
|
||||
│ company_research.py Pre-interview research brief │
|
||||
│ task_runner.py Background daemon thread executor │
|
||||
│ imap_sync.py IMAP email fetch + classify │
|
||||
│ sync.py Push to external integrations │
|
||||
│ user_profile.py UserProfile wrapper for user.yaml │
|
||||
│ preflight.py Port + resource check │
|
||||
│ │
|
||||
│ custom_boards/ Per-board scrapers │
|
||||
│ integrations/ Per-service integration drivers │
|
||||
│ vision_service/ FastAPI Moondream2 inference server │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
Three compose files serve different deployment contexts:
|
||||
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Config Layer │
|
||||
│ │
|
||||
│ config/user.yaml Personal data + wizard state │
|
||||
│ config/llm.yaml LLM backends + fallback chains │
|
||||
│ config/search_profiles.yaml Job search configuration │
|
||||
│ config/resume_keywords.yaml Scoring keywords │
|
||||
│ config/blocklist.yaml Excluded companies/domains │
|
||||
│ config/email.yaml IMAP credentials │
|
||||
│ config/integrations/ Per-integration credentials │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
| File | Project name | Port | Purpose |
|
||||
|------|-------------|------|---------|
|
||||
| `compose.yml` | `peregrine` | 8502 | Local self-hosted install (default) |
|
||||
| `compose.demo.yml` | `peregrine-demo` | 8504 | Public demo at `demo.circuitforge.tech/peregrine` — `DEMO_MODE=true`, no LLM |
|
||||
| `compose.cloud.yml` | `peregrine-cloud` | 8505 | Cloud managed instance at `menagerie.circuitforge.tech/peregrine` — `CLOUD_MODE=true`, per-user data |
|
||||
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Database Layer │
|
||||
│ │
|
||||
│ staging.db (SQLite, local, gitignored) │
|
||||
│ │
|
||||
│ jobs Core pipeline — all job data │
|
||||
│ job_contacts Email thread log per job │
|
||||
│ company_research LLM-generated research briefs │
|
||||
│ background_tasks Async task queue state │
|
||||
│ survey_responses Culture-fit survey Q&A pairs │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph local["compose.yml (local)"]
|
||||
app_l["**app** :8502\nStreamlit UI"]
|
||||
ollama_l["**ollama**\nLocal LLM"]
|
||||
vllm_l["**vllm**\nvLLM"]
|
||||
vision_l["**vision**\nMoondream2"]
|
||||
searxng_l["**searxng**\nWeb Search"]
|
||||
db_l[("staging.db\nSQLite")]
|
||||
end
|
||||
|
||||
subgraph cloud["compose.cloud.yml (cloud)"]
|
||||
app_c["**app** :8505\nStreamlit UI\nCLOUD_MODE=true"]
|
||||
searxng_c["**searxng**\nWeb Search"]
|
||||
db_c[("menagerie-data/\n<user-id>/staging.db\nSQLCipher")]
|
||||
pg[("Postgres\nplatform DB\n:5433")]
|
||||
end
|
||||
```
|
||||
|
||||
Solid lines = always connected. Dashed lines = optional/profile-dependent backends.
|
||||
|
||||
### Streamlit App Layer
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
entry["app/app.py\nEntry point · navigation · sidebar task badge"]
|
||||
|
||||
setup["0_Setup.py\nFirst-run wizard\n⚠️ Gates everything"]
|
||||
review["1_Job_Review.py\nApprove / reject queue"]
|
||||
settings["2_Settings.py\nAll user configuration"]
|
||||
apply["4_Apply.py\nCover letter gen + PDF export"]
|
||||
interviews["5_Interviews.py\nKanban: phone_screen → hired"]
|
||||
prep["6_Interview_Prep.py\nResearch brief + practice Q&A"]
|
||||
survey["7_Survey.py\nCulture-fit survey assistant"]
|
||||
wizard["app/wizard/\nstep_hardware.py … step_integrations.py\ntiers.py — feature gate definitions"]
|
||||
|
||||
entry --> setup
|
||||
entry --> review
|
||||
entry --> settings
|
||||
entry --> apply
|
||||
entry --> interviews
|
||||
entry --> prep
|
||||
entry --> survey
|
||||
setup <-.->|wizard steps| wizard
|
||||
```
|
||||
|
||||
### Scripts Layer
|
||||
|
||||
Framework-independent — no Streamlit imports. Can be called from CLI, FastAPI, or background threads.
|
||||
|
||||
| Script | Purpose |
|
||||
|--------|---------|
|
||||
| `discover.py` | JobSpy + custom board orchestration |
|
||||
| `match.py` | Resume keyword scoring |
|
||||
| `db.py` | All SQLite helpers (single source of truth) |
|
||||
| `llm_router.py` | LLM fallback chain |
|
||||
| `generate_cover_letter.py` | Cover letter generation |
|
||||
| `company_research.py` | Pre-interview research brief |
|
||||
| `task_runner.py` | Background daemon thread executor |
|
||||
| `imap_sync.py` | IMAP email fetch + classify |
|
||||
| `sync.py` | Push to external integrations |
|
||||
| `user_profile.py` | `UserProfile` wrapper for `user.yaml` |
|
||||
| `preflight.py` | Port + resource check |
|
||||
| `custom_boards/` | Per-board scrapers |
|
||||
| `integrations/` | Per-service integration drivers |
|
||||
| `vision_service/` | FastAPI Moondream2 inference server |
|
||||
|
||||
### Config Layer
|
||||
|
||||
Plain YAML files. Gitignored files contain secrets; `.example` files are committed as templates.
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `config/user.yaml` | Personal data + wizard state |
|
||||
| `config/llm.yaml` | LLM backends + fallback chains |
|
||||
| `config/search_profiles.yaml` | Job search configuration |
|
||||
| `config/resume_keywords.yaml` | Scoring keywords |
|
||||
| `config/blocklist.yaml` | Excluded companies/domains |
|
||||
| `config/email.yaml` | IMAP credentials |
|
||||
| `config/integrations/` | Per-integration credentials |
|
||||
|
||||
### Database Layer
|
||||
|
||||
**Local mode** — `staging.db`: SQLite, single file, gitignored.
|
||||
|
||||
**Cloud mode** — Hybrid:
|
||||
|
||||
- **Postgres (platform layer):** account data, subscriptions, telemetry consent. Shared across all users.
|
||||
- **SQLite-per-user (content layer):** each user's job data in an isolated, SQLCipher-encrypted file at `/devl/menagerie-data/<user-id>/peregrine/staging.db`. Schema is identical to local — the app sees no difference.
|
||||
|
||||
#### Local SQLite tables
|
||||
|
||||
| Table | Purpose |
|
||||
|-------|---------|
|
||||
| `jobs` | Core pipeline — all job data |
|
||||
| `job_contacts` | Email thread log per job |
|
||||
| `company_research` | LLM-generated research briefs |
|
||||
| `background_tasks` | Async task queue state |
|
||||
| `survey_responses` | Culture-fit survey Q&A pairs |
|
||||
|
||||
#### Postgres platform tables (cloud only)
|
||||
|
||||
| Table | Purpose |
|
||||
|-------|---------|
|
||||
| `subscriptions` | User tier, license JWT, product |
|
||||
| `usage_events` | Anonymous usage telemetry (consent-gated) |
|
||||
| `telemetry_consent` | Per-user telemetry preferences + hard kill switch |
|
||||
| `support_access_grants` | Time-limited support session grants |
|
||||
|
||||
---
|
||||
|
||||
### Cloud Session Middleware
|
||||
|
||||
`app/cloud_session.py` handles multi-tenant routing transparently:
|
||||
|
||||
```
|
||||
Request → Caddy injects X-CF-Session header (from Directus session cookie)
|
||||
→ resolve_session() validates JWT, derives db_path + db_key
|
||||
→ all DB calls use get_db_path() instead of DEFAULT_DB
|
||||
```
|
||||
|
||||
Key functions:
|
||||
|
||||
| Function | Purpose |
|
||||
|----------|---------|
|
||||
| `resolve_session(app)` | Called at top of every page — no-op in local mode |
|
||||
| `get_db_path()` | Returns per-user `db_path` (cloud) or `DEFAULT_DB` (local) |
|
||||
| `derive_db_key(user_id)` | `HMAC(SERVER_SECRET, user_id)` — deterministic per-user SQLCipher key |
|
||||
|
||||
The app code never branches on `CLOUD_MODE` except at the entry points (`resolve_session` and `get_db_path`). Everything downstream is transparent.
|
||||
|
||||
### Telemetry (cloud only)
|
||||
|
||||
`app/telemetry.py` is the **only** path to the `usage_events` table. No feature may write there directly.
|
||||
|
||||
```python
|
||||
from app.telemetry import log_usage_event
|
||||
|
||||
log_usage_event(user_id, "peregrine", "cover_letter_generated", {"words": 350})
|
||||
```
|
||||
|
||||
- Complete no-op when `CLOUD_MODE=false`
|
||||
- Checks `telemetry_consent.all_disabled` first — if set, nothing is written, no exceptions
|
||||
- Swallows all exceptions so telemetry never crashes the app
|
||||
|
||||
---
|
||||
|
||||
## Layer Boundaries
|
||||
|
|
@ -129,7 +221,18 @@ submit_task(db_path, task_type="cover_letter", job_id=42)
|
|||
submit_task(db_path, task_type="company_research", job_id=42)
|
||||
```
|
||||
|
||||
Tasks are recorded in the `background_tasks` table with statuses: `queued → running → completed / failed`.
|
||||
Tasks are recorded in the `background_tasks` table with the following state machine:
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> queued : submit_task()
|
||||
queued --> running : daemon picks up
|
||||
running --> completed
|
||||
running --> failed
|
||||
queued --> failed : server restart clears stuck tasks
|
||||
completed --> [*]
|
||||
failed --> [*]
|
||||
```
|
||||
|
||||
**Dedup rule:** Only one `queued` or `running` task per `(task_type, job_id)` pair is allowed at a time. Submitting a duplicate is a silent no-op.
|
||||
|
||||
|
|
@ -166,3 +269,18 @@ The scripts layer was deliberately kept free of Streamlit imports. This means th
|
|||
### Vision service is a separate process
|
||||
|
||||
Moondream2 requires `torch` and `transformers`, which are incompatible with the lightweight main conda environment. The vision service runs as a separate FastAPI process in a separate conda environment (`job-seeker-vision`), keeping the main env free of GPU dependencies.
|
||||
|
||||
### Cloud mode is a transparent layer, not a fork
|
||||
|
||||
`CLOUD_MODE=true` activates two entry points (`resolve_session`, `get_db_path`) and the telemetry middleware. Every other line of app code is unchanged. There is no cloud branch, no conditional imports, no schema divergence. The local-first architecture is preserved end-to-end; the cloud layer sits on top of it.
|
||||
|
||||
### SQLite-per-user instead of shared Postgres
|
||||
|
||||
Each cloud user gets their own encrypted SQLite file. This means:
|
||||
|
||||
- No SQL migrations when the schema changes — new users get the latest schema, existing users keep their file as-is
|
||||
- Zero risk of cross-user data leakage at the DB layer
|
||||
- GDPR deletion is `rm -rf /devl/menagerie-data/<user-id>/` — auditable and complete
|
||||
- The app can be tested locally with `CLOUD_MODE=false` without any Postgres dependency
|
||||
|
||||
The Postgres platform DB holds only account metadata (subscriptions, consent, telemetry) — never job search content.
|
||||
|
|
|
|||
198
docs/developer-guide/cloud-deployment.md
Normal file
198
docs/developer-guide/cloud-deployment.md
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
# Cloud Deployment
|
||||
|
||||
This page covers operating the Peregrine cloud managed instance at `menagerie.circuitforge.tech/peregrine`.
|
||||
|
||||
---
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
Browser → Caddy (bastion) → host:8505 → peregrine-cloud container
|
||||
│
|
||||
┌─────────────────────────┼──────────────────────────┐
|
||||
│ │ │
|
||||
cloud_session.py /devl/menagerie-data/ Postgres :5433
|
||||
(session routing) <user-id>/peregrine/ (platform DB)
|
||||
staging.db (SQLCipher)
|
||||
```
|
||||
|
||||
Caddy injects the Directus session cookie as `X-CF-Session`. `cloud_session.py` validates the JWT, derives the per-user db path and SQLCipher key, and injects both into `st.session_state`. All downstream DB calls are transparent — the app never knows it's multi-tenant.
|
||||
|
||||
---
|
||||
|
||||
## Compose File
|
||||
|
||||
```bash
|
||||
# Start
|
||||
docker compose -f compose.cloud.yml --project-name peregrine-cloud --env-file .env up -d
|
||||
|
||||
# Stop
|
||||
docker compose -f compose.cloud.yml --project-name peregrine-cloud down
|
||||
|
||||
# Logs
|
||||
docker compose -f compose.cloud.yml --project-name peregrine-cloud logs app -f
|
||||
|
||||
# Rebuild after code changes
|
||||
docker compose -f compose.cloud.yml --project-name peregrine-cloud build app
|
||||
docker compose -f compose.cloud.yml --project-name peregrine-cloud up -d
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Required Environment Variables
|
||||
|
||||
These must be present in `.env` (gitignored) before starting the cloud stack:
|
||||
|
||||
| Variable | Description | Where to find |
|
||||
|----------|-------------|---------------|
|
||||
| `CLOUD_MODE` | Must be `true` | Hardcoded in compose.cloud.yml |
|
||||
| `CLOUD_DATA_ROOT` | Host path for per-user data trees | `/devl/menagerie-data` |
|
||||
| `DIRECTUS_JWT_SECRET` | Directus signing secret — validates session JWTs | `website/.env` → `DIRECTUS_SECRET` |
|
||||
| `CF_SERVER_SECRET` | Server secret for SQLCipher key derivation | Generate: `openssl rand -base64 32 \| tr -d '/=+' \| cut -c1-32` |
|
||||
| `PLATFORM_DB_URL` | Postgres connection string for platform DB | `postgresql://cf_platform:<pass>@host.docker.internal:5433/circuitforge_platform` |
|
||||
|
||||
!!! warning "SECRET ROTATION"
|
||||
`CF_SERVER_SECRET` is used to derive all per-user SQLCipher keys via `HMAC(secret, user_id)`. Rotating this secret renders all existing user databases unreadable. Do not rotate it without a migration plan.
|
||||
|
||||
---
|
||||
|
||||
## Data Root
|
||||
|
||||
User data lives at `/devl/menagerie-data/` on the host, bind-mounted into the container:
|
||||
|
||||
```
|
||||
/devl/menagerie-data/
|
||||
<directus-user-uuid>/
|
||||
peregrine/
|
||||
staging.db ← SQLCipher-encrypted (AES-256)
|
||||
config/ ← llm.yaml, server.yaml, user.yaml, etc.
|
||||
data/ ← documents, exports, attachments
|
||||
```
|
||||
|
||||
The directory is created automatically on first login. The SQLCipher key for each user is derived deterministically: `HMAC-SHA256(CF_SERVER_SECRET, user_id)`.
|
||||
|
||||
### GDPR / Data deletion
|
||||
|
||||
To fully delete a user's data:
|
||||
|
||||
```bash
|
||||
# Remove all content data
|
||||
rm -rf /devl/menagerie-data/<user-id>/
|
||||
|
||||
# Remove platform DB rows (cascades)
|
||||
docker exec cf-platform-db psql -U cf_platform -d circuitforge_platform \
|
||||
-c "DELETE FROM subscriptions WHERE user_id = '<user-id>';"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Platform Database
|
||||
|
||||
The Postgres platform DB runs as `cf-platform-db` in the website compose stack (port 5433 on host).
|
||||
|
||||
```bash
|
||||
# Connect
|
||||
docker exec cf-platform-db psql -U cf_platform -d circuitforge_platform
|
||||
|
||||
# Check tables
|
||||
\dt
|
||||
|
||||
# View telemetry consent for a user
|
||||
SELECT * FROM telemetry_consent WHERE user_id = '<uuid>';
|
||||
|
||||
# View recent usage events
|
||||
SELECT user_id, event_type, occurred_at FROM usage_events
|
||||
ORDER BY occurred_at DESC LIMIT 20;
|
||||
```
|
||||
|
||||
The schema is initialised on container start from `platform-db/init.sql` in the website repo.
|
||||
|
||||
---
|
||||
|
||||
## Telemetry
|
||||
|
||||
`app/telemetry.py` is the **only** entry point to `usage_events`. Never write to that table directly.
|
||||
|
||||
```python
|
||||
from app.telemetry import log_usage_event
|
||||
|
||||
# Fires in cloud mode only; no-op locally
|
||||
log_usage_event(user_id, "peregrine", "cover_letter_generated", {"words": 350})
|
||||
```
|
||||
|
||||
Events are blocked if:
|
||||
|
||||
1. `telemetry_consent.all_disabled = true` (hard kill switch, overrides all)
|
||||
2. `telemetry_consent.usage_events_enabled = false`
|
||||
|
||||
The user controls both from Settings → 🔒 Privacy.
|
||||
|
||||
---
|
||||
|
||||
## Backup / Restore (Cloud Mode)
|
||||
|
||||
The Settings → 💾 Data tab handles backup/restore transparently. In cloud mode:
|
||||
|
||||
- **Export:** the SQLCipher-encrypted DB is decrypted before zipping — the downloaded `.zip` is a portable plain SQLite archive, compatible with any local Docker install.
|
||||
- **Import:** a plain SQLite backup is re-encrypted with the user's key on restore.
|
||||
|
||||
The user's `base_dir` in cloud mode is `get_db_path().parent` (`/devl/menagerie-data/<user-id>/peregrine/`), not the app root.
|
||||
|
||||
---
|
||||
|
||||
## Routing (Caddy)
|
||||
|
||||
`menagerie.circuitforge.tech` in `/devl/caddy-proxy/Caddyfile`:
|
||||
|
||||
```caddy
|
||||
menagerie.circuitforge.tech {
|
||||
encode gzip zstd
|
||||
handle /peregrine* {
|
||||
reverse_proxy http://host.docker.internal:8505 {
|
||||
header_up X-CF-Session {header.Cookie}
|
||||
}
|
||||
}
|
||||
handle {
|
||||
respond "This app is not yet available in the managed cloud — check back soon." 503
|
||||
}
|
||||
log {
|
||||
output file /data/logs/menagerie.circuitforge.tech.log
|
||||
format json
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`header_up X-CF-Session {header.Cookie}` passes the full cookie header so `cloud_session.py` can extract the Directus session token.
|
||||
|
||||
!!! note "Caddy inode gotcha"
|
||||
After editing the Caddyfile, run `docker restart caddy-proxy` — not `caddy reload`. The Edit tool creates a new inode; Docker bind mounts pin to the original inode and `caddy reload` re-reads the stale one.
|
||||
|
||||
---
|
||||
|
||||
## Demo Instance
|
||||
|
||||
The public demo at `demo.circuitforge.tech/peregrine` runs separately:
|
||||
|
||||
```bash
|
||||
# Start demo
|
||||
docker compose -f compose.demo.yml --project-name peregrine-demo up -d
|
||||
|
||||
# Rebuild after code changes
|
||||
docker compose -f compose.demo.yml --project-name peregrine-demo build app
|
||||
docker compose -f compose.demo.yml --project-name peregrine-demo up -d
|
||||
```
|
||||
|
||||
`DEMO_MODE=true` blocks all LLM inference calls at `llm_router.py`. Discovery, job enrichment, and the UI work normally. Demo data lives in `demo/config/` and `demo/data/` — isolated from personal data.
|
||||
|
||||
---
|
||||
|
||||
## Adding a New App to the Cloud
|
||||
|
||||
To onboard a new menagerie app (e.g. `falcon`) to the cloud:
|
||||
|
||||
1. Add `resolve_session("falcon")` at the top of each page (calls `cloud_session.py` with the app slug)
|
||||
2. Replace `DEFAULT_DB` references with `get_db_path()`
|
||||
3. Add `app/telemetry.py` import and `log_usage_event()` calls at key action points
|
||||
4. Create `compose.cloud.yml` following the Peregrine pattern (port, `CLOUD_MODE=true`, data mount)
|
||||
5. Add a Caddy `handle /falcon*` block in `menagerie.circuitforge.tech`, routing to the new port
|
||||
6. `cloud_session.py` automatically creates `<data_root>/<user-id>/falcon/` on first login
|
||||
|
|
@ -102,6 +102,23 @@ Before opening a pull request:
|
|||
|
||||
---
|
||||
|
||||
## Database Migrations
|
||||
|
||||
Peregrine uses a numbered SQL migration system (Rails-style). Each migration is a `.sql` file in the `migrations/` directory at the repo root, named `NNN_description.sql` (e.g. `002_add_foo_column.sql`). Applied migrations are tracked in a `schema_migrations` table in each user database.
|
||||
|
||||
### Adding a migration
|
||||
|
||||
1. Create `migrations/NNN_description.sql` where `NNN` is the next sequential number (zero-padded to 3 digits).
|
||||
2. Write standard SQL — `CREATE TABLE IF NOT EXISTS`, `ALTER TABLE ADD COLUMN`, etc. Keep each migration idempotent where possible.
|
||||
3. Do **not** modify `scripts/db.py`'s legacy `_MIGRATIONS` lists — those are superseded and will be removed once all active databases have been bootstrapped by the migration runner.
|
||||
4. The runner (`scripts/db_migrate.py`) applies pending migrations at startup automatically (both FastAPI and Streamlit paths call `migrate_db(db_path)`).
|
||||
|
||||
### Rollbacks
|
||||
|
||||
SQLite does not support transactional DDL for all statement types. Write forward-only migrations. If you need to undo a schema change, add a new migration that reverses it.
|
||||
|
||||
---
|
||||
|
||||
## What NOT to Do
|
||||
|
||||
- Do not commit `config/user.yaml`, `config/notion.yaml`, `config/email.yaml`, `config/adzuna.yaml`, or any `config/integrations/*.yaml` — all are gitignored
|
||||
|
|
|
|||
|
|
@ -1,201 +0,0 @@
|
|||
# Job Seeker Platform — Design Document
|
||||
**Date:** 2026-02-20
|
||||
**Status:** Approved
|
||||
**Candidate:** Meghan McCann
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A monorepo project at `/devl/job-seeker/` that integrates three FOSS tools into a
|
||||
cohesive job search pipeline: automated discovery (JobSpy), resume-to-listing keyword
|
||||
matching (Resume Matcher), and automated application submission (AIHawk). Job listings
|
||||
and interactive documents are tracked in Notion; source documents live in
|
||||
`/Library/Documents/JobSearch/`.
|
||||
|
||||
---
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
/devl/job-seeker/
|
||||
├── config/
|
||||
│ ├── search_profiles.yaml # JobSpy queries (titles, locations, boards)
|
||||
│ ├── llm.yaml # LLM router: backends + fallback order
|
||||
│ └── notion.yaml # Notion DB IDs and field mappings
|
||||
├── aihawk/ # git clone — Auto_Jobs_Applier_AIHawk
|
||||
├── resume_matcher/ # git clone — Resume-Matcher
|
||||
├── scripts/
|
||||
│ ├── discover.py # JobSpy → deduplicate → push to Notion
|
||||
│ ├── match.py # Notion job URL → Resume Matcher → write score back
|
||||
│ └── llm_router.py # LLM abstraction layer with priority fallback chain
|
||||
├── docs/plans/ # Design and implementation docs (no resume files)
|
||||
├── environment.yml # conda env spec (env name: job-seeker)
|
||||
└── .gitignore
|
||||
```
|
||||
|
||||
**Document storage rule:** Resumes, cover letters, and any interactive documents live
|
||||
in `/Library/Documents/JobSearch/` or Notion — never committed to this repo.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
### Data Flow
|
||||
|
||||
```
|
||||
JobSpy (LinkedIn / Indeed / Glassdoor / ZipRecruiter)
|
||||
└─▶ discover.py
|
||||
├─ deduplicate by URL against existing Notion records
|
||||
└─▶ Notion DB (Status: "New")
|
||||
|
||||
Notion DB (daily review — decide what to pursue)
|
||||
└─▶ match.py <notion-page-url>
|
||||
├─ fetch job description from listing URL
|
||||
├─ run Resume Matcher vs. /Library/Documents/JobSearch/Meghan_McCann_Resume_02-19-2025.pdf
|
||||
└─▶ write Match Score + Keyword Gaps back to Notion page
|
||||
|
||||
AIHawk (when ready to apply)
|
||||
├─ reads config pointing to same resume + personal_info.yaml
|
||||
├─ llm_router.py → best available LLM backend
|
||||
├─ submits LinkedIn Easy Apply
|
||||
└─▶ Notion status → "Applied"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notion Database Schema
|
||||
|
||||
| Field | Type | Notes |
|
||||
|---------------|----------|------------------------------------------------------------|
|
||||
| Job Title | Title | Primary identifier |
|
||||
| Company | Text | |
|
||||
| Location | Text | |
|
||||
| Remote | Checkbox | |
|
||||
| URL | URL | Deduplication key |
|
||||
| Source | Select | LinkedIn / Indeed / Glassdoor / ZipRecruiter |
|
||||
| Status | Select | New → Reviewing → Applied → Interview → Offer → Rejected |
|
||||
| Match Score | Number | 0–100, written by match.py |
|
||||
| Keyword Gaps | Text | Comma-separated missing keywords from Resume Matcher |
|
||||
| Salary | Text | If listed |
|
||||
| Date Found | Date | Set at discovery time |
|
||||
| Notes | Text | Manual field |
|
||||
|
||||
---
|
||||
|
||||
## LLM Router (`scripts/llm_router.py`)
|
||||
|
||||
Single `complete(prompt, system=None)` interface. On each call: health-check each
|
||||
backend in configured order, use the first that responds. Falls back silently on
|
||||
connection error, timeout, or 5xx. Logs which backend was used.
|
||||
|
||||
All backends except Anthropic use the `openai` Python package (OpenAI-compatible
|
||||
endpoints). Anthropic uses the `anthropic` package.
|
||||
|
||||
### `config/llm.yaml`
|
||||
|
||||
```yaml
|
||||
fallback_order:
|
||||
- claude_code # port 3009 — Claude via local pipeline (highest quality)
|
||||
- ollama # port 11434 — local, always-on
|
||||
- vllm # port 8000 — start when needed
|
||||
- github_copilot # port 3010 — Copilot via gh token
|
||||
- anthropic # cloud fallback, burns API credits
|
||||
|
||||
backends:
|
||||
claude_code:
|
||||
type: openai_compat
|
||||
base_url: http://localhost:3009/v1
|
||||
model: claude-code-terminal
|
||||
api_key: "any"
|
||||
|
||||
ollama:
|
||||
type: openai_compat
|
||||
base_url: http://localhost:11434/v1
|
||||
model: llama3.2
|
||||
api_key: "ollama"
|
||||
|
||||
vllm:
|
||||
type: openai_compat
|
||||
base_url: http://localhost:8000/v1
|
||||
model: __auto__
|
||||
api_key: ""
|
||||
|
||||
github_copilot:
|
||||
type: openai_compat
|
||||
base_url: http://localhost:3010/v1
|
||||
model: gpt-4o
|
||||
api_key: "any"
|
||||
|
||||
anthropic:
|
||||
type: anthropic
|
||||
model: claude-sonnet-4-6
|
||||
api_key_env: ANTHROPIC_API_KEY
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Job Search Profile
|
||||
|
||||
### `config/search_profiles.yaml` (initial)
|
||||
|
||||
```yaml
|
||||
profiles:
|
||||
- name: cs_leadership
|
||||
titles:
|
||||
- "Customer Success Manager"
|
||||
- "Director of Customer Success"
|
||||
- "VP Customer Success"
|
||||
- "Head of Customer Success"
|
||||
- "Technical Account Manager"
|
||||
- "Revenue Operations Manager"
|
||||
- "Customer Experience Lead"
|
||||
locations:
|
||||
- "Remote"
|
||||
- "San Francisco Bay Area, CA"
|
||||
boards:
|
||||
- linkedin
|
||||
- indeed
|
||||
- glassdoor
|
||||
- zip_recruiter
|
||||
results_per_board: 25
|
||||
remote_only: false # remote preferred but Bay Area in-person ok
|
||||
hours_old: 72 # listings posted in last 3 days
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conda Environment
|
||||
|
||||
New dedicated env `job-seeker` (not base). Core packages:
|
||||
|
||||
- `python-jobspy` — job scraping
|
||||
- `notion-client` — Notion API
|
||||
- `openai` — OpenAI-compatible calls (Ollama, vLLM, Copilot, Claude pipeline)
|
||||
- `anthropic` — Anthropic API fallback
|
||||
- `pyyaml` — config parsing
|
||||
- `pandas` — CSV handling and dedup
|
||||
- Resume Matcher dependencies (sentence-transformers, streamlit — installed from clone)
|
||||
|
||||
Resume Matcher Streamlit UI runs on port **8501** (confirmed clear).
|
||||
|
||||
---
|
||||
|
||||
## Port Map
|
||||
|
||||
| Port | Service | Status |
|
||||
|-------|--------------------------------|----------------|
|
||||
| 3009 | Claude Code OpenAI wrapper | Start via manage.sh in Post Flight Processing |
|
||||
| 3010 | GitHub Copilot wrapper | Start via manage-copilot.sh |
|
||||
| 11434 | Ollama | Running |
|
||||
| 8000 | vLLM | Start when needed |
|
||||
| 8501 | Resume Matcher (Streamlit) | Start when needed |
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope (this phase)
|
||||
|
||||
- Scheduled/cron automation (run discover.py manually for now)
|
||||
- Email/SMS alerts for new listings
|
||||
- ATS resume rebuild (separate task)
|
||||
- Applications to non-LinkedIn platforms via AIHawk
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,148 +0,0 @@
|
|||
# Job Seeker Platform — Web UI Design
|
||||
|
||||
**Date:** 2026-02-20
|
||||
**Status:** Approved
|
||||
|
||||
## Overview
|
||||
|
||||
A Streamlit multi-page web UI that gives Meghan (and her partner) a friendly interface to review scraped job listings, curate them before they hit Notion, edit search/LLM/Notion settings, and fill out her AIHawk application profile. Designed to be usable by anyone — no technical knowledge required.
|
||||
|
||||
---
|
||||
|
||||
## Architecture & Data Flow
|
||||
|
||||
```
|
||||
discover.py → SQLite staging.db (status: pending)
|
||||
↓
|
||||
Streamlit UI
|
||||
review / approve / reject
|
||||
↓
|
||||
"Sync N approved jobs" button
|
||||
↓
|
||||
Notion DB (status: synced)
|
||||
```
|
||||
|
||||
`discover.py` is modified to write to SQLite instead of directly to Notion.
|
||||
A new `sync.py` handles the approved → Notion push.
|
||||
`db.py` provides shared SQLite helpers used by both scripts and UI pages.
|
||||
|
||||
### SQLite Schema (`staging.db`, gitignored)
|
||||
|
||||
```sql
|
||||
CREATE TABLE jobs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
title TEXT,
|
||||
company TEXT,
|
||||
url TEXT UNIQUE,
|
||||
source TEXT,
|
||||
location TEXT,
|
||||
is_remote INTEGER,
|
||||
salary TEXT,
|
||||
description TEXT,
|
||||
match_score REAL,
|
||||
keyword_gaps TEXT,
|
||||
date_found TEXT,
|
||||
status TEXT DEFAULT 'pending', -- pending / approved / rejected / synced
|
||||
notion_page_id TEXT
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pages
|
||||
|
||||
### Home (Dashboard)
|
||||
- Stat cards: Pending / Approved / Rejected / Synced counts
|
||||
- "Run Discovery" button — runs `discover.py` as subprocess, streams output
|
||||
- "Sync N approved jobs → Notion" button — visible only when approved count > 0
|
||||
- Recent activity list (last 10 jobs found)
|
||||
|
||||
### Job Review
|
||||
- Filterable table/card view of pending jobs
|
||||
- Filters: source (LinkedIn/Indeed/etc), remote only toggle, minimum match score slider
|
||||
- Checkboxes for batch selection
|
||||
- "Approve Selected" / "Reject Selected" buttons
|
||||
- Rejected jobs hidden by default, togglable
|
||||
- Match score shown as colored badge (green ≥70, amber 40–69, red <40)
|
||||
|
||||
### Settings
|
||||
Three tabs:
|
||||
|
||||
**Search** — edit `config/search_profiles.yaml`:
|
||||
- Job titles (add/remove tags)
|
||||
- Locations (add/remove)
|
||||
- Boards checkboxes
|
||||
- Hours old slider
|
||||
- Results per board slider
|
||||
|
||||
**LLM Backends** — edit `config/llm.yaml`:
|
||||
- Fallback order (drag or up/down arrows)
|
||||
- Per-backend: URL, model name, enabled toggle
|
||||
- "Test connection" button per backend
|
||||
|
||||
**Notion** — edit `config/notion.yaml`:
|
||||
- Token field (masked, show/hide toggle)
|
||||
- Database ID
|
||||
- "Test connection" button
|
||||
|
||||
### Resume Editor
|
||||
Sectioned form over `aihawk/data_folder/plain_text_resume.yaml`:
|
||||
- **Personal Info** — name, email, phone, LinkedIn, city, zip
|
||||
- **Education** — list of entries, add/remove buttons
|
||||
- **Experience** — list of entries, add/remove buttons
|
||||
- **Skills & Interests** — tag-style inputs
|
||||
- **Preferences** — salary range, notice period, remote/relocation toggles
|
||||
- **Self-Identification** — gender, pronouns, veteran, disability, ethnicity (with "prefer not to say" options)
|
||||
- **Legal** — work authorization checkboxes
|
||||
|
||||
`FILL_IN` fields highlighted in amber with "Needs your attention" note.
|
||||
Save button writes back to YAML. No raw YAML shown by default.
|
||||
|
||||
---
|
||||
|
||||
## Theme & Styling
|
||||
|
||||
Central theme at `app/.streamlit/config.toml`:
|
||||
- Dark base, accent color teal/green (job search = growth)
|
||||
- Consistent font (Inter or system sans-serif)
|
||||
- Responsive column layouts — usable on tablet/mobile
|
||||
- No jargon — "Run Discovery" not "Execute scrape", "Sync to Notion" not "Push records"
|
||||
|
||||
---
|
||||
|
||||
## File Layout
|
||||
|
||||
```
|
||||
app/
|
||||
├── .streamlit/
|
||||
│ └── config.toml # central theme
|
||||
├── Home.py # dashboard
|
||||
└── pages/
|
||||
├── 1_Job_Review.py
|
||||
├── 2_Settings.py
|
||||
└── 3_Resume_Editor.py
|
||||
scripts/
|
||||
├── db.py # new: SQLite helpers
|
||||
├── sync.py # new: approved → Notion push
|
||||
├── discover.py # modified: write to SQLite not Notion
|
||||
├── match.py # unchanged
|
||||
└── llm_router.py # unchanged
|
||||
```
|
||||
|
||||
Run: `conda run -n job-seeker streamlit run app/Home.py`
|
||||
|
||||
---
|
||||
|
||||
## New Dependencies
|
||||
|
||||
None — `streamlit` already installed via resume_matcher deps.
|
||||
`sqlite3` is Python stdlib.
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- Real-time collaboration
|
||||
- Mobile native app
|
||||
- Cover letter editor (handled separately via LoRA fine-tune task)
|
||||
- AIHawk trigger from UI (run manually for now)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,100 +0,0 @@
|
|||
# Background Task Processing — Design
|
||||
|
||||
**Date:** 2026-02-21
|
||||
**Status:** Approved
|
||||
|
||||
## Problem
|
||||
|
||||
Cover letter generation (`4_Apply.py`) and company research (`6_Interview_Prep.py`) call LLM scripts synchronously inside `st.spinner()`. If the user navigates away during generation, Streamlit abandons the in-progress call and the result is lost. Both results are already persisted to SQLite on completion, so if the task kept running in the background the result would be available on return.
|
||||
|
||||
## Solution Overview
|
||||
|
||||
Python threading + SQLite task table. When a user clicks Generate, a daemon thread is spawned immediately and the task is recorded in a new `background_tasks` table. The thread writes results to the existing tables (`jobs.cover_letter`, `company_research`) and marks itself complete/failed. All pages share a sidebar indicator that auto-refreshes while tasks are active. Individual pages show task-level status inline.
|
||||
|
||||
## SQLite Schema
|
||||
|
||||
New table `background_tasks` added in `scripts/db.py`:
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS background_tasks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
task_type TEXT NOT NULL, -- "cover_letter" | "company_research"
|
||||
job_id INTEGER NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'queued', -- queued | running | completed | failed
|
||||
error TEXT,
|
||||
created_at DATETIME DEFAULT (datetime('now')),
|
||||
started_at DATETIME,
|
||||
finished_at DATETIME
|
||||
)
|
||||
```
|
||||
|
||||
## Deduplication Rule
|
||||
|
||||
Before inserting a new task, check for an existing `queued` or `running` row with the same `(task_type, job_id)`. If one exists, reject the submission (return the existing task's id). Different task types for the same job (e.g. cover letter + research) are allowed to run concurrently. Different jobs of the same type are allowed concurrently.
|
||||
|
||||
## Components
|
||||
|
||||
### `scripts/task_runner.py` (new)
|
||||
|
||||
- `submit_task(db, task_type, job_id) -> int` — dedup check, insert row, spawn daemon thread, return task id
|
||||
- `_run_task(db, task_id, task_type, job_id)` — thread body: mark running, call generator, save result, mark completed/failed
|
||||
- `get_active_tasks(db) -> list[dict]` — all queued/running rows with job title+company joined
|
||||
- `get_task_for_job(db, task_type, job_id) -> dict | None` — latest task row for a specific job+type
|
||||
|
||||
### `scripts/db.py` (modified)
|
||||
|
||||
- Add `init_background_tasks(conn)` called inside `init_db()`
|
||||
- Add `insert_task`, `update_task_status`, `get_active_tasks`, `get_task_for_job` helpers
|
||||
|
||||
### `app/app.py` (modified)
|
||||
|
||||
- After `st.navigation()`, call `get_active_tasks()` and render sidebar indicator
|
||||
- Use `st.fragment` with `time.sleep(3)` + `st.rerun(scope="fragment")` to poll while tasks are active
|
||||
- Sidebar shows: `⏳ N task(s) running` count + per-task line (type + company name)
|
||||
- Fragment polling stops when active task count reaches zero
|
||||
|
||||
### `app/pages/4_Apply.py` (modified)
|
||||
|
||||
- Generate button calls `submit_task(db, "cover_letter", job_id)` instead of running inline
|
||||
- If a task is `queued`/`running` for the selected job, disable button and show inline status fragment (polls every 3s)
|
||||
- On `completed`, load cover letter from `jobs` row (already saved by thread)
|
||||
- On `failed`, show error message and re-enable button
|
||||
|
||||
### `app/pages/6_Interview_Prep.py` (modified)
|
||||
|
||||
- Generate/Refresh buttons call `submit_task(db, "company_research", job_id)` instead of running inline
|
||||
- Same inline status fragment pattern as Apply page
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
User clicks Generate
|
||||
→ submit_task(db, type, job_id)
|
||||
→ dedup check (reject if already queued/running for same type+job)
|
||||
→ INSERT background_tasks row (status=queued)
|
||||
→ spawn daemon thread
|
||||
→ return task_id
|
||||
→ page shows inline "⏳ Queued…" fragment
|
||||
|
||||
Thread runs
|
||||
→ UPDATE status=running, started_at=now
|
||||
→ call generate_cover_letter.generate() OR research_company()
|
||||
→ write result to jobs.cover_letter OR company_research table
|
||||
→ UPDATE status=completed, finished_at=now
|
||||
(on exception: UPDATE status=failed, error=str(e))
|
||||
|
||||
Sidebar fragment (every 3s while active tasks > 0)
|
||||
→ get_active_tasks() → render count + list
|
||||
→ st.rerun(scope="fragment")
|
||||
|
||||
Page fragment (every 3s while task for this job is running)
|
||||
→ get_task_for_job() → render status
|
||||
→ on completed: st.rerun() (full rerun to reload cover letter / research)
|
||||
```
|
||||
|
||||
## What Is Not Changed
|
||||
|
||||
- `generate_cover_letter.generate()` and `research_company()` are called unchanged from the thread
|
||||
- `update_cover_letter()` and `save_research()` DB helpers are reused unchanged
|
||||
- No new Python packages required
|
||||
- No separate worker process — daemon threads die with the Streamlit server, but results already written to SQLite survive
|
||||
|
|
@ -1,933 +0,0 @@
|
|||
# Background Task Processing Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Replace synchronous LLM calls in Apply and Interview Prep pages with background threads so cover letter and research generation survive page navigation.
|
||||
|
||||
**Architecture:** A new `background_tasks` SQLite table tracks task state. `scripts/task_runner.py` spawns daemon threads that call existing generator functions and write results via existing DB helpers. The Streamlit sidebar polls active tasks every 3s via `@st.fragment(run_every=3)`; individual pages show per-job status with the same pattern.
|
||||
|
||||
**Tech Stack:** Python `threading` (stdlib), SQLite, Streamlit `st.fragment` (≥1.33 — already installed)
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Add background_tasks table and DB helpers
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/db.py`
|
||||
- Test: `tests/test_db.py`
|
||||
|
||||
### Step 1: Write the failing tests
|
||||
|
||||
Add to `tests/test_db.py`:
|
||||
|
||||
```python
|
||||
# ── background_tasks tests ────────────────────────────────────────────────────
|
||||
|
||||
def test_init_db_creates_background_tasks_table(tmp_path):
|
||||
"""init_db creates a background_tasks table."""
|
||||
from scripts.db import init_db
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
import sqlite3
|
||||
conn = sqlite3.connect(db_path)
|
||||
cur = conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'"
|
||||
)
|
||||
assert cur.fetchone() is not None
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_insert_task_returns_id_and_true(tmp_path):
|
||||
"""insert_task returns (task_id, True) for a new task."""
|
||||
from scripts.db import init_db, insert_job, insert_task
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
task_id, is_new = insert_task(db_path, "cover_letter", job_id)
|
||||
assert isinstance(task_id, int) and task_id > 0
|
||||
assert is_new is True
|
||||
|
||||
|
||||
def test_insert_task_deduplicates_active_task(tmp_path):
|
||||
"""insert_task returns (existing_id, False) if a queued/running task already exists."""
|
||||
from scripts.db import init_db, insert_job, insert_task
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
first_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
second_id, is_new = insert_task(db_path, "cover_letter", job_id)
|
||||
assert second_id == first_id
|
||||
assert is_new is False
|
||||
|
||||
|
||||
def test_insert_task_allows_different_types_same_job(tmp_path):
|
||||
"""insert_task allows cover_letter and company_research for the same job concurrently."""
|
||||
from scripts.db import init_db, insert_job, insert_task
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
_, cl_new = insert_task(db_path, "cover_letter", job_id)
|
||||
_, res_new = insert_task(db_path, "company_research", job_id)
|
||||
assert cl_new is True
|
||||
assert res_new is True
|
||||
|
||||
|
||||
def test_update_task_status_running(tmp_path):
|
||||
"""update_task_status('running') sets started_at."""
|
||||
from scripts.db import init_db, insert_job, insert_task, update_task_status
|
||||
import sqlite3
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
task_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
update_task_status(db_path, task_id, "running")
|
||||
conn = sqlite3.connect(db_path)
|
||||
row = conn.execute("SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,)).fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "running"
|
||||
assert row[1] is not None
|
||||
|
||||
|
||||
def test_update_task_status_completed(tmp_path):
|
||||
"""update_task_status('completed') sets finished_at."""
|
||||
from scripts.db import init_db, insert_job, insert_task, update_task_status
|
||||
import sqlite3
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
task_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
update_task_status(db_path, task_id, "completed")
|
||||
conn = sqlite3.connect(db_path)
|
||||
row = conn.execute("SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "completed"
|
||||
assert row[1] is not None
|
||||
|
||||
|
||||
def test_update_task_status_failed_stores_error(tmp_path):
|
||||
"""update_task_status('failed') stores error message and sets finished_at."""
|
||||
from scripts.db import init_db, insert_job, insert_task, update_task_status
|
||||
import sqlite3
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
task_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
update_task_status(db_path, task_id, "failed", error="LLM timeout")
|
||||
conn = sqlite3.connect(db_path)
|
||||
row = conn.execute("SELECT status, error, finished_at FROM background_tasks WHERE id=?", (task_id,)).fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "failed"
|
||||
assert row[1] == "LLM timeout"
|
||||
assert row[2] is not None
|
||||
|
||||
|
||||
def test_get_active_tasks_returns_only_active(tmp_path):
|
||||
"""get_active_tasks returns only queued/running tasks with job info joined."""
|
||||
from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
active_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
done_id, _ = insert_task(db_path, "company_research", job_id)
|
||||
update_task_status(db_path, done_id, "completed")
|
||||
|
||||
tasks = get_active_tasks(db_path)
|
||||
assert len(tasks) == 1
|
||||
assert tasks[0]["id"] == active_id
|
||||
assert tasks[0]["company"] == "Acme"
|
||||
assert tasks[0]["title"] == "CSM"
|
||||
|
||||
|
||||
def test_get_task_for_job_returns_latest(tmp_path):
|
||||
"""get_task_for_job returns the most recent task for the given type+job."""
|
||||
from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
first_id, _ = insert_task(db_path, "cover_letter", job_id)
|
||||
update_task_status(db_path, first_id, "completed")
|
||||
second_id, _ = insert_task(db_path, "cover_letter", job_id) # allowed since first is done
|
||||
|
||||
task = get_task_for_job(db_path, "cover_letter", job_id)
|
||||
assert task is not None
|
||||
assert task["id"] == second_id
|
||||
|
||||
|
||||
def test_get_task_for_job_returns_none_when_absent(tmp_path):
|
||||
"""get_task_for_job returns None when no task exists for that job+type."""
|
||||
from scripts.db import init_db, insert_job, get_task_for_job
|
||||
db_path = tmp_path / "test.db"
|
||||
init_db(db_path)
|
||||
job_id = insert_job(db_path, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||
})
|
||||
assert get_task_for_job(db_path, "cover_letter", job_id) is None
|
||||
```
|
||||
|
||||
### Step 2: Run tests to verify they fail
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v -k "background_tasks or insert_task or update_task_status or get_active_tasks or get_task_for_job"
|
||||
```
|
||||
|
||||
Expected: FAIL with `ImportError: cannot import name 'insert_task'`
|
||||
|
||||
### Step 3: Implement in scripts/db.py
|
||||
|
||||
Add the DDL constant after `CREATE_COMPANY_RESEARCH`:
|
||||
|
||||
```python
|
||||
CREATE_BACKGROUND_TASKS = """
|
||||
CREATE TABLE IF NOT EXISTS background_tasks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
task_type TEXT NOT NULL,
|
||||
job_id INTEGER NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'queued',
|
||||
error TEXT,
|
||||
created_at DATETIME DEFAULT (datetime('now')),
|
||||
started_at DATETIME,
|
||||
finished_at DATETIME
|
||||
)
|
||||
"""
|
||||
```
|
||||
|
||||
Add `conn.execute(CREATE_BACKGROUND_TASKS)` inside `init_db()`, after the existing three `conn.execute()` calls:
|
||||
|
||||
```python
|
||||
def init_db(db_path: Path = DEFAULT_DB) -> None:
|
||||
"""Create tables if they don't exist, then run migrations."""
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.execute(CREATE_JOBS)
|
||||
conn.execute(CREATE_JOB_CONTACTS)
|
||||
conn.execute(CREATE_COMPANY_RESEARCH)
|
||||
conn.execute(CREATE_BACKGROUND_TASKS) # ← add this line
|
||||
conn.commit()
|
||||
conn.close()
|
||||
_migrate_db(db_path)
|
||||
```
|
||||
|
||||
Add the four helper functions at the end of `scripts/db.py`:
|
||||
|
||||
```python
|
||||
# ── Background task helpers ───────────────────────────────────────────────────
|
||||
|
||||
def insert_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: Optional[int] = None) -> tuple[int, bool]:
    """Insert a new background task, deduplicating against in-flight work.

    Args:
        db_path: Path to the SQLite database file.
        task_type: Task discriminator, e.g. "cover_letter" or "company_research".
        job_id: Row id in the jobs table this task belongs to.

    Returns:
        (task_id, True) if a new row was inserted, or (existing_id, False)
        if a queued/running task for the same (task_type, job_id) already
        exists.
    """
    conn = sqlite3.connect(db_path)
    try:
        # BEGIN IMMEDIATE takes the write lock up front so the
        # check-then-insert below is atomic across threads/processes.
        # Without it, two concurrent submitters could both pass the SELECT
        # and insert duplicate in-flight tasks.
        conn.execute("BEGIN IMMEDIATE")
        existing = conn.execute(
            "SELECT id FROM background_tasks"
            " WHERE task_type=? AND job_id=? AND status IN ('queued','running')",
            (task_type, job_id),
        ).fetchone()
        if existing:
            conn.rollback()
            return existing[0], False
        cur = conn.execute(
            "INSERT INTO background_tasks (task_type, job_id, status) VALUES (?, ?, 'queued')",
            (task_type, job_id),
        )
        task_id = cur.lastrowid
        conn.commit()
        return task_id, True
    finally:
        # Always release the handle — this runs on UI/worker threads.
        conn.close()
|
||||
|
||||
|
||||
def update_task_status(db_path: Path = DEFAULT_DB, task_id: Optional[int] = None,
                       status: str = "", error: Optional[str] = None) -> None:
    """Update a task's status and stamp the matching timestamp column.

    'running' sets started_at; 'completed' and 'failed' set finished_at
    and record *error* (None clears any previous error). Any other status
    value updates only the status column.
    """
    # Minute resolution ("YYYY-MM-DDTHH:MM") matches the timestamp style
    # used for other datetime columns in this database.
    now = datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        if status == "running":
            conn.execute(
                "UPDATE background_tasks SET status=?, started_at=? WHERE id=?",
                (status, now, task_id),
            )
        elif status in ("completed", "failed"):
            conn.execute(
                "UPDATE background_tasks SET status=?, finished_at=?, error=? WHERE id=?",
                (status, now, error, task_id),
            )
        else:
            conn.execute("UPDATE background_tasks SET status=? WHERE id=?", (status, task_id))
        conn.commit()
    finally:
        # Close even if the UPDATE raises, so worker threads never leak handles.
        conn.close()
|
||||
|
||||
|
||||
def get_active_tasks(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return all queued/running tasks, oldest first, with job info joined.

    Each dict carries the background_tasks columns plus the job's 'title'
    and 'company'. The LEFT JOIN keeps orphaned tasks visible: if the job
    row was deleted, title/company come back as None.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute("""
            SELECT bt.*, j.title, j.company
            FROM background_tasks bt
            LEFT JOIN jobs j ON j.id = bt.job_id
            WHERE bt.status IN ('queued', 'running')
            ORDER BY bt.created_at ASC
        """).fetchall()
        return [dict(r) for r in rows]
    finally:
        # Guarantee the handle is released even if the query raises.
        conn.close()
|
||||
|
||||
|
||||
def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "",
                     job_id: Optional[int] = None) -> Optional[dict]:
    """Return the most recent task row for a (task_type, job_id) pair, or None.

    "Most recent" is by rowid (ORDER BY id DESC), which is insertion order —
    more reliable than created_at, whose minute resolution allows ties.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute(
            """SELECT * FROM background_tasks
               WHERE task_type=? AND job_id=?
               ORDER BY id DESC LIMIT 1""",
            (task_type, job_id),
        ).fetchone()
        return dict(row) if row else None
    finally:
        # Guarantee the handle is released even if the query raises.
        conn.close()
|
||||
```
|
||||
|
||||
### Step 4: Run tests to verify they pass
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v -k "background_tasks or insert_task or update_task_status or get_active_tasks or get_task_for_job"
|
||||
```
|
||||
|
||||
Expected: all new tests PASS, no regressions
|
||||
|
||||
### Step 5: Run full test suite
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
```
|
||||
|
||||
Expected: all tests PASS
|
||||
|
||||
### Step 6: Commit
|
||||
|
||||
```bash
|
||||
git add scripts/db.py tests/test_db.py
|
||||
git commit -m "feat: add background_tasks table and DB helpers"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Create scripts/task_runner.py
|
||||
|
||||
**Files:**
|
||||
- Create: `scripts/task_runner.py`
|
||||
- Test: `tests/test_task_runner.py`
|
||||
|
||||
### Step 1: Write the failing tests
|
||||
|
||||
Create `tests/test_task_runner.py`:
|
||||
|
||||
```python
|
||||
import threading
|
||||
import time
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
import sqlite3
|
||||
|
||||
|
||||
def _make_db(tmp_path):
|
||||
from scripts.db import init_db, insert_job
|
||||
db = tmp_path / "test.db"
|
||||
init_db(db)
|
||||
job_id = insert_job(db, {
|
||||
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||
"salary": "", "description": "Great role.", "date_found": "2026-02-20",
|
||||
})
|
||||
return db, job_id
|
||||
|
||||
|
||||
def test_submit_task_returns_id_and_true(tmp_path):
|
||||
"""submit_task returns (task_id, True) and spawns a thread."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
with patch("scripts.task_runner._run_task"): # don't actually call LLM
|
||||
from scripts.task_runner import submit_task
|
||||
task_id, is_new = submit_task(db, "cover_letter", job_id)
|
||||
assert isinstance(task_id, int) and task_id > 0
|
||||
assert is_new is True
|
||||
|
||||
|
||||
def test_submit_task_deduplicates(tmp_path):
|
||||
"""submit_task returns (existing_id, False) for a duplicate in-flight task."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
with patch("scripts.task_runner._run_task"):
|
||||
from scripts.task_runner import submit_task
|
||||
first_id, _ = submit_task(db, "cover_letter", job_id)
|
||||
second_id, is_new = submit_task(db, "cover_letter", job_id)
|
||||
assert second_id == first_id
|
||||
assert is_new is False
|
||||
|
||||
|
||||
def test_run_task_cover_letter_success(tmp_path):
|
||||
"""_run_task marks running→completed and saves cover letter to DB."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
from scripts.db import insert_task, get_task_for_job, get_jobs_by_status
|
||||
task_id, _ = insert_task(db, "cover_letter", job_id)
|
||||
|
||||
with patch("scripts.generate_cover_letter.generate", return_value="Dear Hiring Manager,\nGreat fit!"):
|
||||
from scripts.task_runner import _run_task
|
||||
_run_task(db, task_id, "cover_letter", job_id)
|
||||
|
||||
task = get_task_for_job(db, "cover_letter", job_id)
|
||||
assert task["status"] == "completed"
|
||||
assert task["error"] is None
|
||||
|
||||
conn = sqlite3.connect(db)
|
||||
row = conn.execute("SELECT cover_letter FROM jobs WHERE id=?", (job_id,)).fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "Dear Hiring Manager,\nGreat fit!"
|
||||
|
||||
|
||||
def test_run_task_company_research_success(tmp_path):
|
||||
"""_run_task marks running→completed and saves research to DB."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
from scripts.db import insert_task, get_task_for_job, get_research
|
||||
|
||||
task_id, _ = insert_task(db, "company_research", job_id)
|
||||
fake_result = {
|
||||
"raw_output": "raw", "company_brief": "brief",
|
||||
"ceo_brief": "ceo", "talking_points": "points",
|
||||
}
|
||||
with patch("scripts.company_research.research_company", return_value=fake_result):
|
||||
from scripts.task_runner import _run_task
|
||||
_run_task(db, task_id, "company_research", job_id)
|
||||
|
||||
task = get_task_for_job(db, "company_research", job_id)
|
||||
assert task["status"] == "completed"
|
||||
|
||||
research = get_research(db, job_id=job_id)
|
||||
assert research["company_brief"] == "brief"
|
||||
|
||||
|
||||
def test_run_task_marks_failed_on_exception(tmp_path):
|
||||
"""_run_task marks status=failed and stores error when generator raises."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
from scripts.db import insert_task, get_task_for_job
|
||||
task_id, _ = insert_task(db, "cover_letter", job_id)
|
||||
|
||||
with patch("scripts.generate_cover_letter.generate", side_effect=RuntimeError("LLM timeout")):
|
||||
from scripts.task_runner import _run_task
|
||||
_run_task(db, task_id, "cover_letter", job_id)
|
||||
|
||||
task = get_task_for_job(db, "cover_letter", job_id)
|
||||
assert task["status"] == "failed"
|
||||
assert "LLM timeout" in task["error"]
|
||||
|
||||
|
||||
def test_submit_task_actually_completes(tmp_path):
|
||||
"""Integration: submit_task spawns a thread that completes asynchronously."""
|
||||
db, job_id = _make_db(tmp_path)
|
||||
from scripts.db import get_task_for_job
|
||||
|
||||
with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"):
|
||||
from scripts.task_runner import submit_task
|
||||
task_id, _ = submit_task(db, "cover_letter", job_id)
|
||||
# Wait for thread to complete (max 5s)
|
||||
for _ in range(50):
|
||||
task = get_task_for_job(db, "cover_letter", job_id)
|
||||
if task and task["status"] in ("completed", "failed"):
|
||||
break
|
||||
time.sleep(0.1)
|
||||
|
||||
task = get_task_for_job(db, "cover_letter", job_id)
|
||||
assert task["status"] == "completed"
|
||||
```
|
||||
|
||||
### Step 2: Run tests to verify they fail
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py -v
|
||||
```
|
||||
|
||||
Expected: FAIL with `ModuleNotFoundError: No module named 'scripts.task_runner'`
|
||||
|
||||
### Step 3: Implement scripts/task_runner.py
|
||||
|
||||
Create `scripts/task_runner.py`:
|
||||
|
||||
```python
|
||||
# scripts/task_runner.py
|
||||
"""
|
||||
Background task runner for LLM generation tasks.
|
||||
|
||||
Submitting a task inserts a row in background_tasks and spawns a daemon thread.
|
||||
The thread calls the appropriate generator, writes results to existing tables,
|
||||
and marks the task completed or failed.
|
||||
|
||||
Deduplication: only one queued/running task per (task_type, job_id) is allowed.
|
||||
Different task types for the same job run concurrently (e.g. cover letter + research).
|
||||
"""
|
||||
import sqlite3
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.db import (
|
||||
DEFAULT_DB,
|
||||
insert_task,
|
||||
update_task_status,
|
||||
update_cover_letter,
|
||||
save_research,
|
||||
)
|
||||
|
||||
|
||||
def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: int = None) -> tuple[int, bool]:
    """Queue a background LLM task and start a worker thread for it.

    Returns (task_id, True) when a new task was queued and a daemon
    thread spawned, or (existing_id, False) when an identical task is
    already queued or running (deduplicated by insert_task).
    """
    task_id, is_new = insert_task(db_path, task_type, job_id)
    if not is_new:
        # An identical task is already in flight — hand back its id.
        return task_id, False

    worker = threading.Thread(
        target=_run_task,
        args=(db_path, task_id, task_type, job_id),
        daemon=True,  # must not block interpreter shutdown
    )
    worker.start()
    return task_id, True
|
||||
|
||||
|
||||
def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int) -> None:
    """Thread body: load the job, run the generator, persist the result.

    Marks the task running before generation, then completed on success.
    Any exception from the generator is caught and recorded on the task
    row as status='failed' — a worker thread must never propagate.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        # Release the handle even if the SELECT raises (e.g. missing table).
        conn.close()
    if row is None:
        update_task_status(db_path, task_id, "failed", error=f"Job {job_id} not found")
        return

    job = dict(row)
    update_task_status(db_path, task_id, "running")

    try:
        if task_type == "cover_letter":
            # Imported lazily so module import stays cheap and tests can
            # patch scripts.generate_cover_letter.generate before the call.
            from scripts.generate_cover_letter import generate
            result = generate(
                job.get("title", ""),
                job.get("company", ""),
                job.get("description", ""),
            )
            update_cover_letter(db_path, job_id, result)

        elif task_type == "company_research":
            from scripts.company_research import research_company
            result = research_company(job)
            save_research(db_path, job_id=job_id, **result)

        else:
            raise ValueError(f"Unknown task_type: {task_type!r}")

        update_task_status(db_path, task_id, "completed")

    except Exception as exc:  # deliberate catch-all boundary for the thread
        update_task_status(db_path, task_id, "failed", error=str(exc))
|
||||
```
|
||||
|
||||
### Step 4: Run tests to verify they pass
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_task_runner.py -v
|
||||
```
|
||||
|
||||
Expected: all tests PASS
|
||||
|
||||
### Step 5: Run full test suite
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
```
|
||||
|
||||
Expected: all tests PASS
|
||||
|
||||
### Step 6: Commit
|
||||
|
||||
```bash
|
||||
git add scripts/task_runner.py tests/test_task_runner.py
|
||||
git commit -m "feat: add task_runner — background thread executor for LLM tasks"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Add sidebar task indicator to app/app.py
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/app.py`
|
||||
|
||||
No new tests needed — this is pure UI wiring.
|
||||
|
||||
### Step 1: Replace the contents of app/app.py
|
||||
|
||||
Current file is 33 lines. Replace entirely with:
|
||||
|
||||
```python
|
||||
# app/app.py
|
||||
"""
|
||||
Streamlit entry point — uses st.navigation() to control the sidebar.
|
||||
Main workflow pages are listed at the top; Settings is separated into
|
||||
a "System" section so it doesn't crowd the navigation.
|
||||
|
||||
Run: streamlit run app/app.py
|
||||
bash scripts/manage-ui.sh start
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import streamlit as st
|
||||
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
||||
|
||||
st.set_page_config(
|
||||
page_title="Job Seeker",
|
||||
page_icon="💼",
|
||||
layout="wide",
|
||||
)
|
||||
|
||||
init_db(DEFAULT_DB)
|
||||
|
||||
# ── Background task sidebar indicator ─────────────────────────────────────────
|
||||
@st.fragment(run_every=3)
|
||||
def _task_sidebar() -> None:
|
||||
tasks = get_active_tasks(DEFAULT_DB)
|
||||
if not tasks:
|
||||
return
|
||||
with st.sidebar:
|
||||
st.divider()
|
||||
st.markdown(f"**⏳ {len(tasks)} task(s) running**")
|
||||
for t in tasks:
|
||||
icon = "⏳" if t["status"] == "running" else "🕐"
|
||||
label = "Cover letter" if t["task_type"] == "cover_letter" else "Research"
|
||||
st.caption(f"{icon} {label} — {t.get('company') or 'unknown'}")
|
||||
|
||||
_task_sidebar()
|
||||
|
||||
# ── Navigation ─────────────────────────────────────────────────────────────────
|
||||
pages = {
|
||||
"": [
|
||||
st.Page("Home.py", title="Home", icon="🏠"),
|
||||
st.Page("pages/1_Job_Review.py", title="Job Review", icon="📋"),
|
||||
st.Page("pages/4_Apply.py", title="Apply Workspace", icon="🚀"),
|
||||
st.Page("pages/5_Interviews.py", title="Interviews", icon="🎯"),
|
||||
st.Page("pages/6_Interview_Prep.py", title="Interview Prep", icon="📞"),
|
||||
],
|
||||
"System": [
|
||||
st.Page("pages/2_Settings.py", title="Settings", icon="⚙️"),
|
||||
],
|
||||
}
|
||||
|
||||
pg = st.navigation(pages)
|
||||
pg.run()
|
||||
```
|
||||
|
||||
### Step 2: Smoke-test by running the UI
|
||||
|
||||
```bash
|
||||
bash /devl/job-seeker/scripts/manage-ui.sh restart
|
||||
```
|
||||
|
||||
Navigate to http://localhost:8501 and confirm the app loads without error. The sidebar task indicator does not appear when no tasks are running (correct).
|
||||
|
||||
### Step 3: Commit
|
||||
|
||||
```bash
|
||||
git add app/app.py
|
||||
git commit -m "feat: sidebar background task indicator with 3s auto-refresh"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Update 4_Apply.py to use background generation
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/pages/4_Apply.py`
|
||||
|
||||
No new unit tests — covered by existing test suite for DB layer. Smoke-test in browser.
|
||||
|
||||
### Step 1: Add imports at the top of 4_Apply.py
|
||||
|
||||
After the existing imports block (after `from scripts.db import ...`), add:
|
||||
|
||||
```python
|
||||
from scripts.db import get_task_for_job
|
||||
from scripts.task_runner import submit_task
|
||||
```
|
||||
|
||||
So the full import block becomes:
|
||||
|
||||
```python
|
||||
from scripts.db import (
|
||||
DEFAULT_DB, init_db, get_jobs_by_status,
|
||||
update_cover_letter, mark_applied,
|
||||
get_task_for_job,
|
||||
)
|
||||
from scripts.task_runner import submit_task
|
||||
```
|
||||
|
||||
### Step 2: Replace the Generate button section
|
||||
|
||||
Find this block (around line 174–185):
|
||||
|
||||
```python
|
||||
if st.button("✨ Generate / Regenerate", use_container_width=True):
|
||||
with st.spinner("Generating via LLM…"):
|
||||
try:
|
||||
from scripts.generate_cover_letter import generate as _gen
|
||||
st.session_state[_cl_key] = _gen(
|
||||
job.get("title", ""),
|
||||
job.get("company", ""),
|
||||
job.get("description", ""),
|
||||
)
|
||||
st.rerun()
|
||||
except Exception as e:
|
||||
st.error(f"Generation failed: {e}")
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```python
|
||||
_cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||
_cl_running = _cl_task and _cl_task["status"] in ("queued", "running")
|
||||
|
||||
if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)):
|
||||
submit_task(DEFAULT_DB, "cover_letter", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _cl_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _cl_status_fragment():
|
||||
t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…"
|
||||
st.info(f"⏳ {lbl}")
|
||||
else:
|
||||
st.rerun() # full page rerun — reloads cover letter from DB
|
||||
_cl_status_fragment()
|
||||
elif _cl_task and _cl_task["status"] == "failed":
|
||||
st.error(f"Generation failed: {_cl_task.get('error', 'unknown error')}")
|
||||
```
|
||||
|
||||
Also update the session-state initialiser just below (line 171–172) so it loads from DB after background completion. The existing code already does this correctly:
|
||||
|
||||
```python
|
||||
if _cl_key not in st.session_state:
|
||||
st.session_state[_cl_key] = job.get("cover_letter") or ""
|
||||
```
|
||||
|
||||
This is fine — `job` is fetched fresh on each full-page rerun, so when the background thread writes to `jobs.cover_letter`, the next full rerun picks it up.
|
||||
|
||||
### Step 3: Smoke-test in browser
|
||||
|
||||
1. Navigate to Apply Workspace
|
||||
2. Select an approved job
|
||||
3. Click "Generate / Regenerate"
|
||||
4. Navigate away to Home
|
||||
5. Navigate back to Apply Workspace for the same job
|
||||
6. Observe: button is disabled and "⏳ Generating via LLM…" shows while running; cover letter appears when done
|
||||
|
||||
### Step 4: Commit
|
||||
|
||||
```bash
|
||||
git add app/pages/4_Apply.py
|
||||
git commit -m "feat: cover letter generation runs in background, survives navigation"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 5: Update 6_Interview_Prep.py to use background research
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/pages/6_Interview_Prep.py`
|
||||
|
||||
### Step 1: Add imports at the top of 6_Interview_Prep.py
|
||||
|
||||
After the existing `from scripts.db import (...)` block, add:
|
||||
|
||||
```python
|
||||
from scripts.db import get_task_for_job
|
||||
from scripts.task_runner import submit_task
|
||||
```
|
||||
|
||||
So the full import block becomes:
|
||||
|
||||
```python
|
||||
from scripts.db import (
|
||||
DEFAULT_DB, init_db,
|
||||
get_interview_jobs, get_contacts, get_research,
|
||||
save_research, get_task_for_job,
|
||||
)
|
||||
from scripts.task_runner import submit_task
|
||||
```
|
||||
|
||||
### Step 2: Replace the "no research yet" generate button block
|
||||
|
||||
Find this block (around line 99–111):
|
||||
|
||||
```python
|
||||
if not research:
|
||||
st.warning("No research brief yet for this job.")
|
||||
if st.button("🔬 Generate research brief", type="primary", use_container_width=True):
|
||||
with st.spinner("Generating… this may take 30–60 seconds"):
|
||||
try:
|
||||
from scripts.company_research import research_company
|
||||
result = research_company(job)
|
||||
save_research(DEFAULT_DB, job_id=selected_id, **result)
|
||||
st.success("Done!")
|
||||
st.rerun()
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
st.stop()
|
||||
else:
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```python
|
||||
_res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
_res_running = _res_task and _res_task["status"] in ("queued", "running")
|
||||
|
||||
if not research:
|
||||
if not _res_running:
|
||||
st.warning("No research brief yet for this job.")
|
||||
if _res_task and _res_task["status"] == "failed":
|
||||
st.error(f"Last attempt failed: {_res_task.get('error', '')}")
|
||||
if st.button("🔬 Generate research brief", type="primary", use_container_width=True):
|
||||
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _res_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _res_status_initial():
|
||||
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
lbl = "Queued…" if t["status"] == "queued" else "Generating… this may take 30–60 seconds"
|
||||
st.info(f"⏳ {lbl}")
|
||||
else:
|
||||
st.rerun()
|
||||
_res_status_initial()
|
||||
|
||||
st.stop()
|
||||
else:
|
||||
```
|
||||
|
||||
### Step 3: Replace the "refresh" button block
|
||||
|
||||
Find this block (around line 113–124):
|
||||
|
||||
```python
|
||||
generated_at = research.get("generated_at", "")
|
||||
col_ts, col_btn = st.columns([3, 1])
|
||||
col_ts.caption(f"Research generated: {generated_at}")
|
||||
if col_btn.button("🔄 Refresh", use_container_width=True):
|
||||
with st.spinner("Refreshing…"):
|
||||
try:
|
||||
from scripts.company_research import research_company
|
||||
result = research_company(job)
|
||||
save_research(DEFAULT_DB, job_id=selected_id, **result)
|
||||
st.rerun()
|
||||
except Exception as e:
|
||||
st.error(f"Error: {e}")
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```python
|
||||
generated_at = research.get("generated_at", "")
|
||||
col_ts, col_btn = st.columns([3, 1])
|
||||
col_ts.caption(f"Research generated: {generated_at}")
|
||||
if col_btn.button("🔄 Refresh", use_container_width=True, disabled=bool(_res_running)):
|
||||
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||
st.rerun()
|
||||
|
||||
if _res_running:
|
||||
@st.fragment(run_every=3)
|
||||
def _res_status_refresh():
|
||||
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||
if t and t["status"] in ("queued", "running"):
|
||||
lbl = "Queued…" if t["status"] == "queued" else "Refreshing research…"
|
||||
st.info(f"⏳ {lbl}")
|
||||
else:
|
||||
st.rerun()
|
||||
_res_status_refresh()
|
||||
elif _res_task and _res_task["status"] == "failed":
|
||||
st.error(f"Refresh failed: {_res_task.get('error', '')}")
|
||||
```
|
||||
|
||||
### Step 4: Smoke-test in browser
|
||||
|
||||
1. Move a job to Phone Screen on the Interviews page
|
||||
2. Navigate to Interview Prep, select that job
|
||||
3. Click "Generate research brief"
|
||||
4. Navigate away to Home
|
||||
5. Navigate back — observe "⏳ Generating…" inline indicator
|
||||
6. Wait for completion — research sections populate automatically
|
||||
|
||||
### Step 5: Run full test suite one final time
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
```
|
||||
|
||||
Expected: all tests PASS
|
||||
|
||||
### Step 6: Commit
|
||||
|
||||
```bash
|
||||
git add app/pages/6_Interview_Prep.py
|
||||
git commit -m "feat: company research generation runs in background, survives navigation"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Summary of Changes
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `scripts/db.py` | Add `CREATE_BACKGROUND_TASKS`, `init_db` call, 4 new helpers |
|
||||
| `scripts/task_runner.py` | New file — `submit_task` + `_run_task` thread body |
|
||||
| `app/app.py` | Add `_task_sidebar` fragment with 3s auto-refresh |
|
||||
| `app/pages/4_Apply.py` | Generate button → `submit_task`; inline status fragment |
|
||||
| `app/pages/6_Interview_Prep.py` | Generate/Refresh buttons → `submit_task`; inline status fragments |
|
||||
| `tests/test_db.py` | 9 new tests for background_tasks helpers |
|
||||
| `tests/test_task_runner.py` | New file — 6 tests for task_runner |
|
||||
|
|
@ -1,91 +0,0 @@
|
|||
# Email Handling Design
|
||||
|
||||
**Date:** 2026-02-21
|
||||
**Status:** Approved
|
||||
|
||||
## Problem
|
||||
|
||||
IMAP sync already pulls emails for active pipeline jobs, but two gaps exist:
|
||||
1. Inbound emails suggesting a stage change (e.g. "let's schedule a call") produce no signal — the recruiter's message just sits in the email log.
|
||||
2. Recruiter outreach to email addresses not yet in the pipeline is invisible — those leads never enter Job Review.
|
||||
|
||||
## Goals
|
||||
|
||||
- Surface stage-change suggestions inline on the Interviews kanban card (suggest-only, never auto-advance).
|
||||
- Capture recruiter leads from unmatched inbound email and surface them in Job Review.
|
||||
- Make email sync a background task triggerable from the UI (Home page + Interviews sidebar).
|
||||
|
||||
## Data Model
|
||||
|
||||
**No new tables.** Two columns added to `job_contacts`:
|
||||
|
||||
```sql
|
||||
ALTER TABLE job_contacts ADD COLUMN stage_signal TEXT;
|
||||
ALTER TABLE job_contacts ADD COLUMN suggestion_dismissed INTEGER DEFAULT 0;
|
||||
```
|
||||
|
||||
- `stage_signal` — one of: `interview_scheduled`, `offer_received`, `rejected`, `positive_response`, `neutral` (or NULL if not yet classified).
|
||||
- `suggestion_dismissed` — 1 when the user clicks Dismiss; prevents the banner re-appearing.
|
||||
|
||||
Email leads reuse the existing `jobs` table with `source = 'email'` and `status = 'pending'`. No new columns needed.
|
||||
|
||||
## Components
|
||||
|
||||
### 1. Stage Signal Classification (`scripts/imap_sync.py`)
|
||||
|
||||
After saving each **inbound** contact row, call `phi3:mini` via Ollama to classify the email into one of the five labels. Store the result in `stage_signal`. If classification fails, default to `NULL` (no suggestion shown).
|
||||
|
||||
**Model:** `phi3:mini` via `LLMRouter.complete(model_override="phi3:mini", fallback_order=["ollama_research"])`.
|
||||
Benchmarked at 100% accuracy / 3.0 s per email on a 12-case test suite. The runner-up, Qwen2.5-3B, went untested, so phi3-mini is the safe choice.
|
||||
|
||||
### 2. Recruiter Lead Extraction (`scripts/imap_sync.py`)
|
||||
|
||||
A second pass after per-job sync: scan INBOX broadly for recruitment-keyword emails that don't match any known pipeline company. For each unmatched email, call **Nemotron 1.5B** (already in use for company research) to extract `{company, title}`. If extraction returns a company name not already in the DB, insert a new job row `source='email', status='pending'`.
|
||||
|
||||
**Dedup:** checked by `message_id` against all known contacts (cross-job), plus `url` uniqueness on the jobs table (the email lead URL is set to a synthetic `email://<from_domain>/<message_id>` value).
|
||||
|
||||
### 3. Background Task (`scripts/task_runner.py`)
|
||||
|
||||
New task type: `email_sync` with `job_id = 0`.
|
||||
`submit_task(db, "email_sync", 0)` → daemon thread → `sync_all()` → returns summary via task `error` field.
|
||||
|
||||
Deduplication: only one `email_sync` can be queued/running at a time (existing insert_task logic handles this).
|
||||
|
||||
### 4. UI — Sync Button (Home + Interviews)
|
||||
|
||||
**Home.py:** New "Sync Emails" section alongside Find Jobs / Score / Notion sync.
|
||||
**5_Interviews.py:** Existing sync button already present in sidebar; convert from synchronous `sync_all()` call to `submit_task()` + fragment polling.
|
||||
|
||||
### 5. UI — Email Leads (Job Review)
|
||||
|
||||
When `show_status == "pending"`, prepend email leads (`source = 'email'`) at the top of the list with a distinct `📧 Email Lead` badge. Actions are identical to scraped pending jobs (Approve / Reject).
|
||||
|
||||
### 6. UI — Stage Suggestion Banner (Interviews Kanban)
|
||||
|
||||
Inside `_render_card()`, before the advance/reject buttons, check for unseen stage signals:
|
||||
|
||||
```
|
||||
💡 Email suggests: interview_scheduled
|
||||
From: sarah@company.com · "Let's book a call"
|
||||
[→ Move to Phone Screen] [Dismiss]
|
||||
```
|
||||
|
||||
- "Move" calls `advance_to_stage()` + `submit_task("company_research")` then reruns.
|
||||
- "Dismiss" calls `dismiss_stage_signal(contact_id)` then reruns.
|
||||
- Only the most recent undismissed signal is shown per card.
|
||||
|
||||
## Error Handling
|
||||
|
||||
| Failure | Behaviour |
|
||||
|---------|-----------|
|
||||
| IMAP connection fails | Error stored in task `error` field; shown as warning in UI after sync |
|
||||
| Classifier call fails | `stage_signal` left NULL; no suggestion shown; sync continues |
|
||||
| Lead extractor fails | Email skipped; appended to `result["errors"]`; sync continues |
|
||||
| Duplicate `email_sync` task | `insert_task` returns existing id; no new thread spawned |
|
||||
| LLM extraction returns no company | Email silently skipped (not a lead) |
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- Auto-advancing pipeline stage (suggest only).
|
||||
- Sending email replies from the app (draft helper already exists).
|
||||
- OAuth / token-refresh IMAP (config/email.yaml credentials only).
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,187 +0,0 @@
|
|||
# Research Workflow Redesign
|
||||
|
||||
**Date:** 2026-02-22
|
||||
**Status:** Approved
|
||||
|
||||
## Problem
|
||||
|
||||
The current `company_research.py` produces shallow output:
|
||||
- Resume context is a hardcoded 2-sentence blurb — talking points aren't grounded in Meghan's actual experience
|
||||
- Search coverage is limited: CEO, HQ, LinkedIn, one generic news query
|
||||
- Output has 4 sections; new data categories (tech stack, funding, culture, competitors) have nowhere to go
|
||||
- No skills/keyword config to drive experience matching against the JD
|
||||
|
||||
## Approach: Query Expansion + Parallel JSON Searches + Single LLM Pass
|
||||
|
||||
Run all searches (companyScraper sequential + new parallel SearXNG JSON queries), aggregate into a structured context block, pre-select resume experiences by keyword score, single LLM call produces all expanded sections.
|
||||
|
||||
---
|
||||
|
||||
## Design
|
||||
|
||||
### 1. Search Pipeline
|
||||
|
||||
**Phase 1 — companyScraper (unchanged, sequential)**
|
||||
- CEO name, HQ address, LinkedIn URL
|
||||
|
||||
**Phase 1b — Parallel SearXNG JSON queries (new/expanded)**
|
||||
|
||||
Six queries run concurrently via daemon threads:
|
||||
|
||||
| Intent | Query pattern |
|
||||
|---|---|
|
||||
| Recent news/press | `"{company}" news 2025 2026` |
|
||||
| Funding & investors | `"{company}" funding round investors Series valuation` |
|
||||
| Tech stack | `"{company}" tech stack engineering technology platform` |
|
||||
| Competitors | `"{company}" competitors alternatives vs market` |
|
||||
| Culture / Glassdoor | `"{company}" glassdoor culture reviews employees` |
|
||||
| CEO press (if found) | `"{ceo}" "{company}"` |
|
||||
|
||||
Each returns 3–4 deduplicated snippets (title + content + URL), labeled by type.
|
||||
Results are best-effort — any failed query is silently skipped.
|
||||
|
||||
---
|
||||
|
||||
### 2. Resume Matching
|
||||
|
||||
**`config/resume_keywords.yaml`** — three categories, tag-managed via Settings UI:
|
||||
|
||||
```yaml
|
||||
skills:
|
||||
- Customer Success
|
||||
- Technical Account Management
|
||||
- Revenue Operations
|
||||
- Salesforce
|
||||
- Gainsight
|
||||
- data analysis
|
||||
- stakeholder management
|
||||
|
||||
domains:
|
||||
- B2B SaaS
|
||||
- enterprise software
|
||||
- security / compliance
|
||||
- post-sale lifecycle
|
||||
|
||||
keywords:
|
||||
- QBR
|
||||
- churn reduction
|
||||
- NRR / ARR
|
||||
- onboarding
|
||||
- renewal
|
||||
- executive sponsorship
|
||||
- VOC
|
||||
```
|
||||
|
||||
**Matching logic:**
|
||||
1. Case-insensitive substring check of all keywords against JD text → `matched_keywords` list
|
||||
2. Score each experience entry: count of matched keywords appearing in position title + responsibility bullets
|
||||
3. Top 2 by score → included in prompt as full detail (position, company, period, all bullets)
|
||||
4. Remaining entries → condensed one-liners ("Founder @ M3 Consulting, 2023–present")
|
||||
|
||||
**UpGuard NDA rule** (explicit in prompt): reference as "enterprise security vendor" in general; only name UpGuard directly if the role has a strong security/compliance focus.
|
||||
|
||||
---
|
||||
|
||||
### 3. LLM Context Block Structure
|
||||
|
||||
```
|
||||
## Role Context
|
||||
{title} at {company}
|
||||
|
||||
## Job Description
|
||||
{JD text, up to 2500 chars}
|
||||
|
||||
## Meghan's Matched Experience
|
||||
[Top 2 scored experience entries — full detail]
|
||||
|
||||
Also in Meghan's background: [remaining entries as one-liners]
|
||||
|
||||
## Matched Skills & Keywords
|
||||
Skills matching this JD: {matched_keywords joined}
|
||||
|
||||
## Live Company Data
|
||||
- CEO: {name}
|
||||
- HQ: {location}
|
||||
- LinkedIn: {url}
|
||||
|
||||
## News & Press
|
||||
[snippets]
|
||||
|
||||
## Funding & Investors
|
||||
[snippets]
|
||||
|
||||
## Tech Stack
|
||||
[snippets]
|
||||
|
||||
## Competitors
|
||||
[snippets]
|
||||
|
||||
## Culture & Employee Signals
|
||||
[snippets]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Output Sections (7, up from 4)
|
||||
|
||||
| Section header | Purpose |
|
||||
|---|---|
|
||||
| `## Company Overview` | What they do, business model, size/stage, market position |
|
||||
| `## Leadership & Culture` | CEO background, leadership team, philosophy |
|
||||
| `## Tech Stack & Product` | What they build, relevant technology, product direction |
|
||||
| `## Funding & Market Position` | Stage, investors, recent rounds, competitor landscape |
|
||||
| `## Recent Developments` | News, launches, pivots, exec moves |
|
||||
| `## Red Flags & Watch-outs` | Culture issues, layoffs, exec departures, financial stress |
|
||||
| `## Talking Points for Meghan` | 5 role-matched, resume-grounded, UpGuard-aware talking points ready to speak aloud |
|
||||
|
||||
Talking points prompt instructs LLM to: cite the specific matched experience by name, reference matched skills, apply UpGuard NDA rule, frame each as a ready-to-speak sentence.
|
||||
|
||||
---
|
||||
|
||||
### 5. DB Schema Changes
|
||||
|
||||
Add columns to `company_research` table:
|
||||
|
||||
```sql
|
||||
ALTER TABLE company_research ADD COLUMN tech_brief TEXT;
|
||||
ALTER TABLE company_research ADD COLUMN funding_brief TEXT;
|
||||
ALTER TABLE company_research ADD COLUMN competitors_brief TEXT;
|
||||
ALTER TABLE company_research ADD COLUMN red_flags TEXT;
|
||||
```
|
||||
|
||||
Existing columns (`company_brief`, `ceo_brief`, `talking_points`, `raw_output`) unchanged.
|
||||
|
||||
---
|
||||
|
||||
### 6. Settings UI — Skills & Keywords Tab
|
||||
|
||||
New tab in `app/pages/2_Settings.py`:
|
||||
- One expander or subheader per category (Skills, Domains, Keywords)
|
||||
- Tag chips rendered with `st.pills` or columns of `st.badge`-style buttons with ×
|
||||
- Inline text input + Add button per category
|
||||
- Each add/remove saves immediately to `config/resume_keywords.yaml`
|
||||
|
||||
---
|
||||
|
||||
### 7. Interview Prep UI Changes
|
||||
|
||||
`app/pages/6_Interview_Prep.py` — render new sections alongside existing ones:
|
||||
- Tech Stack & Product (new panel)
|
||||
- Funding & Market Position (new panel)
|
||||
- Red Flags & Watch-outs (new panel, visually distinct — e.g. orange/amber)
|
||||
- Talking Points promoted to top (most useful during a live call)
|
||||
|
||||
---
|
||||
|
||||
## Files Affected
|
||||
|
||||
| File | Change |
|
||||
|---|---|
|
||||
| `scripts/company_research.py` | Parallel search queries, resume matching, expanded prompt + sections |
|
||||
| `scripts/db.py` | Add 4 new columns to `company_research`; update `save_research` / `get_research` |
|
||||
| `config/resume_keywords.yaml` | New file |
|
||||
| `config/resume_keywords.yaml.example` | New committed template |
|
||||
| `app/pages/2_Settings.py` | New Skills & Keywords tab |
|
||||
| `app/pages/6_Interview_Prep.py` | Render new sections |
|
||||
| `tests/test_db.py` | Tests for new columns |
|
||||
| `tests/test_company_research.py` | New test file for matching logic + section parsing |
|
||||
|
|
@ -1,869 +0,0 @@
|
|||
# Research Workflow Redesign — Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Expand company research to gather richer web data (funding, tech stack, competitors, culture/Glassdoor, news), match Meghan's resume experience against the JD, and produce a 7-section brief with role-grounded talking points.
|
||||
|
||||
**Architecture:** Parallel SearXNG JSON queries (6 types) feed a structured context block alongside tiered resume experience (top-2 scored full, rest condensed) from `config/resume_keywords.yaml`. Single LLM call produces 7 output sections stored in expanded DB columns.
|
||||
|
||||
**Tech Stack:** Python threading, requests (SearXNG JSON API at `http://localhost:8888/search?format=json`), PyYAML, SQLite ALTER TABLE migrations, Streamlit `st.pills` / column chips.
|
||||
|
||||
**Design doc:** `docs/plans/2026-02-22-research-workflow-design.md`
|
||||
|
||||
**Run tests:** `/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v`
|
||||
**Python:** `conda run -n job-seeker python <script>`
|
||||
|
||||
---
|
||||
|
||||
### Task 1: DB migration — add 4 new columns to `company_research`
|
||||
|
||||
The project uses `_RESEARCH_MIGRATIONS` list + `_migrate_db()` pattern (see `scripts/db.py:81-107`). Add columns there so existing DBs are upgraded automatically on `init_db()`.
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/db.py`
|
||||
- Modify: `tests/test_db.py`
|
||||
|
||||
**Step 1: Write the failing tests**
|
||||
|
||||
Add to `tests/test_db.py`:
|
||||
|
||||
```python
|
||||
def test_company_research_has_new_columns(tmp_path):
|
||||
db = tmp_path / "test.db"
|
||||
init_db(db)
|
||||
conn = sqlite3.connect(db)
|
||||
cols = [r[1] for r in conn.execute("PRAGMA table_info(company_research)").fetchall()]
|
||||
conn.close()
|
||||
assert "tech_brief" in cols
|
||||
assert "funding_brief" in cols
|
||||
assert "competitors_brief" in cols
|
||||
assert "red_flags" in cols
|
||||
|
||||
def test_save_and_get_research_new_fields(tmp_path):
|
||||
db = tmp_path / "test.db"
|
||||
init_db(db)
|
||||
# Insert a job first
|
||||
conn = sqlite3.connect(db)
|
||||
conn.execute("INSERT INTO jobs (title, company) VALUES ('TAM', 'Acme')")
|
||||
job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
save_research(db, job_id=job_id,
|
||||
company_brief="overview", ceo_brief="ceo",
|
||||
talking_points="points", raw_output="raw",
|
||||
tech_brief="tech stack", funding_brief="series B",
|
||||
competitors_brief="vs competitors", red_flags="none")
|
||||
r = get_research(db, job_id=job_id)
|
||||
assert r["tech_brief"] == "tech stack"
|
||||
assert r["funding_brief"] == "series B"
|
||||
assert r["competitors_brief"] == "vs competitors"
|
||||
assert r["red_flags"] == "none"
|
||||
```
|
||||
|
||||
**Step 2: Run to confirm failure**
|
||||
|
||||
```
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_company_research_has_new_columns tests/test_db.py::test_save_and_get_research_new_fields -v
|
||||
```
|
||||
|
||||
Expected: FAIL — columns and parameters don't exist yet.
|
||||
|
||||
**Step 3: Add `_RESEARCH_MIGRATIONS` and wire into `_migrate_db`**
|
||||
|
||||
In `scripts/db.py`, after `_CONTACT_MIGRATIONS` (line ~53), add:
|
||||
|
||||
```python
|
||||
_RESEARCH_MIGRATIONS = [
|
||||
("tech_brief", "TEXT"),
|
||||
("funding_brief", "TEXT"),
|
||||
("competitors_brief", "TEXT"),
|
||||
("red_flags", "TEXT"),
|
||||
]
|
||||
```
|
||||
|
||||
In `_migrate_db()`, after the `_CONTACT_MIGRATIONS` loop, add:
|
||||
|
||||
```python
|
||||
for col, coltype in _RESEARCH_MIGRATIONS:
|
||||
try:
|
||||
conn.execute(f"ALTER TABLE company_research ADD COLUMN {col} {coltype}")
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
```
|
||||
|
||||
**Step 4: Update `save_research` signature and SQL**
|
||||
|
||||
Replace the existing `save_research` function:
|
||||
|
||||
```python
|
||||
def save_research(db_path: Path = DEFAULT_DB, job_id: int = None,
|
||||
company_brief: str = "", ceo_brief: str = "",
|
||||
talking_points: str = "", raw_output: str = "",
|
||||
tech_brief: str = "", funding_brief: str = "",
|
||||
competitors_brief: str = "", red_flags: str = "") -> None:
|
||||
"""Insert or replace a company research record for a job."""
|
||||
now = datetime.now().isoformat()[:16]
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.execute(
|
||||
"""INSERT INTO company_research
|
||||
(job_id, generated_at, company_brief, ceo_brief, talking_points,
|
||||
raw_output, tech_brief, funding_brief, competitors_brief, red_flags)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(job_id) DO UPDATE SET
|
||||
generated_at = excluded.generated_at,
|
||||
company_brief = excluded.company_brief,
|
||||
ceo_brief = excluded.ceo_brief,
|
||||
talking_points = excluded.talking_points,
|
||||
raw_output = excluded.raw_output,
|
||||
tech_brief = excluded.tech_brief,
|
||||
funding_brief = excluded.funding_brief,
|
||||
competitors_brief = excluded.competitors_brief,
|
||||
red_flags = excluded.red_flags""",
|
||||
(job_id, now, company_brief, ceo_brief, talking_points, raw_output,
|
||||
tech_brief, funding_brief, competitors_brief, red_flags),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
```
|
||||
|
||||
(`get_research` uses `SELECT *` so it picks up new columns automatically — no change needed.)
|
||||
|
||||
**Step 5: Run tests**
|
||||
|
||||
```
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py -v
|
||||
```
|
||||
|
||||
Expected: all pass.
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/db.py tests/test_db.py
|
||||
git commit -m "feat: add tech_brief, funding_brief, competitors_brief, red_flags to company_research"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Create `config/resume_keywords.yaml` and example
|
||||
|
||||
**Files:**
|
||||
- Create: `config/resume_keywords.yaml`
|
||||
- Create: `config/resume_keywords.yaml.example`
|
||||
|
||||
**Step 1: Create `config/resume_keywords.yaml`**
|
||||
|
||||
```yaml
|
||||
skills:
|
||||
- Customer Success
|
||||
- Technical Account Management
|
||||
- Revenue Operations
|
||||
- Salesforce
|
||||
- Gainsight
|
||||
- data analysis
|
||||
- stakeholder management
|
||||
- project management
|
||||
- onboarding
|
||||
- renewal management
|
||||
|
||||
domains:
|
||||
- B2B SaaS
|
||||
- enterprise software
|
||||
- security / compliance
|
||||
- post-sale lifecycle
|
||||
- SaaS metrics
|
||||
|
||||
keywords:
|
||||
- QBR
|
||||
- churn reduction
|
||||
- NRR
|
||||
- ARR
|
||||
- MRR
|
||||
- executive sponsorship
|
||||
- VOC
|
||||
- health score
|
||||
- escalation management
|
||||
- cross-functional
|
||||
- product feedback loop
|
||||
- customer advocacy
|
||||
```
|
||||
|
||||
**Step 2: Copy to `.example`**
|
||||
|
||||
```bash
|
||||
cp config/resume_keywords.yaml config/resume_keywords.yaml.example
|
||||
```
|
||||
|
||||
**Step 3: Add to `.gitignore` if personal, or commit both**
|
||||
|
||||
`resume_keywords.yaml` contains Meghan's personal keywords — commit both (no secrets).
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add config/resume_keywords.yaml config/resume_keywords.yaml.example
|
||||
git commit -m "feat: add resume_keywords.yaml for research experience matching"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Resume matching logic in `company_research.py`
|
||||
|
||||
Load the resume YAML and keywords config, score experience entries against the JD, return tiered context string.
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/company_research.py`
|
||||
- Create: `tests/test_company_research.py`
|
||||
|
||||
**Step 1: Write failing tests**
|
||||
|
||||
Create `tests/test_company_research.py`:
|
||||
|
||||
```python
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.company_research import _score_experiences, _build_resume_context
|
||||
|
||||
|
||||
RESUME_YAML = {
|
||||
"experience_details": [
|
||||
{
|
||||
"position": "Lead Technical Account Manager",
|
||||
"company": "UpGuard",
|
||||
"employment_period": "10/2022 - 05/2023",
|
||||
"key_responsibilities": [
|
||||
{"r1": "Managed enterprise security accounts worth $2M ARR"},
|
||||
{"r2": "Led QBR cadence with C-suite stakeholders"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"position": "Founder and Principal Consultant",
|
||||
"company": "M3 Consulting Services",
|
||||
"employment_period": "07/2023 - Present",
|
||||
"key_responsibilities": [
|
||||
{"r1": "Revenue operations consulting for SaaS clients"},
|
||||
{"r2": "Built customer success frameworks"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"position": "Customer Success Manager",
|
||||
"company": "Generic Co",
|
||||
"employment_period": "01/2020 - 09/2022",
|
||||
"key_responsibilities": [
|
||||
{"r1": "Managed SMB portfolio"},
|
||||
],
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
KEYWORDS = ["ARR", "QBR", "enterprise", "security", "stakeholder"]
|
||||
JD = "Looking for a TAM with enterprise ARR experience and QBR facilitation skills."
|
||||
|
||||
|
||||
def test_score_experiences_returns_sorted():
|
||||
scored = _score_experiences(RESUME_YAML["experience_details"], KEYWORDS, JD)
|
||||
# UpGuard should score highest (ARR + QBR + enterprise + stakeholder all in bullets)
|
||||
assert scored[0]["company"] == "UpGuard"
|
||||
|
||||
|
||||
def test_build_resume_context_top2_full_rest_condensed():
|
||||
ctx = _build_resume_context(RESUME_YAML, KEYWORDS, JD)
|
||||
# Full detail for top 2
|
||||
assert "Lead Technical Account Manager" in ctx
|
||||
assert "Managed enterprise security accounts" in ctx
|
||||
# Condensed for rest
|
||||
assert "Also in Meghan" in ctx
|
||||
assert "Generic Co" in ctx
|
||||
# UpGuard NDA note present
|
||||
assert "NDA" in ctx or "enterprise security vendor" in ctx
|
||||
```
|
||||
|
||||
**Step 2: Run to confirm failure**
|
||||
|
||||
```
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_company_research.py -v
|
||||
```
|
||||
|
||||
Expected: FAIL — functions don't exist.
|
||||
|
||||
**Step 3: Implement `_score_experiences` and `_build_resume_context`**
|
||||
|
||||
Add to `scripts/company_research.py`, after the `_parse_sections` function:
|
||||
|
||||
```python
|
||||
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
|
||||
|
||||
# Companies where Meghan has an NDA — reference engagement but not specifics
|
||||
# unless the role is a strong security/compliance match (score >= 3 on JD).
|
||||
_NDA_COMPANIES = {"upguard"}
|
||||
|
||||
|
||||
def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
|
||||
"""
|
||||
Score each experience entry by how many keywords appear in its text.
|
||||
Returns experiences sorted descending by score, with 'score' key added.
|
||||
"""
|
||||
jd_lower = jd.lower()
|
||||
scored = []
|
||||
for exp in experiences:
|
||||
text = " ".join([
|
||||
exp.get("position", ""),
|
||||
exp.get("company", ""),
|
||||
" ".join(
|
||||
v
|
||||
for resp in exp.get("key_responsibilities", [])
|
||||
for v in resp.values()
|
||||
),
|
||||
]).lower()
|
||||
score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower)
|
||||
scored.append({**exp, "score": score})
|
||||
return sorted(scored, key=lambda x: x["score"], reverse=True)
|
||||
|
||||
|
||||
def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
|
||||
"""
|
||||
Build the resume section of the LLM context block.
|
||||
Top 2 scored experiences included in full detail; rest as one-liners.
|
||||
Applies UpGuard NDA rule: reference as 'enterprise security vendor' unless
|
||||
the role is security-focused (score >= 3).
|
||||
"""
|
||||
import yaml as _yaml
|
||||
|
||||
experiences = resume.get("experience_details", [])
|
||||
if not experiences:
|
||||
return ""
|
||||
|
||||
scored = _score_experiences(experiences, keywords, jd)
|
||||
top2 = scored[:2]
|
||||
rest = scored[2:]
|
||||
|
||||
def _exp_label(exp: dict) -> str:
|
||||
company = exp.get("company", "")
|
||||
if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
|
||||
company = "enterprise security vendor (NDA)"
|
||||
return f"{exp.get('position', '')} @ {company} ({exp.get('employment_period', '')})"
|
||||
|
||||
def _exp_bullets(exp: dict) -> str:
|
||||
bullets = []
|
||||
for resp in exp.get("key_responsibilities", []):
|
||||
bullets.extend(resp.values())
|
||||
return "\n".join(f" - {b}" for b in bullets)
|
||||
|
||||
lines = ["## Meghan's Matched Experience"]
|
||||
for exp in top2:
|
||||
lines.append(f"\n**{_exp_label(exp)}** (match score: {exp['score']})")
|
||||
lines.append(_exp_bullets(exp))
|
||||
|
||||
if rest:
|
||||
condensed = ", ".join(_exp_label(e) for e in rest)
|
||||
lines.append(f"\nAlso in Meghan's background: {condensed}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _load_resume_and_keywords() -> tuple[dict, list[str]]:
|
||||
"""Load resume YAML and keywords config. Returns (resume_dict, all_keywords)."""
|
||||
import yaml as _yaml
|
||||
|
||||
resume = {}
|
||||
if _RESUME_YAML.exists():
|
||||
resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {}
|
||||
|
||||
keywords: list[str] = []
|
||||
if _KEYWORDS_YAML.exists():
|
||||
kw_cfg = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {}
|
||||
for lst in kw_cfg.values():
|
||||
if isinstance(lst, list):
|
||||
keywords.extend(lst)
|
||||
|
||||
return resume, keywords
|
||||
```
|
||||
|
||||
**Step 4: Run tests**
|
||||
|
||||
```
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_company_research.py -v
|
||||
```
|
||||
|
||||
Expected: all pass.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/company_research.py tests/test_company_research.py
|
||||
git commit -m "feat: add resume experience matching and tiered context builder"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Parallel search queries (Phase 1b expansion)
|
||||
|
||||
Replace the current single-threaded news fetch with 6 parallel SearXNG queries. Each runs in its own daemon thread and writes to a shared results dict.
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/company_research.py`
|
||||
|
||||
**Step 1: Replace `_fetch_recent_news` with `_fetch_search_data`**
|
||||
|
||||
Remove the existing `_fetch_recent_news` function and replace with:
|
||||
|
||||
```python
|
||||
_SEARCH_QUERIES = {
|
||||
"news": '"{company}" news 2025 2026',
|
||||
"funding": '"{company}" funding round investors Series valuation',
|
||||
"tech": '"{company}" tech stack engineering technology platform',
|
||||
"competitors": '"{company}" competitors alternatives vs market',
|
||||
"culture": '"{company}" glassdoor culture reviews employees',
|
||||
"ceo_press": '"{ceo}" "{company}"', # only used if ceo is known
|
||||
}
|
||||
|
||||
|
||||
def _run_search_query(query: str, results: dict, key: str) -> None:
|
||||
"""Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key]."""
|
||||
import requests
|
||||
|
||||
snippets: list[str] = []
|
||||
seen: set[str] = set()
|
||||
try:
|
||||
resp = requests.get(
|
||||
"http://localhost:8888/search",
|
||||
params={"q": query, "format": "json", "language": "en-US"},
|
||||
timeout=12,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return
|
||||
for r in resp.json().get("results", [])[:4]:
|
||||
url = r.get("url", "")
|
||||
if url in seen:
|
||||
continue
|
||||
seen.add(url)
|
||||
title = r.get("title", "").strip()
|
||||
content = r.get("content", "").strip()
|
||||
if title or content:
|
||||
snippets.append(f"- **{title}**\n {content}\n <{url}>")
|
||||
except Exception:
|
||||
pass
|
||||
results[key] = "\n\n".join(snippets)
|
||||
|
||||
|
||||
def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]:
|
||||
"""
|
||||
Run all search queries in parallel threads.
|
||||
Returns dict keyed by search type (news, funding, tech, competitors, culture, ceo_press).
|
||||
Missing/failed queries produce empty strings.
|
||||
"""
|
||||
import threading
|
||||
|
||||
results: dict[str, str] = {}
|
||||
threads = []
|
||||
|
||||
for key, pattern in _SEARCH_QUERIES.items():
|
||||
if key == "ceo_press" and (not ceo or ceo.lower() in ("not found", "")):
|
||||
continue
|
||||
query = pattern.format(company=company, ceo=ceo)
|
||||
t = threading.Thread(
|
||||
target=_run_search_query,
|
||||
args=(query, results, key),
|
||||
daemon=True,
|
||||
)
|
||||
threads.append(t)
|
||||
t.start()
|
||||
|
||||
for t in threads:
|
||||
t.join(timeout=15) # don't block the task indefinitely
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
**Step 2: Update Phase 1b in `research_company()` to call `_fetch_search_data`**
|
||||
|
||||
Replace the Phase 1b block:
|
||||
|
||||
```python
|
||||
# ── Phase 1b: parallel search queries ────────────────────────────────────
|
||||
search_data: dict[str, str] = {}
|
||||
if use_scraper and _searxng_running():
|
||||
try:
|
||||
ceo_name = (live_data.get("ceo") or "") if live_data else ""
|
||||
search_data = _fetch_search_data(company, ceo=ceo_name)
|
||||
    except Exception:
|
||||
pass # best-effort; never fail the whole task
|
||||
```
|
||||
|
||||
**Step 3: Build per-section notes for the prompt**
|
||||
|
||||
After the Phase 1b block, add:
|
||||
|
||||
```python
|
||||
def _section_note(key: str, label: str) -> str:
|
||||
text = search_data.get(key, "").strip()
|
||||
return f"\n\n## {label} (live web search)\n\n{text}" if text else ""
|
||||
|
||||
news_note = _section_note("news", "News & Press")
|
||||
funding_note = _section_note("funding", "Funding & Investors")
|
||||
tech_note = _section_note("tech", "Tech Stack")
|
||||
competitors_note = _section_note("competitors", "Competitors")
|
||||
culture_note = _section_note("culture", "Culture & Employee Signals")
|
||||
ceo_press_note = _section_note("ceo_press", "CEO in the News")
|
||||
```
|
||||
|
||||
**Step 4: No automated test (threading + network) — manual smoke test**
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python scripts/company_research.py --job-id <any_valid_id>
|
||||
```
|
||||
|
||||
Verify log output shows all 6 search threads completing. Note: the `join(timeout=15)` calls run sequentially, so the worst case is ~15 s per straggler thread (not 15 s overall); a typical run with responsive searches finishes well under 15 s.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/company_research.py
|
||||
git commit -m "feat: parallel SearXNG search queries (funding, tech, competitors, culture, news)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Expanded LLM prompt and section parsing
|
||||
|
||||
Wire resume context + all search data into the prompt, update section headers, update `_parse_sections` mapping, update `research_company()` return dict.
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/company_research.py`
|
||||
|
||||
**Step 1: Load resume in `research_company()` and build context**
|
||||
|
||||
At the top of `research_company()`, after `jd_excerpt`, add:
|
||||
|
||||
```python
|
||||
resume, keywords = _load_resume_and_keywords()
|
||||
matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()]
|
||||
resume_context = _build_resume_context(resume, keywords, jd_excerpt)
|
||||
keywords_note = (
|
||||
f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
|
||||
if matched_keywords else ""
|
||||
)
|
||||
```
|
||||
|
||||
**Step 2: Replace the Phase 2 LLM prompt**
|
||||
|
||||
Replace the existing `prompt = f"""..."""` block with:
|
||||
|
||||
```python
|
||||
prompt = f"""You are preparing Meghan McCann for a job interview.
|
||||
|
||||
Role: **{title}** at **{company}**
|
||||
|
||||
## Job Description
|
||||
{jd_excerpt}
|
||||
{resume_context}{keywords_note}
|
||||
|
||||
## Live Company Data (SearXNG)
|
||||
{scrape_note.strip() or "_(scrape unavailable)_"}
|
||||
{news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{ceo_press_note}
|
||||
|
||||
---
|
||||
|
||||
Produce a structured research brief using **exactly** these seven markdown section headers
|
||||
(include all seven even if a section has limited data — say so honestly):
|
||||
|
||||
## Company Overview
|
||||
What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning.
|
||||
|
||||
## Leadership & Culture
|
||||
CEO background and leadership style, key execs, mission/values statements, Glassdoor themes.
|
||||
|
||||
## Tech Stack & Product
|
||||
Technologies, platforms, and product direction relevant to the {title} role.
|
||||
|
||||
## Funding & Market Position
|
||||
Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape.
|
||||
|
||||
## Recent Developments
|
||||
News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months.
|
||||
Draw on the live snippets above; if none available, note what is publicly known.
|
||||
|
||||
## Red Flags & Watch-outs
|
||||
Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call.
|
||||
If nothing notable, write "No significant red flags identified."
|
||||
|
||||
## Talking Points for Meghan
|
||||
Five specific talking points for the phone screen. Each must:
|
||||
- Reference a concrete experience from Meghan's matched background by name
|
||||
(UpGuard NDA rule: say "enterprise security vendor" unless role has clear security focus)
|
||||
- Connect to a specific signal from the JD or company context above
|
||||
- Be 1–2 sentences, ready to speak aloud
|
||||
- Never give generic advice
|
||||
|
||||
---
|
||||
⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call.
|
||||
"""
|
||||
```
|
||||
|
||||
**Step 3: Update the return dict**
|
||||
|
||||
Replace the existing return block:
|
||||
|
||||
```python
|
||||
return {
|
||||
"raw_output": raw,
|
||||
"company_brief": sections.get("Company Overview", ""),
|
||||
"ceo_brief": sections.get("Leadership & Culture", ""),
|
||||
"tech_brief": sections.get("Tech Stack & Product", ""),
|
||||
"funding_brief": sections.get("Funding & Market Position", ""),
|
||||
"talking_points": sections.get("Talking Points for Meghan", ""),
|
||||
# Recent Developments and Red Flags stored in raw_output; rendered from there
|
||||
# (avoids adding more columns right now — can migrate later if needed)
|
||||
}
|
||||
```
|
||||
|
||||
Correction — the return block above omits two fields: `red_flags` (the column added in Task 1) and `competitors_brief`. Use this version instead:
|
||||
|
||||
```python
|
||||
return {
|
||||
"raw_output": raw,
|
||||
"company_brief": sections.get("Company Overview", ""),
|
||||
"ceo_brief": sections.get("Leadership & Culture", ""),
|
||||
"tech_brief": sections.get("Tech Stack & Product", ""),
|
||||
"funding_brief": sections.get("Funding & Market Position", ""),
|
||||
"competitors_brief": sections.get("Funding & Market Position", ""), # same section
|
||||
"red_flags": sections.get("Red Flags & Watch-outs", ""),
|
||||
"talking_points": sections.get("Talking Points for Meghan", ""),
|
||||
}
|
||||
```
|
||||
|
||||
Note: `competitors_brief` pulls from the Funding & Market Position section (which includes competitors). `recent_developments` is only in `raw_output` — no separate column needed.
|
||||
|
||||
**Step 4: Manual smoke test**
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python scripts/company_research.py --job-id <valid_id>
|
||||
```
|
||||
|
||||
Verify all 7 sections appear in output and `save_research` receives all fields.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/company_research.py
|
||||
git commit -m "feat: expanded research prompt with resume context, 7 output sections"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 6: Interview Prep UI — render new sections
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/pages/6_Interview_Prep.py`
|
||||
|
||||
**Step 1: Replace the left-panel section rendering**
|
||||
|
||||
Find the existing section block (after `st.divider()` at line ~145) and replace with:
|
||||
|
||||
```python
|
||||
# ── Talking Points (top — most useful during a live call) ─────────────────
|
||||
st.subheader("🎯 Talking Points")
|
||||
tp = research.get("talking_points", "").strip()
|
||||
if tp:
|
||||
st.markdown(tp)
|
||||
else:
|
||||
st.caption("_No talking points extracted — try regenerating._")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Company brief ─────────────────────────────────────────────────────────
|
||||
st.subheader("🏢 Company Overview")
|
||||
st.markdown(research.get("company_brief") or "_—_")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Leadership & culture ──────────────────────────────────────────────────
|
||||
st.subheader("👤 Leadership & Culture")
|
||||
st.markdown(research.get("ceo_brief") or "_—_")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Tech Stack ────────────────────────────────────────────────────────────
|
||||
tech = research.get("tech_brief", "").strip()
|
||||
if tech:
|
||||
st.subheader("⚙️ Tech Stack & Product")
|
||||
st.markdown(tech)
|
||||
st.divider()
|
||||
|
||||
# ── Funding & Market ──────────────────────────────────────────────────────
|
||||
funding = research.get("funding_brief", "").strip()
|
||||
if funding:
|
||||
st.subheader("💰 Funding & Market Position")
|
||||
st.markdown(funding)
|
||||
st.divider()
|
||||
|
||||
# ── Red Flags ─────────────────────────────────────────────────────────────
|
||||
red = research.get("red_flags", "").strip()
|
||||
if red and "no significant red flags" not in red.lower():
|
||||
st.subheader("⚠️ Red Flags & Watch-outs")
|
||||
st.warning(red)
|
||||
st.divider()
|
||||
|
||||
# ── Practice Q&A ──────────────────────────────────────────────────────────
|
||||
with st.expander("🎤 Practice Q&A (pre-call prep)", expanded=False):
|
||||
# ... existing Q&A code unchanged ...
|
||||
```
|
||||
|
||||
Note: The existing Practice Q&A expander code stays exactly as-is inside the expander — only move/restructure the section headers above it.
|
||||
|
||||
**Step 2: Restart Streamlit and visually verify**
|
||||
|
||||
```bash
|
||||
bash scripts/manage-ui.sh restart
|
||||
```
|
||||
|
||||
Navigate to Interview Prep → verify new sections appear, Red Flags renders in amber warning box, Tech/Funding sections only show when populated.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add app/pages/6_Interview_Prep.py
|
||||
git commit -m "feat: render tech, funding, red flags sections in Interview Prep"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 7: Settings UI — Skills & Keywords tab
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/pages/2_Settings.py`
|
||||
|
||||
**Step 1: Add `KEYWORDS_CFG` path constant**
|
||||
|
||||
After the existing config path constants (line ~19), add:
|
||||
|
||||
```python
|
||||
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
||||
```
|
||||
|
||||
**Step 2: Add the tab to the tab bar**
|
||||
|
||||
Change:
|
||||
```python
|
||||
tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email = st.tabs(
|
||||
["🔎 Search", "🤖 LLM Backends", "📚 Notion", "🔌 Services", "📝 Resume Profile", "📧 Email"]
|
||||
)
|
||||
```
|
||||
To:
|
||||
```python
|
||||
tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email, tab_skills = st.tabs(
|
||||
["🔎 Search", "🤖 LLM Backends", "📚 Notion", "🔌 Services", "📝 Resume Profile", "📧 Email", "🏷️ Skills"]
|
||||
)
|
||||
```
|
||||
|
||||
**Step 3: Add the Skills & Keywords tab body**
|
||||
|
||||
Append at the end of the file:
|
||||
|
||||
```python
|
||||
# ── Skills & Keywords tab ─────────────────────────────────────────────────────
|
||||
with tab_skills:
|
||||
st.subheader("🏷️ Skills & Keywords")
|
||||
st.caption(
|
||||
"These are matched against job descriptions to select Meghan's most relevant "
|
||||
"experience and highlight keyword overlap in the research brief."
|
||||
)
|
||||
|
||||
if not KEYWORDS_CFG.exists():
|
||||
st.warning("resume_keywords.yaml not found — create it at config/resume_keywords.yaml")
|
||||
st.stop()
|
||||
|
||||
kw_data = load_yaml(KEYWORDS_CFG)
|
||||
|
||||
changed = False
|
||||
for category in ["skills", "domains", "keywords"]:
|
||||
st.markdown(f"**{category.title()}**")
|
||||
tags: list[str] = kw_data.get(category, [])
|
||||
|
||||
# Render existing tags as removable chips
|
||||
cols = st.columns(min(len(tags), 6) or 1)
|
||||
to_remove = None
|
||||
for i, tag in enumerate(tags):
|
||||
with cols[i % 6]:
|
||||
if st.button(f"× {tag}", key=f"rm_{category}_{i}", use_container_width=True):
|
||||
to_remove = tag
|
||||
if to_remove:
|
||||
tags.remove(to_remove)
|
||||
kw_data[category] = tags
|
||||
changed = True
|
||||
|
||||
# Add new tag
|
||||
new_col, btn_col = st.columns([4, 1])
|
||||
new_tag = new_col.text_input(
|
||||
"Add", key=f"new_{category}", label_visibility="collapsed",
|
||||
placeholder=f"Add {category[:-1] if category.endswith('s') else category}…"
|
||||
)
|
||||
if btn_col.button("+ Add", key=f"add_{category}"):
|
||||
tag = new_tag.strip()
|
||||
if tag and tag not in tags:
|
||||
tags.append(tag)
|
||||
kw_data[category] = tags
|
||||
changed = True
|
||||
|
||||
st.markdown("---")
|
||||
|
||||
if changed:
|
||||
save_yaml(KEYWORDS_CFG, kw_data)
|
||||
st.success("Saved.")
|
||||
st.rerun()
|
||||
```
|
||||
|
||||
**Step 4: Restart and verify**
|
||||
|
||||
```bash
|
||||
bash scripts/manage-ui.sh restart
|
||||
```
|
||||
|
||||
Navigate to Settings → Skills tab. Verify:
|
||||
- Tags render as `× tag` buttons; clicking one removes it immediately
|
||||
- Text input + Add button appends new tag
|
||||
- Changes persist to `config/resume_keywords.yaml`
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add app/pages/2_Settings.py
|
||||
git commit -m "feat: add Skills & Keywords tag editor to Settings"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 8: Run full test suite + final smoke test
|
||||
|
||||
**Step 1: Full test suite**
|
||||
|
||||
```
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
```
|
||||
|
||||
Expected: all existing + new tests pass.
|
||||
|
||||
**Step 2: End-to-end smoke test**
|
||||
|
||||
With SearXNG running (`docker compose up -d` in `/Library/Development/scrapers/SearXNG/`):
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python scripts/company_research.py --job-id <valid_id>
|
||||
```
|
||||
|
||||
Verify:
|
||||
- 6 search threads complete
|
||||
- All 7 sections present in output
|
||||
- Talking points reference real experience entries (not generic blurb)
|
||||
- `get_research()` returns all new fields populated
|
||||
|
||||
**Step 3: Final commit if any cleanup needed**
|
||||
|
||||
```bash
|
||||
git add -p # stage only intentional changes
|
||||
git commit -m "chore: research workflow final cleanup"
|
||||
```
|
||||
|
|
@ -1,176 +0,0 @@
|
|||
# Survey Assistant — Design Doc
|
||||
|
||||
**Date:** 2026-02-23
|
||||
**Status:** Approved
|
||||
|
||||
---
|
||||
|
||||
## Goal
|
||||
|
||||
Add a real-time Survey Assistant to the job application pipeline that helps the user answer culture-fit and values surveys during the application process. Supports timed surveys via screenshot ingestion and text paste, with a quick ("just give me the answer") or detailed ("explain each option") mode toggle.
|
||||
|
||||
---
|
||||
|
||||
## Pipeline Stage
|
||||
|
||||
A new `survey` stage is inserted between `applied` and `phone_screen`:
|
||||
|
||||
```
|
||||
pending → approved → applied → survey → phone_screen → interviewing → offer → hired
|
||||
```
|
||||
|
||||
- Promotion to `survey` is triggered manually (banner prompt) or automatically when the email classifier detects a `survey_received` signal.
|
||||
- Jobs can skip `survey` entirely — it is not required.
|
||||
- `survey_at` timestamp column added to `jobs` table.
|
||||
|
||||
---
|
||||
|
||||
## Email Classifier
|
||||
|
||||
`classify_stage_signal` in `scripts/imap_sync.py` gains a 6th label: `survey_received`.
|
||||
|
||||
When detected:
|
||||
- The Interviews page shows the existing stage-suggestion banner style: "Survey email received — move to Survey stage?"
|
||||
- One-click promote button moves the job to `survey` and records `survey_at`.
|
||||
|
||||
---
|
||||
|
||||
## Kanban Consolidation (Interviews Page)
|
||||
|
||||
### Change A — Pre-kanban section
|
||||
`applied` and `survey` jobs appear above the kanban columns in a pre-pipeline section, not as their own columns. Visual differentiation: `survey` jobs show a badge/chip.
|
||||
|
||||
### Change B — Offer + Hired merged
|
||||
`offer` and `hired` are combined into one column. `hired` jobs are visually differentiated (e.g. green highlight or checkmark icon) rather than occupying a separate column.
|
||||
|
||||
**Result:** Kanban columns are `phone_screen | interviewing | offer/hired` (3 columns), with applied/survey as a pre-section above.
|
||||
|
||||
---
|
||||
|
||||
## Survey Assistant Page (`app/pages/7_Survey.py`)
|
||||
|
||||
### Layout
|
||||
|
||||
**Left panel — Input**
|
||||
- Job selector dropdown (defaults to `survey`-stage jobs, allows any job)
|
||||
- Survey name field (optional label, e.g. "Culture Fit Round 1")
|
||||
- Mode toggle: **Quick** / **Detailed** (persisted in session state)
|
||||
- Two input tabs:
|
||||
- **Paste Text** — textarea for pasted survey content
|
||||
- **Screenshot** — `streamlit-paste-button` (clipboard paste) + file uploader side by side; either method populates an image preview
|
||||
- Analyze button
|
||||
|
||||
**Right panel — Output**
|
||||
- **Quick mode:** numbered list, each item is bold option letter + one-line rationale
|
||||
e.g. `**B** — most aligns with a collaborative, team-first culture`
|
||||
- **Detailed mode:** each question expanded — option-by-option breakdown, recommendation, brief "why"
|
||||
- "Save to Job" button — persists Q&A to `survey_responses`; shows reported score field before saving
|
||||
|
||||
**Below both panels — History**
|
||||
- Accordion: prior saved survey responses for the selected job, newest first
|
||||
- Shows survey name, mode, reported score, timestamp, and LLM output summary
|
||||
|
||||
---
|
||||
|
||||
## Data Model
|
||||
|
||||
### `survey_responses` table (new)
|
||||
|
||||
```sql
|
||||
CREATE TABLE survey_responses (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_id INTEGER NOT NULL REFERENCES jobs(id),
|
||||
survey_name TEXT, -- e.g. "Culture Fit Round 1"
|
||||
received_at DATETIME, -- when the survey email arrived (if known)
|
||||
source TEXT, -- 'text_paste' | 'screenshot'
|
||||
raw_input TEXT, -- pasted text content, or NULL for screenshots
|
||||
image_path TEXT, -- path to saved screenshot, or NULL
|
||||
mode TEXT, -- 'quick' | 'detailed'
|
||||
llm_output TEXT, -- full LLM response
|
||||
reported_score TEXT, -- optional score shown by the survey app
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
```
|
||||
|
||||
Screenshots saved to `data/survey_screenshots/<job_id>/<timestamp>.png` (directory gitignored). Stored by path, not BLOB.
|
||||
|
||||
Multiple rows per job are allowed (multiple survey rounds).
|
||||
|
||||
### `jobs` table addition
|
||||
- `survey_at DATETIME` — timestamp when job entered `survey` stage
|
||||
|
||||
---
|
||||
|
||||
## Vision Service (`scripts/vision_service/`)
|
||||
|
||||
A dedicated, optional FastAPI microservice for image-based survey analysis. Independent of thoth.
|
||||
|
||||
### Model
|
||||
- **Primary:** `moondream2` (~1.5GB VRAM at 4-bit quantization)
|
||||
- **Reserve:** `Qwen2.5-VL-3B` if moondream2 accuracy proves insufficient
|
||||
|
||||
### Architecture
|
||||
- Separate conda env: `job-seeker-vision` (torch + transformers + FastAPI + moondream2)
|
||||
- Port: **8002** (avoids conflict with vLLM on 8000 and thoth on 8001)
|
||||
- Model loaded lazily on first request, stays resident (no reload between calls)
|
||||
- GPU loaded on first inference request; 4-bit quantization keeps VRAM footprint ~1.5GB
|
||||
|
||||
### Endpoints
|
||||
```
|
||||
POST /analyze
|
||||
Body: { "prompt": str, "image_base64": str }
|
||||
Returns: { "text": str }
|
||||
|
||||
GET /health
|
||||
Returns: { "status": "ok"|"loading", "model": str, "gpu": bool }
|
||||
```
|
||||
|
||||
### Management
|
||||
`scripts/manage-vision.sh start|stop|restart|status|logs` — same pattern as `manage-ui.sh`.
|
||||
|
||||
### Optional install
|
||||
- If the vision service is not running, the Screenshot tab on the Survey page is hidden
|
||||
- A note in its place explains how to enable: "Install vision service — see docs/vision-service.md"
|
||||
- Text Paste mode always available regardless of vision service status
|
||||
|
||||
---
|
||||
|
||||
## LLM Router Changes (`scripts/llm_router.py`)
|
||||
|
||||
`LLMRouter.complete()` gains an optional `images` parameter:
|
||||
|
||||
```python
|
||||
def complete(self, prompt: str, images: list[str] | None = None) -> str:
|
||||
# images: list of base64-encoded PNG/JPG strings
|
||||
```
|
||||
|
||||
- Backends that don't support images are skipped when `images` is provided
|
||||
- Survey analysis fallback order: `vision_service → claude_code`
|
||||
- `vision_service` backend entry added to `config/llm.yaml` (enabled: false by default — optional install)
|
||||
|
||||
---
|
||||
|
||||
## Generalized Version Notes
|
||||
|
||||
- Vision service is an **optional feature** in the generalized app
|
||||
- `config/llm.yaml` ships with `vision_service.enabled: false`
|
||||
- `scripts/manage-vision.sh` and `scripts/vision_service/` included but documented as optional
|
||||
- Survey page renders in degraded (text-only) mode if vision service is absent
|
||||
- Install instructions in `docs/vision-service.md` (to be written during implementation)
|
||||
|
||||
---
|
||||
|
||||
## Files Affected
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `app/pages/7_Survey.py` | New page |
|
||||
| `app/pages/5_Interviews.py` | Kanban consolidation (A+B), survey banner |
|
||||
| `scripts/imap_sync.py` | Add `survey_received` classifier label |
|
||||
| `scripts/db.py` | `survey_responses` table, `survey_at` column, CRUD helpers |
|
||||
| `scripts/llm_router.py` | `images=` parameter, skip non-vision backends |
|
||||
| `scripts/vision_service/main.py` | New FastAPI vision service |
|
||||
| `scripts/vision_service/environment.yml` | New conda env spec |
|
||||
| `scripts/manage-vision.sh` | New management script |
|
||||
| `config/llm.yaml` | Add `vision_service` backend entry (enabled: false) |
|
||||
| `config/llm.yaml.example` | Same |
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,174 +0,0 @@
|
|||
# Design: Craigslist Custom Board Scraper
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**Status:** Approved
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Add a Craigslist scraper to `scripts/custom_boards/craigslist.py` following the existing
|
||||
adzuna/theladders pattern. Craigslist is regional (one subdomain per metro), has no native
|
||||
remote filter, and exposes an RSS feed that gives clean structured data without Playwright.
|
||||
|
||||
Discovery uses RSS for speed and reliability. Full job description is populated by the
|
||||
existing `scrape_url` background task. Company name and salary — not present in Craigslist
|
||||
listings as structured fields — are extracted from the description body by the existing
|
||||
`enrich_descriptions` LLM pipeline after the posting is fetched.
|
||||
|
||||
---
|
||||
|
||||
## Files
|
||||
|
||||
| Action | File |
|
||||
|---|---|
|
||||
| Create | `scripts/custom_boards/craigslist.py` |
|
||||
| Create | `config/craigslist.yaml` (gitignored) |
|
||||
| Create | `config/craigslist.yaml.example` |
|
||||
| Create | `tests/test_craigslist.py` |
|
||||
| Modify | `scripts/discover.py` — add to `CUSTOM_SCRAPERS` registry |
|
||||
| Modify | `scripts/enrich_descriptions.py` — add company/salary extraction for craigslist source |
|
||||
| Modify | `config/search_profiles.yaml` — add `craigslist` to `custom_boards` on relevant profiles |
|
||||
| Modify | `.gitignore` — add `config/craigslist.yaml` |
|
||||
|
||||
---
|
||||
|
||||
## Config (`config/craigslist.yaml`)
|
||||
|
||||
Gitignored. `.example` committed alongside it.
|
||||
|
||||
```yaml
|
||||
# Craigslist metro subdomains to search.
|
||||
# Full list at: https://www.craigslist.org/about/sites
|
||||
metros:
|
||||
- sfbay
|
||||
- newyork
|
||||
- chicago
|
||||
- losangeles
|
||||
- seattle
|
||||
- austin
|
||||
|
||||
# Maps search profile location strings to a single metro subdomain.
|
||||
# Locations not listed here are skipped silently.
|
||||
location_map:
|
||||
"San Francisco Bay Area, CA": sfbay
|
||||
"New York, NY": newyork
|
||||
"Chicago, IL": chicago
|
||||
"Los Angeles, CA": losangeles
|
||||
"Seattle, WA": seattle
|
||||
"Austin, TX": austin
|
||||
|
||||
# Craigslist job category. Defaults to 'jjj' (general jobs) if omitted.
|
||||
# Other useful values: csr (customer service), mar (marketing), sof (software)
|
||||
# category: jjj
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Scraper Architecture
|
||||
|
||||
### RSS URL pattern
|
||||
```
|
||||
https://{metro}.craigslist.org/search/{category}?query={title}&format=rss&sort=date
|
||||
```
|
||||
|
||||
Default category: `jjj`. Overridable via `category` key in config.
|
||||
|
||||
### `scrape(profile, location, results_wanted)` flow
|
||||
|
||||
1. Load `config/craigslist.yaml` — return `[]` with a printed warning if missing or malformed
|
||||
2. Determine metros to search:
|
||||
- `location.lower() == "remote"` → all configured metros (Craigslist has no native remote filter)
|
||||
- Any other string → `location_map.get(location)` → single metro; skip silently if not mapped
|
||||
3. For each metro × each title in `profile["titles"]`:
|
||||
- Fetch RSS via `requests.get` with a standard User-Agent header
|
||||
- Parse with `xml.etree.ElementTree` (stdlib — no extra deps)
|
||||
- Filter `<item>` entries by `<pubDate>` against `profile["hours_old"]`
|
||||
- Extract title, URL, and description snippet from each item
|
||||
- `time.sleep(0.5)` between fetches (polite pacing; easy to make configurable later)
|
||||
4. Dedup by URL within the run via a `seen_urls` set
|
||||
5. Stop when `results_wanted` is reached
|
||||
6. Return list of job dicts
|
||||
|
||||
### Return dict shape
|
||||
|
||||
```python
|
||||
{
|
||||
"title": "<RSS item title, cleaned>",
|
||||
"company": "", # not in Craigslist — filled by LLM enrichment
|
||||
"url": "<item link>",
|
||||
"source": "craigslist",
|
||||
"location": "<metro> (Craigslist)",
|
||||
"is_remote": True, # if remote search, else False
|
||||
"salary": "", # not reliably structured — filled by LLM enrichment
|
||||
"description": "", # scrape_url background task fills this in
|
||||
}
|
||||
```
|
||||
|
||||
### Error handling
|
||||
|
||||
- Missing config → `[]` + printed warning, never raises
|
||||
- `requests.RequestException` → skip that metro/title, print warning, continue
|
||||
- Malformed RSS XML → skip that response, print warning, continue
|
||||
- HTTP non-200 → skip, print status code
|
||||
|
||||
---
|
||||
|
||||
## LLM Enrichment for company/salary
|
||||
|
||||
Craigslist postings frequently include company name and salary in the body text, but not as
|
||||
structured fields. After `scrape_url` populates `description`, the `enrich_descriptions`
|
||||
task handles extraction.
|
||||
|
||||
**Trigger condition:** `source == "craigslist"` AND `company == ""` AND `description != ""`
|
||||
|
||||
**Prompt addition:** Extend the existing enrichment prompt to also extract:
|
||||
- Company name (if present in the posting body)
|
||||
- Salary or compensation range (if mentioned)
|
||||
|
||||
Results written back via `update_job_fields`. If the LLM cannot extract a company name,
|
||||
the field stays blank — this is expected and acceptable for Craigslist.
|
||||
|
||||
---
|
||||
|
||||
## discover.py Integration
|
||||
|
||||
One-line addition to the `CUSTOM_SCRAPERS` registry:
|
||||
|
||||
```python
|
||||
from scripts.custom_boards import craigslist as _craigslist
|
||||
|
||||
CUSTOM_SCRAPERS: dict[str, object] = {
|
||||
"adzuna": _adzuna.scrape,
|
||||
"theladders": _theladders.scrape,
|
||||
"craigslist": _craigslist.scrape, # new
|
||||
}
|
||||
```
|
||||
|
||||
Add `craigslist` to `custom_boards` in `config/search_profiles.yaml` for relevant profiles.
|
||||
|
||||
---
|
||||
|
||||
## Tests (`tests/test_craigslist.py`)
|
||||
|
||||
All tests use mocked `requests.get` with fixture RSS XML — no network calls.
|
||||
|
||||
| Test | Asserts |
|
||||
|---|---|
|
||||
| `test_scrape_returns_empty_on_missing_config` | Missing yaml → `[]`, no raise |
|
||||
| `test_scrape_remote_hits_all_metros` | `location="Remote"` → one fetch per configured metro |
|
||||
| `test_scrape_location_map_resolves` | `"San Francisco Bay Area, CA"` → `sfbay` only |
|
||||
| `test_scrape_location_not_in_map_returns_empty` | Unknown location → `[]`, no raise |
|
||||
| `test_hours_old_filter` | Items older than `hours_old` are excluded |
|
||||
| `test_dedup_within_run` | Same URL appearing in two metros only returned once |
|
||||
| `test_http_error_graceful` | `RequestException` → `[]`, no raise |
|
||||
| `test_results_wanted_cap` | Never returns more than `results_wanted` |
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- Playwright-based scraping (RSS is sufficient; Playwright adds a dep for no gain)
|
||||
- Craigslist subcategory multi-search per profile (config `category` override is sufficient)
|
||||
- Salary/company extraction directly in the scraper (LLM enrichment is the right layer)
|
||||
- Windows support (deferred globally)
|
||||
|
|
@ -1,728 +0,0 @@
|
|||
# Craigslist Scraper Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Add a Craigslist RSS-based job scraper to `scripts/custom_boards/craigslist.py`, wired into the existing discovery pipeline, with LLM extraction of company name and salary from the fetched posting body.
|
||||
|
||||
**Architecture:** RSS fetch per metro × title → `scrape_url` background task fills description → new `enrich_craigslist` task type extracts company/salary via LLM. Config-driven metro list in `config/craigslist.yaml`. Integrates via the existing `CUSTOM_SCRAPERS` registry in `discover.py`.
|
||||
|
||||
**Tech Stack:** Python 3.11, `requests`, `xml.etree.ElementTree` (stdlib), `PyYAML`, `email.utils.parsedate_to_datetime` (stdlib), existing `llm_router.py`
|
||||
|
||||
**Test runner:** `/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v`
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Config files + .gitignore
|
||||
|
||||
**Files:**
|
||||
- Create: `config/craigslist.yaml.example`
|
||||
- Create: `config/craigslist.yaml`
|
||||
- Modify: `.gitignore`
|
||||
|
||||
**Step 1: Create `config/craigslist.yaml.example`**
|
||||
|
||||
```yaml
|
||||
# Craigslist metro subdomains to search.
|
||||
# Copy to config/craigslist.yaml and adjust for your markets.
|
||||
# Full subdomain list: https://www.craigslist.org/about/sites
|
||||
metros:
|
||||
- sfbay
|
||||
- newyork
|
||||
- chicago
|
||||
- losangeles
|
||||
- seattle
|
||||
- austin
|
||||
|
||||
# Maps search profile location strings → Craigslist metro subdomain.
|
||||
# Locations not listed here are silently skipped.
|
||||
location_map:
|
||||
"San Francisco Bay Area, CA": sfbay
|
||||
"New York, NY": newyork
|
||||
"Chicago, IL": chicago
|
||||
"Los Angeles, CA": losangeles
|
||||
"Seattle, WA": seattle
|
||||
"Austin, TX": austin
|
||||
|
||||
# Craigslist job category. Defaults to 'jjj' (general jobs) if omitted.
|
||||
# Other options: csr (customer service), mar (marketing), sof (software/qa/dba)
|
||||
# category: jjj
|
||||
```
|
||||
|
||||
**Step 2: Create `config/craigslist.yaml`** (personal config — gitignored)
|
||||
|
||||
Copy `.example` as-is (Meghan targets sfbay + remote, so this default is correct).
|
||||
|
||||
**Step 3: Add to `.gitignore`**
|
||||
|
||||
Add `config/craigslist.yaml` after the existing `config/adzuna.yaml` line:
|
||||
|
||||
```
|
||||
config/adzuna.yaml
|
||||
config/craigslist.yaml
|
||||
```
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add config/craigslist.yaml.example .gitignore
|
||||
git commit -m "feat: add craigslist config template and gitignore entry"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Core scraper tests (write failing first)
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/test_craigslist.py`
|
||||
|
||||
**Step 1: Create `tests/test_craigslist.py` with all fixtures and tests**
|
||||
|
||||
```python
|
||||
"""Tests for Craigslist RSS scraper."""
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from email.utils import format_datetime
|
||||
from unittest.mock import patch, MagicMock
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
# ── RSS fixture helpers ────────────────────────────────────────────────────────
|
||||
|
||||
def _make_rss(items: list[dict]) -> bytes:
|
||||
"""Build minimal Craigslist-style RSS XML from a list of item dicts."""
|
||||
channel = ET.Element("channel")
|
||||
for item_data in items:
|
||||
item = ET.SubElement(channel, "item")
|
||||
for tag, value in item_data.items():
|
||||
el = ET.SubElement(item, tag)
|
||||
el.text = value
|
||||
rss = ET.Element("rss")
|
||||
rss.append(channel)
|
||||
return ET.tostring(rss, encoding="utf-8", xml_declaration=True)
|
||||
|
||||
|
||||
def _pubdate(hours_ago: float = 1.0) -> str:
|
||||
"""Return an RFC 2822 pubDate string for N hours ago."""
|
||||
dt = datetime.now(tz=timezone.utc) - timedelta(hours=hours_ago)
|
||||
return format_datetime(dt)
|
||||
|
||||
|
||||
def _mock_resp(content: bytes, status_code: int = 200) -> MagicMock:
|
||||
mock = MagicMock()
|
||||
mock.status_code = status_code
|
||||
mock.content = content
|
||||
mock.raise_for_status = MagicMock()
|
||||
if status_code >= 400:
|
||||
mock.raise_for_status.side_effect = requests.HTTPError(f"HTTP {status_code}")
|
||||
return mock
|
||||
|
||||
|
||||
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
_SAMPLE_RSS = _make_rss([{
|
||||
"title": "Customer Success Manager",
|
||||
"link": "https://sfbay.craigslist.org/jjj/d/csm-role/1234567890.html",
|
||||
"description": "Great CSM role at Acme Corp. Salary $120k.",
|
||||
"pubDate": _pubdate(1),
|
||||
}])
|
||||
|
||||
_TWO_ITEM_RSS = _make_rss([
|
||||
{
|
||||
"title": "Customer Success Manager",
|
||||
"link": "https://sfbay.craigslist.org/jjj/d/csm-role/1111111111.html",
|
||||
"description": "CSM role 1.",
|
||||
"pubDate": _pubdate(1),
|
||||
},
|
||||
{
|
||||
"title": "Account Manager",
|
||||
"link": "https://sfbay.craigslist.org/jjj/d/am-role/2222222222.html",
|
||||
"description": "AM role.",
|
||||
"pubDate": _pubdate(2),
|
||||
},
|
||||
])
|
||||
|
||||
_OLD_ITEM_RSS = _make_rss([{
|
||||
"title": "Old Job",
|
||||
"link": "https://sfbay.craigslist.org/jjj/d/old-job/9999999999.html",
|
||||
"description": "Very old posting.",
|
||||
"pubDate": _pubdate(hours_ago=500),
|
||||
}])
|
||||
|
||||
_TWO_METRO_CONFIG = {
|
||||
"metros": ["sfbay", "newyork"],
|
||||
"location_map": {
|
||||
"San Francisco Bay Area, CA": "sfbay",
|
||||
"New York, NY": "newyork",
|
||||
},
|
||||
"category": "jjj",
|
||||
}
|
||||
|
||||
_SINGLE_METRO_CONFIG = {
|
||||
"metros": ["sfbay"],
|
||||
"location_map": {"San Francisco Bay Area, CA": "sfbay"},
|
||||
}
|
||||
|
||||
_PROFILE = {"titles": ["Customer Success Manager"], "hours_old": 240}
|
||||
|
||||
|
||||
# ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_scrape_returns_empty_on_missing_config(tmp_path):
|
||||
"""Missing craigslist.yaml → returns [] without raising."""
|
||||
with patch("scripts.custom_boards.craigslist._CONFIG_PATH",
|
||||
tmp_path / "craigslist.yaml"):
|
||||
import importlib
|
||||
import scripts.custom_boards.craigslist as cl
|
||||
importlib.reload(cl)
|
||||
result = cl.scrape(_PROFILE, "San Francisco Bay Area, CA")
|
||||
assert result == []
|
||||
|
||||
|
||||
def test_scrape_remote_hits_all_metros():
|
||||
"""location='Remote' triggers one RSS fetch per configured metro."""
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_TWO_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "Remote")
|
||||
|
||||
assert mock_get.call_count == 2
|
||||
fetched_urls = [call.args[0] for call in mock_get.call_args_list]
|
||||
assert any("sfbay" in u for u in fetched_urls)
|
||||
assert any("newyork" in u for u in fetched_urls)
|
||||
assert all(r["is_remote"] for r in result)
|
||||
|
||||
|
||||
def test_scrape_location_map_resolves():
|
||||
"""Known location string maps to exactly one metro."""
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_TWO_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
|
||||
|
||||
assert mock_get.call_count == 1
|
||||
assert "sfbay" in mock_get.call_args.args[0]
|
||||
assert len(result) == 1
|
||||
assert result[0]["is_remote"] is False
|
||||
|
||||
|
||||
def test_scrape_location_not_in_map_returns_empty():
|
||||
"""Location not in location_map → [] without raising."""
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_SINGLE_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get") as mock_get:
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "Portland, OR")
|
||||
|
||||
assert result == []
|
||||
mock_get.assert_not_called()
|
||||
|
||||
|
||||
def test_hours_old_filter():
|
||||
"""Items older than hours_old are excluded."""
|
||||
profile = {"titles": ["Customer Success Manager"], "hours_old": 48}
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_SINGLE_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
return_value=_mock_resp(_OLD_ITEM_RSS)):
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(profile, "San Francisco Bay Area, CA")
|
||||
|
||||
assert result == []
|
||||
|
||||
|
||||
def test_dedup_within_run():
|
||||
"""Same URL from two different metros is only returned once."""
|
||||
same_url_rss = _make_rss([{
|
||||
"title": "CSM Role",
|
||||
"link": "https://sfbay.craigslist.org/jjj/d/csm/1234.html",
|
||||
"description": "Same job.",
|
||||
"pubDate": _pubdate(1),
|
||||
}])
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_TWO_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
return_value=_mock_resp(same_url_rss)):
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "Remote")
|
||||
|
||||
urls = [r["url"] for r in result]
|
||||
assert len(urls) == len(set(urls))
|
||||
|
||||
|
||||
def test_http_error_graceful():
|
||||
"""HTTP error → [] without raising."""
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_SINGLE_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
side_effect=requests.RequestException("timeout")):
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
|
||||
|
||||
assert result == []
|
||||
|
||||
|
||||
def test_results_wanted_cap():
|
||||
"""Never returns more than results_wanted items."""
|
||||
with patch("scripts.custom_boards.craigslist._load_config",
|
||||
return_value=_TWO_METRO_CONFIG):
|
||||
with patch("scripts.custom_boards.craigslist.requests.get",
|
||||
return_value=_mock_resp(_TWO_ITEM_RSS)):
|
||||
from scripts.custom_boards import craigslist
|
||||
result = craigslist.scrape(_PROFILE, "Remote", results_wanted=1)
|
||||
|
||||
assert len(result) <= 1
|
||||
```
|
||||
|
||||
**Step 2: Run tests to verify they all fail**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_craigslist.py -v
|
||||
```
|
||||
|
||||
Expected: `ModuleNotFoundError: No module named 'scripts.custom_boards.craigslist'`
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Implement `scripts/custom_boards/craigslist.py`
|
||||
|
||||
**Files:**
|
||||
- Create: `scripts/custom_boards/craigslist.py`
|
||||
|
||||
**Step 1: Create the scraper**
|
||||
|
||||
```python
|
||||
"""Craigslist job scraper — RSS-based.
|
||||
|
||||
Uses Craigslist's native RSS feed endpoint for discovery.
|
||||
Full job description is populated by the scrape_url background task.
|
||||
Company name and salary (not structured in Craigslist listings) are
|
||||
extracted from the description body by the enrich_craigslist task.
|
||||
|
||||
Config: config/craigslist.yaml (gitignored — metro list + location map)
|
||||
config/craigslist.yaml.example (committed template)
|
||||
|
||||
Returns a list of dicts compatible with scripts.db.insert_job().
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml"
|
||||
_DEFAULT_CATEGORY = "jjj"
|
||||
_HEADERS = {
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||
)
|
||||
}
|
||||
_TIMEOUT = 15
|
||||
_SLEEP = 0.5 # seconds between requests — easy to make configurable later
|
||||
|
||||
|
||||
def _load_config() -> dict:
|
||||
if not _CONFIG_PATH.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Craigslist config not found: {_CONFIG_PATH}\n"
|
||||
"Copy config/craigslist.yaml.example → config/craigslist.yaml "
|
||||
"and configure your target metros."
|
||||
)
|
||||
cfg = yaml.safe_load(_CONFIG_PATH.read_text()) or {}
|
||||
if not cfg.get("metros"):
|
||||
raise ValueError(
|
||||
"config/craigslist.yaml must contain at least one entry under 'metros'."
|
||||
)
|
||||
return cfg
|
||||
|
||||
|
||||
def _rss_url(metro: str, category: str, query: str) -> str:
|
||||
return (
|
||||
f"https://{metro}.craigslist.org/search/{category}"
|
||||
f"?query={quote_plus(query)}&format=rss&sort=date"
|
||||
)
|
||||
|
||||
|
||||
def _parse_pubdate(pubdate_str: str) -> datetime | None:
|
||||
"""Parse an RSS pubDate string to a timezone-aware datetime."""
|
||||
try:
|
||||
return parsedate_to_datetime(pubdate_str)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _fetch_rss(url: str) -> list[dict]:
|
||||
"""Fetch and parse a Craigslist RSS feed. Returns list of raw item dicts."""
|
||||
resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
|
||||
resp.raise_for_status()
|
||||
try:
|
||||
root = ET.fromstring(resp.content)
|
||||
except ET.ParseError as exc:
|
||||
raise ValueError(f"Malformed RSS XML: {exc}") from exc
|
||||
|
||||
items = []
|
||||
for item in root.findall(".//item"):
|
||||
def _text(tag: str, _item=item) -> str:
|
||||
el = _item.find(tag)
|
||||
return (el.text or "").strip() if el is not None else ""
|
||||
|
||||
items.append({
|
||||
"title": _text("title"),
|
||||
"link": _text("link"),
|
||||
"description": _text("description"),
|
||||
"pubDate": _text("pubDate"),
|
||||
})
|
||||
return items
|
||||
|
||||
|
||||
def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
|
||||
"""Fetch jobs from Craigslist RSS for a single location.
|
||||
|
||||
Args:
|
||||
profile: Search profile dict from search_profiles.yaml.
|
||||
location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
|
||||
results_wanted: Maximum results to return across all metros and titles.
|
||||
|
||||
Returns:
|
||||
List of job dicts with keys: title, company, url, source, location,
|
||||
is_remote, salary, description.
|
||||
company/salary are empty — filled later by enrich_craigslist task.
|
||||
"""
|
||||
try:
|
||||
cfg = _load_config()
|
||||
except (FileNotFoundError, ValueError) as exc:
|
||||
print(f" [craigslist] Skipped — {exc}")
|
||||
return []
|
||||
|
||||
metros_all: list[str] = cfg.get("metros", [])
|
||||
location_map: dict[str, str] = cfg.get("location_map", {})
|
||||
category: str = cfg.get("category") or _DEFAULT_CATEGORY
|
||||
|
||||
is_remote_search = location.lower() == "remote"
|
||||
if is_remote_search:
|
||||
metros = metros_all
|
||||
else:
|
||||
metro = location_map.get(location)
|
||||
if not metro:
|
||||
print(f" [craigslist] No metro mapping for '{location}' — skipping")
|
||||
return []
|
||||
metros = [metro]
|
||||
|
||||
titles: list[str] = profile.get("titles", [])
|
||||
hours_old: int = profile.get("hours_old", 240)
|
||||
cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600)
|
||||
|
||||
seen_urls: set[str] = set()
|
||||
results: list[dict] = []
|
||||
|
||||
for metro in metros:
|
||||
if len(results) >= results_wanted:
|
||||
break
|
||||
|
||||
for title in titles:
|
||||
if len(results) >= results_wanted:
|
||||
break
|
||||
|
||||
url = _rss_url(metro, category, title)
|
||||
try:
|
||||
items = _fetch_rss(url)
|
||||
except requests.RequestException as exc:
|
||||
print(f" [craigslist] HTTP error ({metro}/{title}): {exc}")
|
||||
time.sleep(_SLEEP)
|
||||
continue
|
||||
except ValueError as exc:
|
||||
print(f" [craigslist] Parse error ({metro}/{title}): {exc}")
|
||||
time.sleep(_SLEEP)
|
||||
continue
|
||||
|
||||
for item in items:
|
||||
if len(results) >= results_wanted:
|
||||
break
|
||||
|
||||
item_url = item.get("link", "")
|
||||
if not item_url or item_url in seen_urls:
|
||||
continue
|
||||
|
||||
pub = _parse_pubdate(item.get("pubDate", ""))
|
||||
if pub and pub.timestamp() < cutoff:
|
||||
continue
|
||||
|
||||
seen_urls.add(item_url)
|
||||
results.append({
|
||||
"title": item.get("title", ""),
|
||||
"company": "",
|
||||
"url": item_url,
|
||||
"source": "craigslist",
|
||||
"location": f"{metro} (Craigslist)",
|
||||
"is_remote": is_remote_search,
|
||||
"salary": "",
|
||||
"description": "",
|
||||
})
|
||||
|
||||
time.sleep(_SLEEP)
|
||||
|
||||
return results[:results_wanted]
|
||||
```
|
||||
|
||||
**Step 2: Run tests**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_craigslist.py -v
|
||||
```
|
||||
|
||||
Expected: all 8 PASS
|
||||
|
||||
**Step 3: Run full test suite to check for regressions**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
```
|
||||
|
||||
Expected: all existing tests still PASS
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/custom_boards/craigslist.py tests/test_craigslist.py
|
||||
git commit -m "feat: add Craigslist RSS scraper to custom_boards"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Wire into discover.py + search_profiles.yaml
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/discover.py:20-32`
|
||||
- Modify: `config/search_profiles.yaml`
|
||||
|
||||
**Step 1: Add to `CUSTOM_SCRAPERS` registry in `discover.py`**
|
||||
|
||||
Find this block (around line 20):
|
||||
|
||||
```python
|
||||
from scripts.custom_boards import adzuna as _adzuna
|
||||
from scripts.custom_boards import theladders as _theladders
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```python
|
||||
from scripts.custom_boards import adzuna as _adzuna
|
||||
from scripts.custom_boards import theladders as _theladders
|
||||
from scripts.custom_boards import craigslist as _craigslist
|
||||
```
|
||||
|
||||
Find:
|
||||
|
||||
```python
|
||||
CUSTOM_SCRAPERS: dict[str, object] = {
|
||||
"adzuna": _adzuna.scrape,
|
||||
"theladders": _theladders.scrape,
|
||||
}
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```python
|
||||
CUSTOM_SCRAPERS: dict[str, object] = {
|
||||
"adzuna": _adzuna.scrape,
|
||||
"theladders": _theladders.scrape,
|
||||
"craigslist": _craigslist.scrape,
|
||||
}
|
||||
```
|
||||
|
||||
**Step 2: Add `craigslist` to relevant profiles in `config/search_profiles.yaml`**
|
||||
|
||||
For each profile that has `custom_boards:`, add `- craigslist`. Example — the `cs_leadership` profile currently has:
|
||||
|
||||
```yaml
|
||||
custom_boards:
|
||||
- adzuna
|
||||
- theladders
|
||||
```
|
||||
|
||||
Change to:
|
||||
|
||||
```yaml
|
||||
custom_boards:
|
||||
- adzuna
|
||||
- theladders
|
||||
- craigslist
|
||||
```
|
||||
|
||||
Repeat for all profiles where Craigslist makes sense (all of them — remote + SF Bay Area are both mapped).
|
||||
|
||||
**Step 3: Verify discover.py imports cleanly**
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python -c "from scripts.discover import CUSTOM_SCRAPERS; print(list(CUSTOM_SCRAPERS.keys()))"
|
||||
```
|
||||
|
||||
Expected: `['adzuna', 'theladders', 'craigslist']`
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/discover.py config/search_profiles.yaml
|
||||
git commit -m "feat: register craigslist scraper in discover.py and search profiles"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 5: LLM enrichment — extract company + salary for Craigslist jobs
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/enrich_descriptions.py`
|
||||
- Modify: `scripts/task_runner.py`
|
||||
|
||||
**Step 1: Read `scripts/task_runner.py`** to understand the `scrape_url` completion handler before editing.
|
||||
|
||||
**Step 2: Add `enrich_craigslist_fields()` to `enrich_descriptions.py`**
|
||||
|
||||
Add this function after `enrich_all_descriptions` (before `if __name__ == "__main__"`):
|
||||
|
||||
```python
|
||||
def enrich_craigslist_fields(
|
||||
db_path: Path = DEFAULT_DB,
|
||||
    job_id: int | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Use LLM to extract company name and salary from a Craigslist job description.
|
||||
|
||||
Called after scrape_url populates the description for a craigslist job.
|
||||
Only runs when: source='craigslist', company='', description non-empty.
|
||||
|
||||
Returns dict with keys 'company' and/or 'salary' (may be empty strings).
|
||||
"""
|
||||
import sqlite3 as _sq
|
||||
conn = _sq.connect(db_path)
|
||||
conn.row_factory = _sq.Row
|
||||
row = conn.execute(
|
||||
"SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,)
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
if not row:
|
||||
return {}
|
||||
if row["source"] != "craigslist":
|
||||
return {}
|
||||
if row["company"]: # already populated
|
||||
return {}
|
||||
if not (row["description"] or "").strip():
|
||||
return {}
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from scripts.llm_router import LLMRouter
|
||||
|
||||
prompt = (
|
||||
"Extract the following from this job posting. "
|
||||
"Return JSON only, no commentary.\n\n"
|
||||
'{"company": "<company name or empty string>", '
|
||||
'"salary": "<salary/compensation or empty string>"}\n\n'
|
||||
f"Posting:\n{row['description'][:3000]}"
|
||||
)
|
||||
|
||||
try:
|
||||
router = LLMRouter()
|
||||
raw = router.complete(prompt)
|
||||
except Exception as exc:
|
||||
print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
|
||||
return {}
|
||||
|
||||
import json, re
|
||||
try:
|
||||
# Strip markdown code fences if present
|
||||
clean = re.sub(r"```(?:json)?|```", "", raw).strip()
|
||||
fields = json.loads(clean)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
|
||||
return {}
|
||||
|
||||
extracted = {
|
||||
k: (fields.get(k) or "").strip()
|
||||
for k in ("company", "salary")
|
||||
if (fields.get(k) or "").strip()
|
||||
}
|
||||
|
||||
if extracted:
|
||||
from scripts.db import update_job_fields
|
||||
update_job_fields(db_path, job_id, extracted)
|
||||
print(f"[enrich_craigslist] job {job_id}: "
|
||||
f"company={extracted.get('company', '—')} "
|
||||
f"salary={extracted.get('salary', '—')}")
|
||||
|
||||
return extracted
|
||||
```
|
||||
|
||||
Also add `import sys` to the top of `enrich_descriptions.py` if not already present.
|
||||
|
||||
**Step 3: Add `enrich_craigslist` task type to `task_runner.py`**
|
||||
|
||||
In `_run_task`, add a new `elif` branch. Find the block that handles `scrape_url` and add after it:
|
||||
|
||||
```python
|
||||
elif task_type == "enrich_craigslist":
|
||||
from scripts.enrich_descriptions import enrich_craigslist_fields
|
||||
extracted = enrich_craigslist_fields(db_path, job_id)
|
||||
company = extracted.get("company", "")
|
||||
msg = f"company={company}" if company else "no company found"
|
||||
update_task_status(db_path, task_id, "completed", error=msg)
|
||||
return
|
||||
```
|
||||
|
||||
**Step 4: Auto-submit `enrich_craigslist` after `scrape_url` for Craigslist jobs**
|
||||
|
||||
Still in `task_runner.py`, find the `scrape_url` completion handler. After the `update_task_status` call for `scrape_url`, add:
|
||||
|
||||
```python
|
||||
# Auto-enrich company/salary for Craigslist jobs
|
||||
import sqlite3 as _sq
|
||||
_conn = _sq.connect(db_path)
|
||||
_conn.row_factory = _sq.Row
|
||||
_job = _conn.execute(
|
||||
"SELECT source, company FROM jobs WHERE id=?", (job_id,)
|
||||
).fetchone()
|
||||
_conn.close()
|
||||
if _job and _job["source"] == "craigslist" and not _job["company"]:
|
||||
submit_task(db_path, "enrich_craigslist", job_id)
|
||||
```
|
||||
|
||||
**Step 5: Smoke test — run a discovery cycle and check a craigslist job**
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python -c "
|
||||
from scripts.custom_boards.craigslist import scrape
|
||||
jobs = scrape({'titles': ['Customer Success Manager'], 'hours_old': 48}, 'San Francisco Bay Area, CA', results_wanted=3)
|
||||
for j in jobs:
|
||||
print(j['title'], '|', j['url'])
|
||||
"
|
||||
```
|
||||
|
||||
Expected: 0–3 job dicts printed (may be 0 if no recent postings — that's fine).
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/enrich_descriptions.py scripts/task_runner.py
|
||||
git commit -m "feat: add enrich_craigslist task for LLM company/salary extraction"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Final: push to remote
|
||||
|
||||
```bash
|
||||
git push origin main
|
||||
```
|
||||
|
|
@ -1,291 +0,0 @@
|
|||
# Expanded First-Run Wizard — Design
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**Status:** Approved
|
||||
|
||||
---
|
||||
|
||||
## Goal
|
||||
|
||||
Replace the current 5-step surface-level wizard with a comprehensive onboarding flow that covers resume upload/parsing/building, guided config walkthroughs, LLM-assisted generation for key sections, and tier-based feature gating — while enforcing a minimum viable setup before the user can access the main app.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
`0_Setup.py` becomes a thin orchestrator. All step logic moves into a new `app/wizard/` package. Resume parsing moves into `scripts/resume_parser.py`.
|
||||
|
||||
```
|
||||
app/
|
||||
app.py # gate: user.yaml exists AND wizard_complete: true
|
||||
wizard/
|
||||
tiers.py # tier definitions, feature gates, can_use() helper
|
||||
step_hardware.py # Step 1: GPU detection → profile recommendation
|
||||
step_tier.py # Step 2: free/paid/premium + dev_tier_override
|
||||
step_identity.py # Step 3: name/email/phone/linkedin/career_summary
|
||||
step_resume.py # Step 4: upload→parse OR guided form builder
|
||||
step_inference.py # Step 5: LLM backend config + API keys
|
||||
step_search.py # Step 6: job titles, locations, boards, keywords
|
||||
step_integrations.py # Step 7: optional cloud/calendar/notification services
|
||||
pages/
|
||||
0_Setup.py # imports steps, drives progress state
|
||||
scripts/
|
||||
resume_parser.py # PDF/DOCX text extraction → LLM structuring
|
||||
integrations/
|
||||
__init__.py # registry: {name: IntegrationBase subclass}
|
||||
base.py # IntegrationBase: connect(), test(), sync(), fields()
|
||||
notion.py
|
||||
google_drive.py
|
||||
google_sheets.py
|
||||
airtable.py
|
||||
dropbox.py
|
||||
onedrive.py
|
||||
mega.py
|
||||
nextcloud.py
|
||||
google_calendar.py
|
||||
apple_calendar.py # CalDAV
|
||||
slack.py
|
||||
discord.py # webhook only
|
||||
home_assistant.py
|
||||
config/
|
||||
integrations/ # one gitignored yaml per connected service
|
||||
notion.yaml.example
|
||||
google_drive.yaml.example
|
||||
...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Gate Logic
|
||||
|
||||
`app.py` gate changes from a single existence check to:
|
||||
|
||||
```python
|
||||
if not UserProfile.exists(_USER_YAML):
|
||||
show_wizard()
|
||||
elif not _profile.wizard_complete:
|
||||
show_wizard() # resumes at last incomplete mandatory step
|
||||
```
|
||||
|
||||
`wizard_complete: false` is written to `user.yaml` at the start of Step 3 (identity). It is only flipped to `true` when all mandatory steps pass validation on the final Finish action.
|
||||
|
||||
---
|
||||
|
||||
## Mandatory Steps
|
||||
|
||||
The wizard cannot be exited until all six mandatory steps pass validation.
|
||||
|
||||
| Step | File | Minimum to pass |
|
||||
|------|------|----------------|
|
||||
| 1. Hardware | `step_hardware.py` | Profile selected (auto-detected default accepted) |
|
||||
| 2. Tier | `step_tier.py` | Tier selected (free is valid) |
|
||||
| 3. Identity | `step_identity.py` | name + email + career_summary non-empty |
|
||||
| 4. Resume | `step_resume.py` | At least one work experience entry |
|
||||
| 5. Inference | `step_inference.py` | At least one working LLM endpoint confirmed |
|
||||
| 6. Search | `step_search.py` | At least one job title + one location |
|
||||
|
||||
Each mandatory step's module exports `validate(data: dict) -> list[str]` — an errors list; empty = pass. These are pure functions, fully testable without Streamlit.
|
||||
|
||||
---
|
||||
|
||||
## Tier System
|
||||
|
||||
### `app/wizard/tiers.py`
|
||||
|
||||
```python
|
||||
TIERS = ["free", "paid", "premium"]
|
||||
|
||||
FEATURES = {
|
||||
# Wizard LLM generation
|
||||
"llm_career_summary": "paid",
|
||||
"llm_expand_bullets": "paid",
|
||||
"llm_suggest_skills": "paid",
|
||||
"llm_voice_guidelines": "premium",
|
||||
"llm_job_titles": "paid",
|
||||
"llm_keywords_blocklist": "paid",
|
||||
"llm_mission_notes": "paid",
|
||||
|
||||
# App features
|
||||
"company_research": "paid",
|
||||
"interview_prep": "paid",
|
||||
"email_classifier": "paid",
|
||||
"survey_assistant": "paid",
|
||||
"model_fine_tuning": "premium",
|
||||
"shared_cover_writer_model": "paid",
|
||||
"multi_user": "premium",
|
||||
    "search_profiles_limit": {"free": 1, "paid": 5, "premium": None},
|
||||
|
||||
# Integrations
|
||||
"notion_sync": "paid",
|
||||
"google_sheets_sync": "paid",
|
||||
"airtable_sync": "paid",
|
||||
"google_calendar_sync": "paid",
|
||||
"apple_calendar_sync": "paid",
|
||||
"slack_notifications": "paid",
|
||||
}
|
||||
# Free-tier integrations: google_drive, dropbox, onedrive, mega,
|
||||
# nextcloud, discord, home_assistant
|
||||
```
|
||||
|
||||
### Storage in `user.yaml`
|
||||
|
||||
```yaml
|
||||
tier: free # free | paid | premium
|
||||
dev_tier_override: premium # overrides tier locally — for testing only
|
||||
```
|
||||
|
||||
### Dev override UI
|
||||
|
||||
Settings → Developer tab (visible when `dev_tier_override` is set or `DEV_MODE=true` in `.env`). Single selectbox to switch tier instantly — page reruns, all gates re-evaluate, no restart needed. Also exposes a "Reset wizard" button that sets `wizard_complete: false` to re-enter the wizard without deleting existing config.
|
||||
|
||||
### Gated UI behaviour
|
||||
|
||||
Paid/premium features show a muted `tier_label()` badge (`🔒 Paid` / `⭐ Premium`) and a disabled state rather than being hidden entirely — free users see what they're missing. Clicking a locked `✨` button opens an upsell tooltip, not an error.
|
||||
|
||||
---
|
||||
|
||||
## Resume Handling (Step 4)
|
||||
|
||||
### Fast path — upload
|
||||
|
||||
1. PDF → `pdfminer.six` extracts raw text
|
||||
2. DOCX → `python-docx` extracts paragraphs
|
||||
3. Raw text → LLM structures into `plain_text_resume.yaml` fields via background task
|
||||
4. Populated form rendered for review/correction
|
||||
|
||||
### Fallback — guided form builder
|
||||
|
||||
Walks through `plain_text_resume.yaml` section by section:
|
||||
- Personal info (pre-filled from Step 3)
|
||||
- Work experience (add/remove entries)
|
||||
- Education
|
||||
- Skills
|
||||
- Achievements (optional)
|
||||
|
||||
Both paths converge on the same review form before saving. `career_summary` from the resume is fed back to populate Step 3 if not already set.
|
||||
|
||||
### Outputs
|
||||
|
||||
- `aihawk/data_folder/plain_text_resume.yaml`
|
||||
- `career_summary` written back to `user.yaml`
|
||||
|
||||
---
|
||||
|
||||
## LLM Generation Map
|
||||
|
||||
All `✨` actions submit a background task via `task_runner.py` using task type `wizard_generate` with a `section` parameter. The wizard step polls via `@st.fragment(run_every=3)` and shows inline status stages. Results land in `session_state` keyed by section and auto-populate the field on completion.
|
||||
|
||||
**Status stages for all wizard generation tasks:**
|
||||
`Queued → Analyzing → Generating → Done`
|
||||
|
||||
| Step | Action | Tier | Input | Output |
|
||||
|------|--------|------|-------|--------|
|
||||
| Identity | ✨ Generate career summary | Paid | Resume text | `career_summary` in user.yaml |
|
||||
| Resume | ✨ Expand bullet points | Paid | Rough responsibility notes | Polished STAR-format bullets |
|
||||
| Resume | ✨ Suggest skills | Paid | Experience descriptions | Skills list additions |
|
||||
| Resume | ✨ Infer voice guidelines | Premium | Resume + uploaded cover letters | Voice/tone hints in user.yaml |
|
||||
| Search | ✨ Suggest job titles | Paid | Resume + current titles | Additional title suggestions |
|
||||
| Search | ✨ Suggest keywords | Paid | Resume + titles | `resume_keywords.yaml` additions |
|
||||
| Search | ✨ Suggest blocklist | Paid | Resume + titles | `blocklist.yaml` additions |
|
||||
| My Profile (post-wizard) | ✨ Suggest mission notes | Paid | Resume + LinkedIn URL | `mission_preferences` notes |
|
||||
|
||||
---
|
||||
|
||||
## Optional Steps — Home Banners
|
||||
|
||||
After wizard completion, dismissible banners on the Home page surface remaining setup. Dismissed state stored as `dismissed_banners: [...]` in `user.yaml`.
|
||||
|
||||
| Banner | Links to |
|
||||
|--------|---------|
|
||||
| Connect a cloud service | Settings → Integrations |
|
||||
| Set up email sync | Settings → Email |
|
||||
| Set up email labels | Settings → Email (label guide) |
|
||||
| Tune your mission preferences | Settings → My Profile |
|
||||
| Configure keywords & blocklist | Settings → Search |
|
||||
| Upload cover letter corpus | Settings → Fine-Tune |
|
||||
| Configure LinkedIn Easy Apply | Settings → AIHawk |
|
||||
| Set up company research | Settings → Services (SearXNG) |
|
||||
| Build a target company list | Settings → Search |
|
||||
| Set up notifications | Settings → Integrations |
|
||||
| Tune a model | Settings → Fine-Tune |
|
||||
| Review training data | Settings → Fine-Tune |
|
||||
| Set up calendar sync | Settings → Integrations |
|
||||
|
||||
---
|
||||
|
||||
## Integrations Architecture
|
||||
|
||||
The registry pattern means adding a new integration requires one file in `scripts/integrations/` and one `.yaml.example` in `config/integrations/` — the wizard and Settings tab auto-discover it.
|
||||
|
||||
```python
|
||||
class IntegrationBase:
    """Contract that every integration plugin implements.

    Subclasses are auto-discovered by the first-run wizard and the
    Settings tab, so adding a new integration means one subclass in
    ``scripts/integrations/`` plus one ``.yaml.example`` stub in
    ``config/integrations/``.
    """

    # registry key, e.g. "notion"
    name: str
    # human-readable label shown in the UI
    label: str
    # access tier required for this integration ("Free" / "Paid")
    tier: str

    def connect(self, config: dict) -> bool: ...

    def test(self) -> bool: ...

    def sync(self, jobs: list[dict]) -> int: ...

    # form field definitions rendered as the wizard card
    def fields(self) -> list[dict]: ...
|
||||
```
|
||||
|
||||
Integration configs written to `config/integrations/<name>.yaml` only after a successful `test()` — never on partial input.
|
||||
|
||||
### v1 Integration List
|
||||
|
||||
| Integration | Purpose | Tier |
|
||||
|-------------|---------|------|
|
||||
| Notion | Job tracking DB sync | Paid |
|
||||
| Notion Calendar | Covered by Notion integration | Paid |
|
||||
| Google Sheets | Simpler tracker alternative | Paid |
|
||||
| Airtable | Alternative tracker | Paid |
|
||||
| Google Drive | Resume/cover letter storage | Free |
|
||||
| Dropbox | Document storage | Free |
|
||||
| OneDrive | Document storage | Free |
|
||||
| MEGA | Document storage (privacy-first, cross-platform) | Free |
|
||||
| Nextcloud | Self-hosted document storage | Free |
|
||||
| Google Calendar | Write interview dates | Paid |
|
||||
| Apple Calendar | Write interview dates (CalDAV) | Paid |
|
||||
| Slack | Stage change notifications | Paid |
|
||||
| Discord | Stage change notifications (webhook) | Free |
|
||||
| Home Assistant | Notifications + automations (self-hosted) | Free |
|
||||
|
||||
---
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Wizard step → Written to
|
||||
──────────────────────────────────────────────────────────────
|
||||
Hardware → user.yaml (inference_profile)
|
||||
Tier → user.yaml (tier, dev_tier_override)
|
||||
Identity → user.yaml (name, email, phone, linkedin,
|
||||
career_summary, wizard_complete: false)
|
||||
Resume (upload) → aihawk/data_folder/plain_text_resume.yaml
|
||||
Resume (builder) → aihawk/data_folder/plain_text_resume.yaml
|
||||
Inference → user.yaml (services block)
|
||||
.env (ANTHROPIC_API_KEY, OPENAI_COMPAT_URL/KEY)
|
||||
Search → config/search_profiles.yaml
|
||||
config/resume_keywords.yaml
|
||||
config/blocklist.yaml
|
||||
Finish → user.yaml (wizard_complete: true)
|
||||
config/llm.yaml (via apply_service_urls())
|
||||
Integrations → config/integrations/<name>.yaml (per service,
|
||||
only after successful test())
|
||||
Background tasks → staging.db background_tasks table
|
||||
LLM results → session_state[section] → field → user saves step
|
||||
```
|
||||
|
||||
**Key rules:**
|
||||
- Each mandatory step writes immediately on "Next" — partial progress survives crash or browser close
|
||||
- `apply_service_urls()` called once at Finish, not per-step
|
||||
- Integration configs never written on partial input — only after `test()` passes
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
- **Tier switching:** Settings → Developer tab selectbox — instant rerun, no restart
|
||||
- **Wizard re-entry:** Settings → Developer "Reset wizard" button sets `wizard_complete: false`
|
||||
- **Unit tests:** `validate(data) -> list[str]` on each step module — pure functions, no Streamlit
|
||||
- **Integration tests:** `tests/test_wizard_flow.py` — full step sequence with mock LLM router and mock file writes
|
||||
- **`DEV_MODE=true`** in `.env` makes Developer tab always visible regardless of `dev_tier_override`
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,108 +0,0 @@
|
|||
# Session Handoff — Generalization Implementation
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**For:** Next Claude session implementing the public fork
|
||||
|
||||
---
|
||||
|
||||
## Current State
|
||||
|
||||
The personal version (`/devl/job-seeker/`) is **complete and working** on `main`.
|
||||
|
||||
### What was completed in the 2026-02-24 session
|
||||
- Survey Assistant page (`app/pages/7_Survey.py`) — text paste + screenshot via moondream2
|
||||
- Vision Service (`scripts/vision_service/`) — FastAPI on port 8002, `job-seeker-vision` conda env
|
||||
- LLM Router `images=` parameter — vision-aware routing
|
||||
- `survey_responses` table + `survey_at` column in SQLite
|
||||
- Kanban consolidation — applied+survey as pre-kanban section; offer+hired merged column
|
||||
- `survey_received` email classifier label
|
||||
- Forgejo remote: https://git.opensourcesolarpunk.com/pyr0ball/job-seeker.git
|
||||
|
||||
### Remote repo
|
||||
```
|
||||
git remote: https://git.opensourcesolarpunk.com/pyr0ball/job-seeker.git
|
||||
branch: main (up to date as of 2026-02-24)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What to Implement Next
|
||||
|
||||
Follow the plan at `docs/plans/2026-02-24-job-seeker-app-generalize.md`.
|
||||
The design doc is at `docs/plans/2026-02-24-generalize-design.md`.
|
||||
|
||||
**Target directory:** `/Library/Development/devl/job-seeker-app/` (new repo, no shared history)
|
||||
|
||||
**CRITICAL:** Do NOT start implementing the public fork until explicitly asked. The user confirmed this.
|
||||
|
||||
---
|
||||
|
||||
## Complete List of Hardcoded Personal References
|
||||
|
||||
Everything that must be extracted into `config/user.yaml` via a `UserProfile` class:
|
||||
|
||||
| File | Hardcoded value | Generalized as |
|
||||
|------|----------------|----------------|
|
||||
| `company_research.py` | `"Meghan McCann"` in prompts | `profile.name` |
|
||||
| `company_research.py` | `_NDA_COMPANIES = {"upguard"}` | `profile.nda_companies` |
|
||||
| `company_research.py` | `_SCRAPER_DIR = Path("/Library/...")` | bundled in Docker image |
|
||||
| `generate_cover_letter.py` | `SYSTEM_CONTEXT` with Meghan's bio | `profile.career_summary` |
|
||||
| `generate_cover_letter.py` | `LETTERS_DIR = Path("/Library/...")` | `profile.docs_dir` |
|
||||
| `4_Apply.py` | contact block (name/email/phone) | `profile.*` |
|
||||
| `4_Apply.py` | `DOCS_DIR = Path("/Library/...")` | `profile.docs_dir` |
|
||||
| `5_Interviews.py` | email assistant persona "Meghan McCann is a Customer Success..." | `profile.name + profile.career_summary` |
|
||||
| `6_Interview_Prep.py` | `"Meghan"` in interviewer prompts | `profile.name` |
|
||||
| `7_Survey.py` | `_SURVEY_SYSTEM` — "The candidate values collaborative teamwork..." | `profile.career_summary` or survey persona field |
|
||||
| `scripts/vision_service/main.py` | `model_id = "vikhyatk/moondream2"`, `revision = "2025-01-09"` | `config/llm.yaml` vision_service block |
|
||||
| `match.py` | `RESUME_PATH = Path("/Library/...Meghan_McCann_Resume...")` | configurable in Settings |
|
||||
| `Home.py` | `"Meghan's Job Search"` | `f"{profile.name}'s Job Search"` |
|
||||
| `finetune_local.py` | all `/Library/` paths + `"meghan-cover-writer"` | `profile.*` |
|
||||
| `2_Settings.py` | `PFP_DIR`, host service paths (manage-services.sh etc.) | removed / compose-driven |
|
||||
| `config/llm.yaml` | hard-coded `base_url` values | auto-generated from `user.yaml` |
|
||||
|
||||
---
|
||||
|
||||
## New Components to Dockerize
|
||||
|
||||
### Vision Service
|
||||
- Currently: `job-seeker-vision` conda env, port 8002, `manage-vision.sh`
|
||||
- In public fork: separate container in `single-gpu` / `dual-gpu` profiles only
|
||||
- In `remote` / `cpu` profiles: vision falls back to cloud backends
|
||||
- Model configurable via env var in container (default: moondream2)
|
||||
|
||||
### CompanyScraper
|
||||
- Currently: `/Library/Development/scrapers/companyScraper.py` (external path)
|
||||
- In public fork: bundled directly in the app image at a fixed internal path
|
||||
|
||||
---
|
||||
|
||||
## Key Architectural Decisions (from design doc)
|
||||
|
||||
1. **`UserProfile` class** wraps `config/user.yaml` — imported everywhere personal data is used
|
||||
2. **Four Docker Compose profiles:** `remote`, `cpu`, `single-gpu`, `dual-gpu`
|
||||
3. **First-run wizard** gates the app until `config/user.yaml` exists (5-step flow)
|
||||
4. **No shared git history** with personal repo — fresh `git init` in target dir
|
||||
5. **`.env` file** generated by wizard (never hand-edited), gitignored, contains resolved paths
|
||||
6. **`config/llm.yaml` base URLs** are derived values auto-generated from `user.yaml` services block
|
||||
7. **Claude Code Wrapper + Copilot Wrapper** removed from Services tab entirely
|
||||
|
||||
---
|
||||
|
||||
## Files/Paths in Personal Repo to Reference
|
||||
|
||||
- Entry point: `app/app.py`
|
||||
- All pages: `app/pages/`
|
||||
- DB helpers: `scripts/db.py` (single source of truth for schema)
|
||||
- LLM router: `scripts/llm_router.py`
|
||||
- Config: `config/llm.yaml`, `config/search_profiles.yaml`
|
||||
- Vision service: `scripts/vision_service/` (FastAPI + environment.yml)
|
||||
- Test suite: `tests/`
|
||||
|
||||
---
|
||||
|
||||
## Skill to Use
|
||||
|
||||
When starting the generalization session:
|
||||
1. Load `superpowers:executing-plans` skill
|
||||
2. Reference `docs/plans/2026-02-24-job-seeker-app-generalize.md` as the plan
|
||||
3. Work task-by-task with review checkpoints
|
||||
|
|
@ -1,276 +0,0 @@
|
|||
# Design: Generalizing Job Seeker for Public Use
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**Status:** Approved
|
||||
**Target directory:** `/Library/Development/devl/job-seeker-app/`
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Fork the personal job-seeker app into a fully generalized version suitable for any job seeker.
|
||||
The personal version (`/devl/job-seeker/`) is preserved as-is on `main`.
|
||||
The public version is a separate local directory with a fresh git repo — no shared history.
|
||||
|
||||
Core goals:
|
||||
- Extract every hard-coded personal reference into a `config/user.yaml` profile
|
||||
- Docker Compose stack with profiles covering all GPU/inference configurations
|
||||
- First-run wizard that gates the app until the user is configured
|
||||
- Optional fine-tune wizard in Settings for users with a cover letter corpus and a GPU
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
The app runs via `docker compose` with four named profiles:
|
||||
|
||||
| Profile | Containers | Use case |
|
||||
|---|---|---|
|
||||
| `remote` | app + searxng | No GPU; all LLM calls go to external APIs |
|
||||
| `cpu` | app + ollama + searxng | No GPU; local models run on CPU (slow) |
|
||||
| `single-gpu` | app + ollama + searxng | One GPU shared for cover letters + research |
|
||||
| `dual-gpu` | app + ollama + vllm + searxng | GPU 0 = Ollama, GPU 1 = vLLM |
|
||||
|
||||
**SearXNG always runs** regardless of profile — it's lightweight and useful in every mode.
|
||||
|
||||
**Vision Service** runs as a separate container only in `single-gpu` and `dual-gpu` profiles.
|
||||
In `remote` profile, vision falls back to `claude_code` / `anthropic` backends.
|
||||
In `cpu` profile, vision falls back to cloud backends (moondream2 on CPU is impractically slow).
|
||||
|
||||
SQLite lives in a named Docker volume mount (`./data/`). No separate DB container.
|
||||
|
||||
CompanyScraper (`companyScraper.py`) is bundled directly into the app image — no external
|
||||
path dependency on the host.
|
||||
|
||||
The Claude Code Wrapper and GitHub Copilot Wrapper service entries are removed from the
|
||||
Services tab entirely. Users bring their own OpenAI-compatible endpoints via `config/llm.yaml`.
|
||||
|
||||
---
|
||||
|
||||
## User Profile (`config/user.yaml`)
|
||||
|
||||
Single source of truth for all personal data. Checked at startup — if absent, the first-run
|
||||
wizard is shown before any other page is accessible.
|
||||
|
||||
```yaml
|
||||
# Identity — drives all LLM personas, PDF headers, UI labels
|
||||
name: ""
|
||||
email: ""
|
||||
phone: ""
|
||||
linkedin: ""
|
||||
career_summary: "" # paragraph injected into cover letter system prompt
|
||||
|
||||
# Sensitive employers — masked in research briefs
|
||||
nda_companies: [] # e.g. ["UpGuard"] → "enterprise security vendor (NDA)"
|
||||
|
||||
# Local file paths
|
||||
docs_dir: "~/Documents/JobSearch" # cover letter PDFs + corpus
|
||||
ollama_models_dir: "~/models/ollama" # maps to OLLAMA_MODELS in container
|
||||
vllm_models_dir: "~/models/vllm" # mounted into vllm container
|
||||
|
||||
# Active hardware profile
|
||||
inference_profile: "remote" # remote | cpu | single-gpu | dual-gpu
|
||||
|
||||
# Service connection config
|
||||
services:
|
||||
streamlit_port: 8501
|
||||
|
||||
ollama_host: localhost
|
||||
ollama_port: 11434
|
||||
ollama_ssl: false
|
||||
ollama_ssl_verify: true # set false for self-signed certs
|
||||
|
||||
vllm_host: localhost
|
||||
vllm_port: 8000
|
||||
vllm_ssl: false
|
||||
vllm_ssl_verify: true
|
||||
|
||||
searxng_host: localhost
|
||||
searxng_port: 8888
|
||||
searxng_ssl: false
|
||||
searxng_ssl_verify: true
|
||||
```
|
||||
|
||||
All service base URLs in `config/llm.yaml` are **derived values** — auto-generated from the
|
||||
`services` block whenever the user saves their profile. Users never hand-edit URLs.
|
||||
|
||||
Health checks in the Services tab switch from raw TCP socket checks to
|
||||
`requests.get(url, verify=ssl_verify)` so they work against HTTPS endpoints and self-signed certs.
|
||||
|
||||
---
|
||||
|
||||
## First-Run Wizard
|
||||
|
||||
A dedicated Streamlit page shown instead of normal navigation when `config/user.yaml` is absent.
|
||||
Five steps with a progress bar; all steps write to a staging dict, committed to disk on the
|
||||
final step only.
|
||||
|
||||
### Step 1 — Hardware Detection
|
||||
- Auto-detect CUDA GPUs via `nvidia-smi` or `torch.cuda.device_count()`
|
||||
- Check NVIDIA Container Toolkit availability (`docker info | grep nvidia`)
|
||||
- Suggest a profile based on findings; user can override
|
||||
- Warn if suggested profile requires toolkit not installed, with link to docs
|
||||
|
||||
### Step 2 — Identity
|
||||
- Name, email, phone, LinkedIn URL
|
||||
- Career summary (multi-line text area): used as the LLM cover letter persona
|
||||
- Example placeholder text drawn from the resume profile YAML if AIHawk is present
|
||||
|
||||
### Step 3 — Sensitive Employers
|
||||
- Optional; skip button prominent
|
||||
- Chip-based add/remove (same UI as Skills tab)
|
||||
- Explanation: "Employers listed here will appear as 'previous employer (NDA)' in research briefs"
|
||||
|
||||
### Step 4 — Inference & API Keys
|
||||
- Shows only fields relevant to the selected profile
|
||||
- `remote`: Anthropic API key, optional OpenAI-compat endpoint URL + key
|
||||
- `cpu` / `single-gpu` / `dual-gpu`: Ollama model name for cover letters, vLLM model path
|
||||
- Port/host/SSL fields for each active service (collapsed under "Advanced" by default)
|
||||
|
||||
### Step 5 — Notion (Optional)
|
||||
- Integration token + database ID
|
||||
- Test connection button
|
||||
- Skip button prominent; can be configured later in Settings
|
||||
|
||||
**On completion:** writes `config/user.yaml`, `config/notion.yaml` (if provided),
|
||||
auto-generates `config/llm.yaml` base URLs from service config, redirects to Home.
|
||||
|
||||
---
|
||||
|
||||
## Settings Changes
|
||||
|
||||
### New: My Profile tab
|
||||
Editable form for all `user.yaml` fields post-setup. Saving regenerates `config/llm.yaml`
|
||||
base URLs automatically. Replaces scattered "Meghan's" references in existing tab captions.
|
||||
|
||||
### Updated: Services tab
|
||||
- Reads port/host from `profile.services.*` instead of hard-coded values
|
||||
- Start/stop commands switch to `docker compose --profile <profile> up/stop <service>`
|
||||
- Health checks use `requests.get` with SSL support
|
||||
- Claude Code Wrapper and Copilot Wrapper entries removed
|
||||
- vLLM model dir reads from `profile.vllm_models_dir`
|
||||
- SearXNG Docker cwd replaced with compose command (no host path needed)
|
||||
|
||||
### New: Fine-Tune Wizard tab (optional, GPU only)
|
||||
Shown only when `inference_profile` is `single-gpu` or `dual-gpu`.
|
||||
|
||||
1. **Upload corpus** — drag-and-drop cover letters (PDF, DOCX, TXT)
|
||||
2. **Preview pairs** — shows extracted (job description snippet → cover letter) training pairs;
|
||||
user can remove bad examples
|
||||
3. **Configure & train** — base model selector (defaults to currently loaded Ollama model),
|
||||
epochs slider, runs `finetune_local.py` as a background task
|
||||
4. **Register** — on completion, `ollama create <username>-cover-writer -f Modelfile`,
|
||||
updates `config/llm.yaml` to use the new model
|
||||
|
||||
Skipped entirely in `remote` and `cpu` profiles with a clear explanation.
|
||||
|
||||
---
|
||||
|
||||
## Code Changes — Hard-Coded Reference Extraction
|
||||
|
||||
A `UserProfile` class (thin wrapper around `config/user.yaml`) is imported wherever
|
||||
personal data is currently hard-coded.
|
||||
|
||||
| Location | Current | Generalized |
|
||||
|---|---|---|
|
||||
| `company_research.py` prompts | `"Meghan McCann"` | `profile.name` |
|
||||
| `company_research.py` | `_NDA_COMPANIES = {"upguard"}` | `profile.nda_companies` |
|
||||
| `company_research.py` | `_SCRAPER_DIR = Path("/Library/...")` | bundled in container |
|
||||
| `generate_cover_letter.py` | `SYSTEM_CONTEXT` with Meghan's bio | `profile.career_summary` |
|
||||
| `generate_cover_letter.py` | `LETTERS_DIR = Path("/Library/...")` | `profile.docs_dir` |
|
||||
| `generate_cover_letter.py` | `_MISSION_SIGNALS` / `_MISSION_NOTES` (hardcoded) | `profile.mission_industries` list; First-Run Wizard step |
|
||||
| `4_Apply.py` | contact block with name/email/phone | `profile.*` |
|
||||
| `4_Apply.py` | `DOCS_DIR = Path("/Library/...")` | `profile.docs_dir` |
|
||||
| `5_Interviews.py` email assistant | `"Meghan McCann is a Customer Success..."` | `profile.name + profile.career_summary` |
|
||||
| `6_Interview_Prep.py` | `"Meghan"` in interviewer prompts | `profile.name` |
|
||||
| `7_Survey.py` `_SURVEY_SYSTEM` | "The candidate values collaborative teamwork, clear communication, growth, and impact." | `profile.career_summary` or user-editable survey persona field |
|
||||
| `scripts/vision_service/main.py` | `model_id = "vikhyatk/moondream2"`, `revision = "2025-01-09"` | configurable in `config/llm.yaml` vision_service block |
|
||||
| `match.py` | `RESUME_PATH = Path("/Library/...Meghan_McCann_Resume...")` | configurable in Settings |
|
||||
| `Home.py` | `"Meghan's Job Search"` | `f"{profile.name}'s Job Search"` |
|
||||
| `finetune_local.py` | all `/Library/` paths + `"meghan-cover-writer"` | `profile.*` |
|
||||
| `2_Settings.py` | `PFP_DIR`, hard-coded service paths | removed / compose-driven |
|
||||
| `config/llm.yaml` | hard-coded `base_url` values | auto-generated from `user.yaml` |
|
||||
| `config/search_profiles.yaml` | `mission_tags` on profiles (implicit) | `profile.mission_industries` drives profile generation in wizard |
|
||||
| `config/adzuna.yaml` | per-user API credentials | First-Run Wizard step → `config/adzuna.yaml` (gitignored) |
|
||||
|
||||
### New fields needed in `config/user.yaml` (generalization)
|
||||
|
||||
```yaml
|
||||
# Mission-aligned industries — drives cover letter Para 3 and research accessibility section
|
||||
# Options: music, animal_welfare, education (extensible)
|
||||
mission_industries: []
|
||||
|
||||
# Accessibility priority — adds Inclusion & Accessibility section to every research brief.
|
||||
# This is for the candidate's personal decision-making; never disclosed in applications.
|
||||
accessibility_priority: true
|
||||
|
||||
# Custom board API credentials
|
||||
custom_boards:
|
||||
adzuna:
|
||||
app_id: ""
|
||||
app_key: ""
|
||||
# theladders: no credentials needed (curl_cffi scraper)
|
||||
```
|
||||
|
||||
The First-Run Wizard gains a **Step 2b — Personal Preferences** screen (between Identity and Sensitive Employers):
|
||||
- Checkboxes for preferred industries (Music, Animal Welfare, Education, Other...)
|
||||
- "Other" opens a free-text field to add custom industry signals
|
||||
- Accessibility priority toggle (on by default, explains what it does: "Adds an accessibility assessment to every company research brief so you can evaluate companies on your own terms. This information stays private — it's never sent to employers.")
|
||||
- Custom board credentials (Adzuna app ID/key) with a "Test" button
|
||||
|
||||
---
|
||||
|
||||
## Docker Compose Structure
|
||||
|
||||
```
|
||||
compose.yml # all services + profiles
|
||||
.env # generated by wizard (resolved paths, ports)
|
||||
Dockerfile # app image (Streamlit + companyScraper bundled)
|
||||
docker/
|
||||
searxng/
|
||||
settings.yml # pre-configured for JSON format output
|
||||
ollama/
|
||||
entrypoint.sh # pulls default model on first start if none present
|
||||
```
|
||||
|
||||
GPU passthrough uses `deploy.resources.reservations.devices` (NVIDIA Container Toolkit).
|
||||
Wizard warns and links to install docs if toolkit is missing when a GPU profile is selected.
|
||||
|
||||
The `.env` file is generated (never hand-edited) and gitignored. It contains resolved
|
||||
absolute paths for volume mounts (tilde-expanded from `user.yaml`) and port numbers.
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope (this version)
|
||||
|
||||
- conda + local install path (future track)
|
||||
- Multi-user / auth (single-user app)
|
||||
- PostgreSQL migration (SQLite sufficient)
|
||||
- Windows support
|
||||
- AIHawk LinkedIn Easy Apply generalization (too tightly coupled to personal config)
|
||||
|
||||
---
|
||||
|
||||
## Backlog — Custom Job Source Scrapers
|
||||
|
||||
Not supported by JobSpy; would need custom scrapers plugged into `scripts/discover.py`:
|
||||
|
||||
| Priority | Site | Notes |
|
||||
|----------|------|-------|
|
||||
| 1 | [Adzuna](https://www.adzuna.com) | Free public API (api.adzuna.com) — cleanest integration path |
|
||||
| 2 | [The Ladders](https://www.theladders.com) | Focuses on $100K+ roles — good signal-to-noise for senior CS/ops positions |
|
||||
| 3 | Craigslist | HTML scrape, highly inconsistent by region; likely needs its own dedicated ingestion queue separate from the main discovery run |
|
||||
| — | Monster.com | Low priority — requires session/auth, likely needs Playwright; skip until others are done |
|
||||
|
||||
**Integration pattern:** Each custom source should return the same `pd.DataFrame` schema as JobSpy (`title`, `company`, `job_url`, `location`, `is_remote`, `description`, `site`) so `run_discovery` can consume it without changes. Cleanest as a separate `scripts/custom_boards/` module.
|
||||
|
||||
**LLM-guided profile setup wizard** (for generic build): First-run wizard that walks a new user through their work history and desired search terms, auto-generating `plain_text_resume.yaml` and `search_profiles.yaml`. See First-Run Wizard section above for hardware/identity/inference steps; this extends Step 2 with a career interview flow.
|
||||
|
||||
---
|
||||
|
||||
## Migration from Personal Version
|
||||
|
||||
No automated migration. The personal version stays on its own repo. If the user wants to
|
||||
carry over their `staging.db`, `config/*.yaml`, or cover letter corpus, they copy manually.
|
||||
The wizard's field defaults can be pre-populated from the personal version's config files
|
||||
if detected at a well-known path — but this is a nice-to-have, not required.
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
# Design: Job Ingestion Improvements
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**Status:** Approved
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Three improvements to how jobs enter the pipeline:
|
||||
|
||||
1. **Auto-parse LinkedIn Job Alert emails** — digest emails from `jobalerts-noreply@linkedin.com`
|
||||
contain multiple structured job cards in plain text. Currently ingested as a single confusing
|
||||
email lead. Instead, parse each card into a separate pending job and scrape it via a background
|
||||
task.
|
||||
|
||||
2. **`scrape_url` background task** — new task type that takes a job record's URL, fetches
|
||||
the full listing (title, company, description, salary, location), and updates the job row.
|
||||
Shared by both the LinkedIn alert parser and the manual URL import feature.
|
||||
|
||||
3. **Add Job(s) by URL on Home page** — paste one URL per line, or upload a CSV with a URL
|
||||
column. Each URL is inserted as a pending job and queued for background scraping.
|
||||
|
||||
---
|
||||
|
||||
## `scrape_url` Worker (`scripts/scrape_url.py`)
|
||||
|
||||
Single public function: `scrape_job_url(db_path, job_id) -> dict`
|
||||
|
||||
Board detection from URL hostname:
|
||||
|
||||
| URL pattern | Board | Scrape method |
|
||||
|---|---|---|
|
||||
| `linkedin.com/jobs/view/<id>/` | LinkedIn | LinkedIn guest jobs API (`/jobs-guest/jobs/api/jobPosting/<id>`) |
|
||||
| `indeed.com/viewjob?jk=<key>` | Indeed | requests + BeautifulSoup HTML parse |
|
||||
| `glassdoor.com/...` | Glassdoor | JobSpy internal scraper (same as `enrich_descriptions.py`) |
|
||||
| anything else | generic | requests + JSON-LD → og:tags fallback |
|
||||
|
||||
On success: `UPDATE jobs SET title, company, description, salary, location, is_remote WHERE id=?`
|
||||
On failure: job remains pending with its URL intact — user can still approve/reject it.
|
||||
|
||||
Requires a new `update_job_fields(db_path, job_id, fields: dict)` helper in `db.py`.
|
||||
|
||||
---
|
||||
|
||||
## LinkedIn Alert Parser (`imap_sync.py`)
|
||||
|
||||
New function `parse_linkedin_alert(body: str) -> list[dict]`
|
||||
|
||||
The plain-text body has a reliable block structure:
|
||||
```
|
||||
<Title>
|
||||
<Company>
|
||||
<Location>
|
||||
[optional social proof lines like "2 school alumni"]
|
||||
View job: https://www.linkedin.com/comm/jobs/view/<ID>/?<tracking>
|
||||
|
||||
---------------------------------------------------------
|
||||
|
||||
<next job block...>
|
||||
```
|
||||
|
||||
Parser:
|
||||
1. Split on lines of 10+ dashes
|
||||
2. For each block: filter out social-proof lines (alumni, "Apply with", "actively hiring", etc.)
|
||||
3. Extract: title (line 1), company (line 2), location (line 3), URL (line starting "View job:")
|
||||
4. Canonicalize URL: strip tracking params → `https://www.linkedin.com/jobs/view/<id>/`
|
||||
|
||||
Detection in `_scan_unmatched_leads`: if `from_addr` contains
|
||||
`jobalerts-noreply@linkedin.com`, skip the LLM path and call `parse_linkedin_alert` instead.
|
||||
Each parsed card → `insert_job()` + `submit_task(db, "scrape_url", job_id)`.
|
||||
The email itself is not stored as an email lead — it's a batch import trigger.
|
||||
|
||||
---
|
||||
|
||||
## Home Page URL Import
|
||||
|
||||
New section on `app/Home.py` between Email Sync and Danger Zone.
|
||||
|
||||
Two tabs:
|
||||
- **Paste URLs** — `st.text_area`, one URL per line
|
||||
- **Upload CSV** — `st.file_uploader`, auto-detects first column value starting with `http`
|
||||
|
||||
Both routes call a shared `_queue_url_imports(db_path, urls)` helper that:
|
||||
1. Filters URLs already in the DB (dedup by URL)
|
||||
2. Calls `insert_job({"title": "Importing…", "source": "manual", "url": url, ...})`
|
||||
3. Calls `submit_task(db, "scrape_url", job_id)` per new job
|
||||
4. Shows `st.success(f"Queued N job(s)")`
|
||||
|
||||
A `@st.fragment(run_every=3)` status block below the form polls active `scrape_url` tasks
|
||||
and shows per-job status (⏳ / ✅ / ❌ title - company).
|
||||
|
||||
---
|
||||
|
||||
## Search Settings (already applied)
|
||||
|
||||
`config/search_profiles.yaml`:
|
||||
- `hours_old: 120 → 240` (cover LinkedIn's algo-sorted alerts)
|
||||
- `results_per_board: 50 → 75`
|
||||
- Added title: `Customer Engagement Manager`
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- Scraping all 551 historical LinkedIn alert emails (run email sync going forward)
|
||||
- Deduplication against Notion (URL dedup in SQLite is sufficient)
|
||||
- Authentication-required boards (Indeed Easy Apply, etc.)
|
||||
|
|
@ -1,936 +0,0 @@
|
|||
# Job Ingestion Improvements — Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Auto-parse LinkedIn Job Alert digest emails into multiple pending jobs, add a `scrape_url` background task that fills in job details from a URL, and add a Home page widget for manual URL/CSV import.
|
||||
|
||||
**Architecture:** New `scripts/scrape_url.py` worker + `update_job_fields` DB helper → `scrape_url` task type in `task_runner.py` → consumed by both the LinkedIn alert parser in `imap_sync.py` and the new Home page URL import section.
|
||||
|
||||
**Tech Stack:** Python 3.12, Streamlit, SQLite, requests, BeautifulSoup4, JobSpy (internal scrapers), imap_sync existing patterns
|
||||
|
||||
**Reference:** Design doc at `docs/plans/2026-02-24-job-ingestion-design.md`
|
||||
|
||||
---
|
||||
|
||||
## Task 1: DB helper — `update_job_fields`
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/db.py`
|
||||
- Test: `tests/test_db.py`
|
||||
|
||||
**Step 1: Write the failing test**
|
||||
|
||||
Add to `tests/test_db.py`:
|
||||
|
||||
```python
|
||||
def test_update_job_fields(tmp_path):
    """update_job_fields overwrites the given columns on an existing job row."""
    from scripts.db import init_db, insert_job, update_job_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "Importing…", "company": "", "url": "https://example.com/job/1",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    })
    update_job_fields(db, job_id, {
        "title": "Customer Success Manager",
        "company": "Acme Corp",
        "location": "San Francisco, CA",
        "description": "Great role.",
        "salary": "$120k",
        "is_remote": 1,
    })
    import sqlite3
    conn = sqlite3.connect(db)
    # Default rows are plain tuples — dict(row) would raise TypeError.
    # sqlite3.Row yields mapping-like rows that dict() can consume.
    conn.row_factory = sqlite3.Row
    row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert row["title"] == "Customer Success Manager"
    assert row["company"] == "Acme Corp"
    assert row["description"] == "Great role."
    assert row["is_remote"] == 1


def test_update_job_fields_ignores_unknown_columns(tmp_path):
    """Unknown keys in the fields dict are silently dropped, never raised on."""
    from scripts.db import init_db, insert_job, update_job_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "Importing…", "company": "", "url": "https://example.com/job/2",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    })
    # Should not raise even with an unknown column
    update_job_fields(db, job_id, {"title": "Real Title", "nonexistent_col": "ignored"})
    import sqlite3
    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert row["title"] == "Real Title"
|
||||
```
|
||||
|
||||
**Step 2: Run test to verify it fails**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_update_job_fields tests/test_db.py::test_update_job_fields_ignores_unknown_columns -v
|
||||
```
|
||||
Expected: FAIL — `ImportError: cannot import name 'update_job_fields'`
|
||||
|
||||
**Step 3: Implement `update_job_fields` in `scripts/db.py`**
|
||||
|
||||
Add after `update_cover_letter`:
|
||||
|
||||
```python
|
||||
# Whitelist of job columns callers may update — guards the f-string SQL
# in update_job_fields against injection via column names.
_UPDATABLE_JOB_COLS = {
    "title", "company", "url", "source", "location", "is_remote",
    "salary", "description", "match_score", "keyword_gaps",
}


def update_job_fields(db_path: Path = DEFAULT_DB, job_id: int | None = None,
                      fields: dict | None = None) -> None:
    """Update arbitrary job columns. Unknown keys are silently ignored.

    Args:
        db_path: SQLite database file to open.
        job_id: primary key of the row to update; no-op when falsy.
        fields: column -> value mapping; keys outside
            ``_UPDATABLE_JOB_COLS`` are dropped before building the SQL.
    """
    if not job_id or not fields:
        return
    safe = {k: v for k, v in fields.items() if k in _UPDATABLE_JOB_COLS}
    if not safe:
        return
    conn = sqlite3.connect(db_path)
    try:
        # Column names come from the whitelist above, so interpolating
        # them is safe; the values themselves stay parameterized.
        sets = ", ".join(f"{col} = ?" for col in safe)
        conn.execute(
            f"UPDATE jobs SET {sets} WHERE id = ?",
            (*safe.values(), job_id),
        )
        conn.commit()
    finally:
        # Close even when execute/commit raises — the original leaked
        # the connection on error.
        conn.close()
|
||||
```
|
||||
|
||||
**Step 4: Run tests to verify they pass**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_db.py::test_update_job_fields tests/test_db.py::test_update_job_fields_ignores_unknown_columns -v
|
||||
```
|
||||
Expected: PASS
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/db.py tests/test_db.py
|
||||
git commit -m "feat: add update_job_fields helper to db.py"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: `scripts/scrape_url.py` + `task_runner.py` integration
|
||||
|
||||
**Files:**
|
||||
- Create: `scripts/scrape_url.py`
|
||||
- Modify: `scripts/task_runner.py`
|
||||
- Test: `tests/test_scrape_url.py`
|
||||
|
||||
**Step 1: Write the failing tests**
|
||||
|
||||
Create `tests/test_scrape_url.py`:
|
||||
|
||||
```python
|
||||
"""Tests for URL-based job scraping."""
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
||||
def _make_db(tmp_path, url="https://www.linkedin.com/jobs/view/99999/"):
|
||||
from scripts.db import init_db, insert_job
|
||||
db = tmp_path / "test.db"
|
||||
init_db(db)
|
||||
job_id = insert_job(db, {
|
||||
"title": "Importing…", "company": "", "url": url,
|
||||
"source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
|
||||
})
|
||||
return db, job_id
|
||||
|
||||
|
||||
def test_canonicalize_url_linkedin():
|
||||
from scripts.scrape_url import canonicalize_url
|
||||
messy = (
|
||||
"https://www.linkedin.com/jobs/view/4376518925/"
|
||||
"?trk=eml-email_job_alert&refId=abc%3D%3D&trackingId=xyz"
|
||||
)
|
||||
assert canonicalize_url(messy) == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||
|
||||
|
||||
def test_canonicalize_url_linkedin_comm():
|
||||
from scripts.scrape_url import canonicalize_url
|
||||
comm = "https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc"
|
||||
assert canonicalize_url(comm) == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||
|
||||
|
||||
def test_canonicalize_url_generic_strips_utm():
|
||||
from scripts.scrape_url import canonicalize_url
|
||||
url = "https://jobs.example.com/post/42?utm_source=linkedin&utm_medium=email&jk=real_param"
|
||||
result = canonicalize_url(url)
|
||||
assert "utm_source" not in result
|
||||
assert "real_param" in result
|
||||
|
||||
|
||||
def test_detect_board_linkedin():
|
||||
from scripts.scrape_url import _detect_board
|
||||
assert _detect_board("https://www.linkedin.com/jobs/view/12345/") == "linkedin"
|
||||
assert _detect_board("https://linkedin.com/jobs/view/12345/?tracking=abc") == "linkedin"
|
||||
|
||||
|
||||
def test_detect_board_indeed():
|
||||
from scripts.scrape_url import _detect_board
|
||||
assert _detect_board("https://www.indeed.com/viewjob?jk=abc123") == "indeed"
|
||||
|
||||
|
||||
def test_detect_board_glassdoor():
|
||||
from scripts.scrape_url import _detect_board
|
||||
assert _detect_board("https://www.glassdoor.com/job-listing/foo-bar-123.htm") == "glassdoor"
|
||||
|
||||
|
||||
def test_detect_board_generic():
|
||||
from scripts.scrape_url import _detect_board
|
||||
assert _detect_board("https://jobs.example.com/posting/42") == "generic"
|
||||
|
||||
|
||||
def test_extract_linkedin_job_id():
|
||||
from scripts.scrape_url import _extract_linkedin_job_id
|
||||
assert _extract_linkedin_job_id("https://www.linkedin.com/jobs/view/4376518925/") == "4376518925"
|
||||
assert _extract_linkedin_job_id("https://www.linkedin.com/comm/jobs/view/4376518925/?tracking=x") == "4376518925"
|
||||
assert _extract_linkedin_job_id("https://example.com/no-id") is None
|
||||
|
||||
|
||||
def test_scrape_linkedin_updates_job(tmp_path):
|
||||
db, job_id = _make_db(tmp_path)
|
||||
|
||||
linkedin_html = """<html><head></head><body>
|
||||
<h2 class="top-card-layout__title">Customer Success Manager</h2>
|
||||
<a class="topcard__org-name-link">Acme Corp</a>
|
||||
<span class="topcard__flavor--bullet">San Francisco, CA</span>
|
||||
<div class="show-more-less-html__markup">Exciting CSM role with great benefits.</div>
|
||||
</body></html>"""
|
||||
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.text = linkedin_html
|
||||
mock_resp.raise_for_status = MagicMock()
|
||||
|
||||
with patch("scripts.scrape_url.requests.get", return_value=mock_resp):
|
||||
from scripts.scrape_url import scrape_job_url
|
||||
result = scrape_job_url(db, job_id)
|
||||
|
||||
assert result.get("title") == "Customer Success Manager"
|
||||
assert result.get("company") == "Acme Corp"
|
||||
assert "CSM role" in result.get("description", "")
|
||||
|
||||
import sqlite3
|
||||
conn = sqlite3.connect(db)
|
||||
row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
|
||||
conn.close()
|
||||
assert row["title"] == "Customer Success Manager"
|
||||
assert row["company"] == "Acme Corp"
|
||||
|
||||
|
||||
def test_scrape_url_generic_json_ld(tmp_path):
|
||||
db, job_id = _make_db(tmp_path, url="https://jobs.example.com/post/42")
|
||||
|
||||
json_ld_html = """<html><head>
|
||||
<script type="application/ld+json">
|
||||
{"@type": "JobPosting", "title": "TAM Role", "description": "Tech account mgmt.",
|
||||
"hiringOrganization": {"name": "TechCo"},
|
||||
"jobLocation": {"address": {"addressLocality": "Austin, TX"}}}
|
||||
</script>
|
||||
</head><body></body></html>"""
|
||||
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.text = json_ld_html
|
||||
mock_resp.raise_for_status = MagicMock()
|
||||
|
||||
with patch("scripts.scrape_url.requests.get", return_value=mock_resp):
|
||||
from scripts.scrape_url import scrape_job_url
|
||||
result = scrape_job_url(db, job_id)
|
||||
|
||||
assert result.get("title") == "TAM Role"
|
||||
assert result.get("company") == "TechCo"
|
||||
|
||||
|
||||
def test_scrape_url_graceful_on_http_error(tmp_path):
|
||||
db, job_id = _make_db(tmp_path)
|
||||
import requests as req
|
||||
|
||||
with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")):
|
||||
from scripts.scrape_url import scrape_job_url
|
||||
result = scrape_job_url(db, job_id)
|
||||
|
||||
# Should return empty dict and not raise; job row still exists
|
||||
assert isinstance(result, dict)
|
||||
import sqlite3
|
||||
conn = sqlite3.connect(db)
|
||||
row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone()
|
||||
conn.close()
|
||||
assert row is not None
|
||||
```
|
||||
|
||||
**Step 2: Run tests to verify they fail**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_scrape_url.py -v
|
||||
```
|
||||
Expected: FAIL — `ModuleNotFoundError: No module named 'scripts.scrape_url'`
|
||||
|
||||
**Step 3: Implement `scripts/scrape_url.py`**
|
||||
|
||||
```python
|
||||
# scripts/scrape_url.py
|
||||
"""
|
||||
Scrape a job listing from its URL and update the job record.
|
||||
|
||||
Supports:
|
||||
- LinkedIn (guest jobs API — no auth required)
|
||||
- Indeed (HTML parse)
|
||||
- Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py)
|
||||
- Generic (JSON-LD → og:tags fallback)
|
||||
|
||||
Usage (background task — called by task_runner):
|
||||
from scripts.scrape_url import scrape_job_url
|
||||
scrape_job_url(db_path, job_id)
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.db import DEFAULT_DB, update_job_fields
|
||||
|
||||
_HEADERS = {
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||
)
|
||||
}
|
||||
_TIMEOUT = 12
|
||||
|
||||
|
||||
def _detect_board(url: str) -> str:
|
||||
"""Return 'linkedin', 'indeed', 'glassdoor', or 'generic'."""
|
||||
url_lower = url.lower()
|
||||
if "linkedin.com" in url_lower:
|
||||
return "linkedin"
|
||||
if "indeed.com" in url_lower:
|
||||
return "indeed"
|
||||
if "glassdoor.com" in url_lower:
|
||||
return "glassdoor"
|
||||
return "generic"
|
||||
|
||||
|
||||
def _extract_linkedin_job_id(url: str) -> Optional[str]:
|
||||
"""Extract numeric job ID from a LinkedIn job URL."""
|
||||
m = re.search(r"/jobs/view/(\d+)", url)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def canonicalize_url(url: str) -> str:
    """
    Return a clean canonical form of a job URL with tracking parameters removed.

    LinkedIn: https://www.linkedin.com/jobs/view/<id>/?trk=... collapses to
    https://www.linkedin.com/jobs/view/<id>/. Every other URL keeps its path
    and real query parameters but drops well-known trackers
    (utm_*, trk, refId, trackingId, and friends).
    """
    url = url.strip()

    # LinkedIn URLs have a canonical numeric-id form; rebuild it directly.
    if "linkedin.com" in url.lower():
        id_match = re.search(r"/jobs/view/(\d+)", url)
        if id_match:
            return f"https://www.linkedin.com/jobs/view/{id_match.group(1)}/"

    # Everything else (and id-less LinkedIn URLs): strip tracking params only.
    from urllib.parse import parse_qsl, urlencode, urlparse
    tracking = {
        "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
        "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig",
        "eid", "otpToken", "ssid", "fmid",
    }
    parts = urlparse(url)
    kept = [(key, val) for key, val in parse_qsl(parts.query) if key not in tracking]
    return parts._replace(query=urlencode(kept)).geturl()
|
||||
|
||||
|
||||
def _scrape_linkedin(url: str) -> dict:
    """Fetch a LinkedIn job via the guest jobs API (no auth required).

    Extracts the numeric job id from *url*, requests the public
    /jobs-guest/jobs/api/jobPosting/<id> endpoint, and parses the returned
    HTML fragment for title, company, location, and description.

    Returns only the fields that were found (empty dict if the URL carries
    no job id). Raises requests.RequestException on HTTP failure — the
    caller (scrape_job_url) catches it.
    """
    job_id = _extract_linkedin_job_id(url)
    if not job_id:
        return {}
    api_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    resp = requests.get(api_url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Helper: text of the first matching tag, or "" when the tag is absent.
    def _text(selector, **kwargs):
        tag = soup.find(selector, **kwargs)
        return tag.get_text(strip=True) if tag else ""

    title = _text("h2", class_="top-card-layout__title")
    # Company renders as either an <a> or a <span> depending on the page.
    company = _text("a", class_="topcard__org-name-link") or _text("span", class_="topcard__org-name-link")
    location = _text("span", class_="topcard__flavor--bullet")
    desc_div = soup.find("div", class_="show-more-less-html__markup")
    description = desc_div.get_text(separator="\n", strip=True) if desc_div else ""

    # Drop empty values so the caller never overwrites good data with blanks.
    return {k: v for k, v in {
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "source": "linkedin",
    }.items() if v}
|
||||
|
||||
|
||||
def _scrape_indeed(url: str) -> dict:
    """Scrape an Indeed job page.

    Fetches the page HTML directly and delegates field extraction to
    _parse_json_ld_or_og (JSON-LD first, og: meta tags as fallback).
    Raises requests.RequestException on HTTP failure (handled by caller).
    """
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    return _parse_json_ld_or_og(resp.text) or {}
|
||||
|
||||
|
||||
def _scrape_glassdoor(url: str) -> dict:
    """Fetch a Glassdoor job description by re-using JobSpy's internal scraper.

    Requires a "jl=<id>" listing id in *url*; returns {} when it is absent.
    NOTE(review): this leans on JobSpy *private* APIs (_get_csrf_token,
    _fetch_job_description), so a JobSpy upgrade may break it — the broad
    except below deliberately degrades to {} instead of failing the task.
    """
    m = re.search(r"jl=(\d+)", url)
    if not m:
        return {}
    try:
        from jobspy.glassdoor import Glassdoor
        from jobspy.glassdoor.constant import fallback_token, headers
        from jobspy.model import ScraperInput, Site
        from jobspy.util import create_session

        scraper = Glassdoor()
        scraper.base_url = "https://www.glassdoor.com/"
        scraper.session = create_session(has_retry=True)
        # Glassdoor's API wants a CSRF token; fall back to JobSpy's bundled
        # token when fetching a fresh one fails.
        token = scraper._get_csrf_token()
        headers["gd-csrf-token"] = token if token else fallback_token
        scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
        description = scraper._fetch_job_description(int(m.group(1)))
        return {"description": description} if description else {}
    except Exception:
        # Best-effort by design: a missing jobspy install, auth changes, or
        # network errors all collapse to "no fields scraped".
        return {}
|
||||
|
||||
|
||||
def _parse_json_ld_or_og(html: str) -> dict:
    """Extract job fields from JSON-LD structured data, then og: meta tags.

    Tries each <script type="application/ld+json"> block for a JobPosting
    object (including the "top-level array of entities" form some boards
    emit). Falls back to og:title/og:description meta tags, then <title>.
    Returns only non-empty fields; an empty dict means nothing was found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Try JSON-LD first — it's the richest, most reliable source.
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            if isinstance(data, list):
                # Guard each entry with isinstance: arrays may mix strings
                # and objects, and str.get would raise inside the genexp.
                data = next(
                    (d for d in data
                     if isinstance(d, dict) and d.get("@type") == "JobPosting"),
                    {},
                )
            if not isinstance(data, dict) or data.get("@type") != "JobPosting":
                continue
            org = data.get("hiringOrganization") or {}
            loc = (data.get("jobLocation") or {})
            if isinstance(loc, list):
                loc = loc[0] if loc else {}
            addr = loc.get("address") or {}
            location = (
                addr.get("addressLocality", "") or
                addr.get("addressRegion", "") or
                addr.get("addressCountry", "")
            )
            return {k: v for k, v in {
                "title": data.get("title", ""),
                "company": org.get("name", ""),
                "location": location,
                "description": data.get("description", ""),
                "salary": str(data.get("baseSalary", "")) if data.get("baseSalary") else "",
            }.items() if v}
        except Exception:
            # Malformed JSON or unexpected shape — try the next script tag.
            continue

    # Fall back to og: meta tags.
    def _meta(prop):
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        return tag.get("content", "") if tag else ""

    # BUG FIX: the original evaluated (soup.find("title") or {}).get_text(...),
    # which raises AttributeError on pages with no <title> tag (dict has no
    # get_text). Guard the lookup explicitly.
    title_tag = soup.find("title")
    title = _meta("og:title") or (title_tag.get_text(strip=True) if title_tag else "")
    description = _meta("og:description")
    return {k: v for k, v in {"title": title, "description": description}.items() if v}
|
||||
|
||||
|
||||
def _scrape_generic(url: str) -> dict:
    """Scrape an arbitrary job page via JSON-LD, then og: meta tag fallback.

    Raises requests.RequestException on HTTP failure (handled by caller).
    """
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    return _parse_json_ld_or_og(resp.text) or {}
|
||||
|
||||
|
||||
def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: int = None) -> dict:
    """
    Fetch the listing at a job's stored URL and persist any scraped fields.

    Looks up the job's URL, dispatches to the board-specific scraper, and
    writes the result back via update_job_fields. Returns the dict of
    scraped fields (empty on any failure). Never raises — HTTP and parse
    errors are printed and swallowed, leaving the job row untouched.
    """
    if not job_id:
        return {}

    # Read just the URL for this job; keep the connection short-lived.
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone()
    conn.close()
    if not row:
        return {}

    url = row["url"] or ""
    if not url.startswith("http"):
        return {}

    # Board-specific scrapers; anything unrecognized uses the generic one.
    scrapers = {
        "linkedin": _scrape_linkedin,
        "indeed": _scrape_indeed,
        "glassdoor": _scrape_glassdoor,
    }
    scraper = scrapers.get(_detect_board(url), _scrape_generic)
    try:
        fields = scraper(url)
    except requests.RequestException as exc:
        print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}")
        return {}
    except Exception as exc:
        print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}")
        return {}

    if fields:
        # Never overwrite the URL or source with empty values
        fields.pop("url", None)
        update_job_fields(db_path, job_id, fields)
        print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}")

    return fields
|
||||
```
|
||||
|
||||
**Step 4: Add `scrape_url` task type to `scripts/task_runner.py`**
|
||||
|
||||
In `_run_task`, add a new `elif` branch after `enrich_descriptions` and before the final `else`:
|
||||
|
||||
```python
|
||||
elif task_type == "scrape_url":
|
||||
from scripts.scrape_url import scrape_job_url
|
||||
fields = scrape_job_url(db_path, job_id)
|
||||
title = fields.get("title") or job.get("url", "?")
|
||||
company = fields.get("company", "")
|
||||
msg = f"{title}" + (f" @ {company}" if company else "")
|
||||
update_task_status(db_path, task_id, "completed", error=msg)
|
||||
return
|
||||
```
|
||||
|
||||
**Step 5: Run all tests**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_scrape_url.py -v
|
||||
```
|
||||
Expected: all PASS
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/scrape_url.py scripts/task_runner.py tests/test_scrape_url.py
|
||||
git commit -m "feat: add scrape_url background task for URL-based job import"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: LinkedIn Job Alert email parser
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/imap_sync.py`
|
||||
- Test: `tests/test_imap_sync.py`
|
||||
|
||||
**Step 1: Write the failing tests**
|
||||
|
||||
Add to `tests/test_imap_sync.py`:
|
||||
|
||||
```python
|
||||
def test_parse_linkedin_alert_extracts_jobs():
|
||||
from scripts.imap_sync import parse_linkedin_alert
|
||||
body = """\
|
||||
Your job alert for customer success manager in United States
|
||||
New jobs match your preferences.
|
||||
Manage alerts: https://www.linkedin.com/comm/jobs/alerts?...
|
||||
|
||||
Customer Success Manager
|
||||
Reflow
|
||||
California, United States
|
||||
View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz
|
||||
|
||||
---------------------------------------------------------
|
||||
|
||||
Customer Engagement Manager
|
||||
Bitwarden
|
||||
United States
|
||||
|
||||
2 school alumni
|
||||
Apply with resume & profile
|
||||
View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D
|
||||
|
||||
---------------------------------------------------------
|
||||
|
||||
"""
|
||||
jobs = parse_linkedin_alert(body)
|
||||
assert len(jobs) == 2
|
||||
assert jobs[0]["title"] == "Customer Success Manager"
|
||||
assert jobs[0]["company"] == "Reflow"
|
||||
assert jobs[0]["location"] == "California, United States"
|
||||
assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||
assert jobs[1]["title"] == "Customer Engagement Manager"
|
||||
assert jobs[1]["company"] == "Bitwarden"
|
||||
assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/"
|
||||
|
||||
|
||||
def test_parse_linkedin_alert_skips_blocks_without_view_job():
|
||||
from scripts.imap_sync import parse_linkedin_alert
|
||||
body = """\
|
||||
Customer Success Manager
|
||||
Some Company
|
||||
United States
|
||||
|
||||
---------------------------------------------------------
|
||||
|
||||
Valid Job Title
|
||||
Valid Company
|
||||
Remote
|
||||
View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y
|
||||
|
||||
---------------------------------------------------------
|
||||
"""
|
||||
jobs = parse_linkedin_alert(body)
|
||||
assert len(jobs) == 1
|
||||
assert jobs[0]["title"] == "Valid Job Title"
|
||||
|
||||
|
||||
def test_parse_linkedin_alert_empty_body():
|
||||
from scripts.imap_sync import parse_linkedin_alert
|
||||
assert parse_linkedin_alert("") == []
|
||||
assert parse_linkedin_alert("No jobs here.") == []
|
||||
```
|
||||
|
||||
**Step 2: Run tests to verify they fail**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py::test_parse_linkedin_alert_extracts_jobs tests/test_imap_sync.py::test_parse_linkedin_alert_skips_blocks_without_view_job tests/test_imap_sync.py::test_parse_linkedin_alert_empty_body -v
|
||||
```
|
||||
Expected: FAIL — `ImportError: cannot import name 'parse_linkedin_alert'`
|
||||
|
||||
**Step 3: Implement `parse_linkedin_alert` in `scripts/imap_sync.py`**
|
||||
|
||||
Add after the existing `_has_todo_keyword` function (around line 391):
|
||||
|
||||
```python
|
||||
_LINKEDIN_ALERT_SENDER = "jobalerts-noreply@linkedin.com"

# Social-proof / nav lines to skip when parsing alert blocks
_ALERT_SKIP_PHRASES = {
    "alumni", "apply with", "actively hiring", "manage alerts",
    "view all jobs", "your job alert", "new jobs match",
    "unsubscribe", "linkedin corporation",
}


def parse_linkedin_alert(body: str) -> list[dict]:
    """
    Parse the plain-text body of a LinkedIn Job Alert digest email.

    Returns a list of dicts: {title, company, location, url}.
    URL is canonicalized to https://www.linkedin.com/jobs/view/<id>/
    (tracking parameters stripped).
    """
    view_job_re = re.compile(r"View job:\s*(https?://\S+)", re.IGNORECASE)
    job_id_re = re.compile(r"/jobs/view/(\d+)")

    parsed = []
    # Cards are separated by horizontal rules of 10+ dashes.
    for card in re.split(r"\n\s*-{10,}\s*\n", body):
        lines = [ln.strip() for ln in card.strip().splitlines() if ln.strip()]

        # A card is only valid if it carries a "View job:" URL with a job id.
        canonical = None
        for line in lines:
            hit = view_job_re.search(line)
            if not hit:
                continue
            id_hit = job_id_re.search(hit.group(1))
            if id_hit:
                canonical = f"https://www.linkedin.com/jobs/view/{id_hit.group(1)}/"
            break
        if canonical is None:
            continue

        def _is_noise(ln: str) -> bool:
            low = ln.lower()
            return (
                any(phrase in low for phrase in _ALERT_SKIP_PHRASES)
                or low.startswith("view job:")
                or ln.startswith("http")
            )

        content = [ln for ln in lines if not _is_noise(ln)]
        if len(content) < 2:
            continue

        parsed.append({
            "title": content[0],
            "company": content[1],
            "location": content[2] if len(content) > 2 else "",
            "url": canonical,
        })
    return parsed
|
||||
```
|
||||
|
||||
**Step 4: Wire the parser into `_scan_unmatched_leads`**
|
||||
|
||||
In `_scan_unmatched_leads`, inside the `for uid in all_uids:` loop, add a detection block immediately after the `if mid in known_message_ids: continue` check (before the existing `_has_recruitment_keyword` check):
|
||||
|
||||
```python
|
||||
# ── LinkedIn Job Alert digest — parse each card individually ──────
|
||||
if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
|
||||
cards = parse_linkedin_alert(parsed["body"])
|
||||
for card in cards:
|
||||
if card["url"] in existing_urls:
|
||||
continue
|
||||
job_id = insert_job(db_path, {
|
||||
"title": card["title"],
|
||||
"company": card["company"],
|
||||
"url": card["url"],
|
||||
"source": "linkedin",
|
||||
"location": card["location"],
|
||||
"is_remote": 0,
|
||||
"salary": "",
|
||||
"description": "",
|
||||
"date_found": datetime.now().isoformat()[:10],
|
||||
})
|
||||
if job_id:
|
||||
from scripts.task_runner import submit_task
|
||||
submit_task(db_path, "scrape_url", job_id)
|
||||
existing_urls.add(card["url"])
|
||||
new_leads += 1
|
||||
print(f"[imap] LinkedIn alert → {card['company']} — {card['title']}")
|
||||
known_message_ids.add(mid)
|
||||
continue # skip normal LLM extraction path
|
||||
```
|
||||
|
||||
**Step 5: Run all imap_sync tests**
|
||||
|
||||
```bash
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_imap_sync.py -v
|
||||
```
|
||||
Expected: all PASS (including the 3 new tests)
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/imap_sync.py tests/test_imap_sync.py
|
||||
git commit -m "feat: auto-parse LinkedIn Job Alert digest emails into pending jobs"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Home page — Add Job(s) by URL
|
||||
|
||||
**Files:**
|
||||
- Modify: `app/Home.py`
|
||||
|
||||
No unit tests — this is pure Streamlit UI. Verify manually by pasting a URL and checking the DB.
|
||||
|
||||
**Step 1: Add `_queue_url_imports` helper and the new section to `app/Home.py`**
|
||||
|
||||
Add to the imports at the top (after the existing `from scripts.db import ...` line):
|
||||
|
||||
```python
|
||||
from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \
|
||||
kill_stuck_tasks, get_task_for_job, get_active_tasks, insert_job, get_existing_urls
|
||||
```
|
||||
|
||||
Add this helper function before the Streamlit layout code (after the `init_db` call at the top):
|
||||
|
||||
```python
|
||||
def _queue_url_imports(db_path: Path, urls: list[str]) -> int:
    """Insert each URL as a pending manual job and queue a scrape_url task.

    Args:
        db_path: Path to the SQLite database.
        urls: Raw URL strings as pasted/uploaded by the user.

    Returns the count of newly queued jobs. URLs already in the database —
    and duplicates within the same batch — are skipped.
    """
    from datetime import datetime
    from scripts.scrape_url import canonicalize_url
    existing = get_existing_urls(db_path)
    queued = 0
    for url in urls:
        url = canonicalize_url(url.strip())
        if not url.startswith("http"):
            continue
        if url in existing:
            continue
        job_id = insert_job(db_path, {
            "title": "Importing…",
            "company": "",
            "url": url,
            "source": "manual",
            "location": "",
            "description": "",
            "date_found": datetime.now().isoformat()[:10],
        })
        if job_id:
            submit_task(db_path, "scrape_url", job_id)
            queued += 1
        # BUG FIX: track the URL so the same link pasted twice in one batch
        # is not inserted twice (mirrors existing_urls.add() in imap_sync).
        existing.add(url)
    return queued
|
||||
```
|
||||
|
||||
Add a new section between the Email Sync divider and the Danger Zone expander. Replace:
|
||||
|
||||
```python
|
||||
st.divider()
|
||||
|
||||
# ── Danger zone: purge + re-scrape ────────────────────────────────────────────
|
||||
```
|
||||
|
||||
with:
|
||||
|
||||
```python
|
||||
st.divider()
|
||||
|
||||
# ── Add Jobs by URL ───────────────────────────────────────────────────────────
|
||||
add_left, add_right = st.columns([3, 1])
|
||||
with add_left:
|
||||
st.subheader("Add Jobs by URL")
|
||||
st.caption("Paste job listing URLs to import and scrape in the background. "
|
||||
"Supports LinkedIn, Indeed, Glassdoor, and most job boards.")
|
||||
|
||||
url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"])
|
||||
|
||||
with url_tab:
|
||||
url_text = st.text_area(
|
||||
"urls",
|
||||
placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc",
|
||||
height=100,
|
||||
label_visibility="collapsed",
|
||||
)
|
||||
if st.button("📥 Add Jobs", key="add_urls_btn", use_container_width=True,
|
||||
disabled=not (url_text or "").strip()):
|
||||
_urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")]
|
||||
if _urls:
|
||||
_n = _queue_url_imports(DEFAULT_DB, _urls)
|
||||
if _n:
|
||||
st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. Check Job Review shortly.")
|
||||
else:
|
||||
st.info("All URLs already in the database.")
|
||||
st.rerun()
|
||||
|
||||
with csv_tab:
|
||||
csv_file = st.file_uploader("CSV with a URL column", type=["csv"],
|
||||
label_visibility="collapsed")
|
||||
if csv_file:
|
||||
import csv as _csv
|
||||
import io as _io
|
||||
reader = _csv.DictReader(_io.StringIO(csv_file.read().decode("utf-8", errors="replace")))
|
||||
_csv_urls = []
|
||||
for row in reader:
|
||||
for val in row.values():
|
||||
if val and val.strip().startswith("http"):
|
||||
_csv_urls.append(val.strip())
|
||||
break
|
||||
if _csv_urls:
|
||||
st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.")
|
||||
if st.button("📥 Import CSV Jobs", key="add_csv_btn", use_container_width=True):
|
||||
_n = _queue_url_imports(DEFAULT_DB, _csv_urls)
|
||||
st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.")
|
||||
st.rerun()
|
||||
else:
|
||||
st.warning("No URLs found — CSV must have a column whose values start with http.")
|
||||
|
||||
# Active scrape_url tasks status
|
||||
@st.fragment(run_every=3)
|
||||
def _scrape_status():
|
||||
import sqlite3 as _sq
|
||||
conn = _sq.connect(DEFAULT_DB)
|
||||
conn.row_factory = _sq.Row
|
||||
rows = conn.execute(
|
||||
"""SELECT bt.status, bt.error, j.title, j.company, j.url
|
||||
FROM background_tasks bt
|
||||
JOIN jobs j ON j.id = bt.job_id
|
||||
WHERE bt.task_type = 'scrape_url'
|
||||
AND bt.updated_at >= datetime('now', '-5 minutes')
|
||||
ORDER BY bt.updated_at DESC LIMIT 20"""
|
||||
).fetchall()
|
||||
conn.close()
|
||||
if not rows:
|
||||
return
|
||||
st.caption("Recent URL imports:")
|
||||
for r in rows:
|
||||
if r["status"] == "running":
|
||||
st.info(f"⏳ Scraping {r['url']}")
|
||||
elif r["status"] == "completed":
|
||||
label = f"{r['title']}" + (f" @ {r['company']}" if r['company'] else "")
|
||||
st.success(f"✅ {label}")
|
||||
elif r["status"] == "failed":
|
||||
st.error(f"❌ {r['url']} — {r['error'] or 'scrape failed'}")
|
||||
|
||||
_scrape_status()
|
||||
|
||||
st.divider()
|
||||
|
||||
# ── Danger zone: purge + re-scrape ────────────────────────────────────────────
|
||||
```
|
||||
|
||||
**Step 2: Check `background_tasks` schema has an `updated_at` column**
|
||||
|
||||
The status fragment queries `bt.updated_at`. Verify it exists:
|
||||
|
||||
```bash
|
||||
conda run -n job-seeker python -c "
|
||||
import sqlite3
|
||||
from scripts.db import DEFAULT_DB, init_db
|
||||
init_db(DEFAULT_DB)
|
||||
conn = sqlite3.connect(DEFAULT_DB)
|
||||
print(conn.execute('PRAGMA table_info(background_tasks)').fetchall())
|
||||
"
|
||||
```
|
||||
|
||||
If `updated_at` is missing, add a migration in `scripts/db.py`'s `_migrate_db` function:
|
||||
|
||||
```python
|
||||
try:
|
||||
conn.execute("ALTER TABLE background_tasks ADD COLUMN updated_at TEXT DEFAULT (datetime('now'))")
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
```
|
||||
|
||||
And update `update_task_status` in `db.py` to set `updated_at = datetime('now')` on every status change:
|
||||
|
||||
```python
|
||||
def update_task_status(db_path, task_id, status, error=None):
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.execute(
|
||||
"UPDATE background_tasks SET status=?, error=?, updated_at=datetime('now') WHERE id=?",
|
||||
(status, error, task_id),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
```
|
||||
|
||||
**Step 3: Restart the UI and manually verify**
|
||||
|
||||
```bash
|
||||
bash /devl/job-seeker/scripts/manage-ui.sh restart
|
||||
```
|
||||
|
||||
Test:
|
||||
1. Paste `https://www.linkedin.com/jobs/view/4376518925/` into the text area
|
||||
2. Click "📥 Add Jobs" — should show "Queued 1 job for import"
|
||||
3. Go to Job Review → should see a pending job (Reflow - Customer Success Manager once scraped)
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add app/Home.py
|
||||
git commit -m "feat: add 'Add Jobs by URL' section to Home page with background scraping"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Final: push to remote
|
||||
|
||||
```bash
|
||||
git push origin main
|
||||
```
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,474 +0,0 @@
|
|||
# Job Seeker Platform — Monetization Business Plan
|
||||
|
||||
**Date:** 2026-02-24
|
||||
**Status:** Draft — pre-VC pitch
|
||||
**Author:** Brainstorming session
|
||||
|
||||
---
|
||||
|
||||
## 1. Product Overview
|
||||
|
||||
An automated job discovery, resume matching, and application pipeline platform. Built originally as a personal tool for a single job seeker; architecture is already generalized — user identity, preferences, and data are fully parameterized via onboarding, not hardcoded.
|
||||
|
||||
### Core pipeline
|
||||
```
|
||||
Job Discovery (multi-board) → Resume Matching → Job Review UI
|
||||
→ Apply Workspace (cover letter + PDF)
|
||||
→ Interviews Kanban (phone_screen → offer → hired)
|
||||
→ Notion Sync
|
||||
```
|
||||
|
||||
### Key feature surface
|
||||
- Multi-board job discovery (LinkedIn, Indeed, Glassdoor, ZipRecruiter, Google, Adzuna, The Ladders)
|
||||
- LinkedIn Alert email ingestion + email classifier (interview requests, rejections, surveys)
|
||||
- Resume keyword matching + match scoring
|
||||
- AI cover letter generation (local model, shared hosted model, or cloud LLM)
|
||||
- Company research briefs (web scrape + LLM synthesis)
|
||||
- Interview prep + practice Q&A
|
||||
- Culture-fit survey assistant with vision/screenshot support
|
||||
- Application pipeline kanban with stage tracking
|
||||
- Notion sync for external tracking
|
||||
- Mission alignment + accessibility preferences (personal decision-making only)
|
||||
- Per-user fine-tuned cover letter model (trained on user's own writing corpus)
|
||||
|
||||
---
|
||||
|
||||
## 2. Target Market
|
||||
|
||||
### Primary: Individual job seekers (B2C)
|
||||
- Actively searching, technically comfortable, value privacy
|
||||
- Frustrated by manual tracking (spreadsheets, Notion boards)
|
||||
- Want AI-assisted applications without giving their data to a third party
|
||||
- Typical job search duration: 3–6 months → average subscription length ~4.5 months
|
||||
|
||||
### Secondary: Career coaches (B2B, seat-based)
|
||||
- Manage 10–20 active clients simultaneously
|
||||
- High willingness to pay for tools that make their service more efficient
|
||||
- **20× revenue multiplier** vs. solo users (base + per-seat pricing)
|
||||
|
||||
### Tertiary: Outplacement firms / staffing agencies (B2B enterprise)
|
||||
- Future expansion; validates product-market fit at coach tier first
|
||||
|
||||
---
|
||||
|
||||
## 3. Distribution Model
|
||||
|
||||
### Starting point: Local-first (self-hosted)
|
||||
|
||||
Users run the application on their own machine via Docker Compose or a native installer. All job data, resume data, and preferences stay local. AI features are optional and configurable — users can use their own LLM backends or subscribe for hosted AI.
|
||||
|
||||
**Why local-first:**
|
||||
- Zero infrastructure cost per free user
|
||||
- Strong privacy story (no job search data on your servers)
|
||||
- Reversible — easy to add a hosted SaaS path later without a rewrite
|
||||
- Aligns with the open core licensing model
|
||||
|
||||
### Future path: Cloud Edition (SaaS)
|
||||
|
||||
Same codebase deployed as a hosted service. Users sign up at a URL, no install required. Unlocked when revenue and user feedback validate the market.
|
||||
|
||||
**Architecture readiness:** The config layer, per-user data isolation, and SQLite-per-user design already support multi-tenancy with minimal refactoring. SaaS is a deployment mode, not a rewrite.
|
||||
|
||||
---
|
||||
|
||||
## 4. Licensing Strategy
|
||||
|
||||
### Open Core
|
||||
|
||||
| Component | License | Rationale |
|
||||
|---|---|---|
|
||||
| Job discovery pipeline | MIT | Community maintains scrapers (boards break constantly) |
|
||||
| SQLite schema + `db.py` | MIT | Interoperability, trust |
|
||||
| Application pipeline state machine | MIT | Core value is visible, auditable |
|
||||
| Streamlit UI shell | MIT | Community contributions, forks welcome |
|
||||
| AI cover letter generation | BSL 1.1 | Proprietary prompt engineering + model routing |
|
||||
| Company research synthesis | BSL 1.1 | LLM orchestration is the moat |
|
||||
| Interview prep + practice Q&A | BSL 1.1 | Premium feature |
|
||||
| Survey assistant (vision) | BSL 1.1 | Premium feature |
|
||||
| Email classifier | BSL 1.1 | Premium feature |
|
||||
| Notion sync | BSL 1.1 | Integration layer |
|
||||
| Team / multi-user features | Proprietary | Future enterprise feature |
|
||||
| Analytics dashboard | Proprietary | Future feature |
|
||||
| Fine-tuned model weights | Proprietary | Per-user, not redistributable |
|
||||
|
||||
**Business Source License (BSL 1.1):** Code is visible and auditable on GitHub. Free for personal, non-commercial self-hosting. Commercial use or SaaS re-hosting requires a paid license. Converts to MIT after 4 years. Used by HashiCorp (Vault, Terraform), MariaDB, and others — well understood by the VC community.
|
||||
|
||||
**Why this works here:** The value is not in the code. A competitor could clone the repo and still not have: the fine-tuned model, the user's corpus, the orchestration prompts, or the UX polish. The moat is the system, not any individual file.
|
||||
|
||||
---
|
||||
|
||||
## 5. Tier Structure
|
||||
|
||||
### Free — $0/mo
|
||||
Self-hosted, local-only. Genuinely useful as a privacy-respecting job tracker.
|
||||
|
||||
| Feature | Included |
|
||||
|---|---|
|
||||
| Multi-board job discovery | ✓ |
|
||||
| Custom board scrapers (Adzuna, The Ladders) | ✓ |
|
||||
| LinkedIn Alert email ingestion | ✓ |
|
||||
| Add jobs by URL | ✓ |
|
||||
| Resume keyword matching | ✓ |
|
||||
| Cover letter generation (local Ollama only) | ✓ |
|
||||
| Application pipeline kanban | ✓ |
|
||||
| Mission alignment + accessibility preferences | ✓ |
|
||||
| Search profiles | 1 |
|
||||
| AI backend | User's local Ollama |
|
||||
| Support | Community (GitHub Discussions) |
|
||||
|
||||
**Purpose:** Acquisition engine. GitHub stars = distribution. Users who get a job on free tier refer friends.
|
||||
|
||||
---
|
||||
|
||||
### Paid — $12/mo
|
||||
For job seekers who want quality AI output without GPU setup or API key management.
|
||||
|
||||
Includes everything in Free, plus:
|
||||
|
||||
| Feature | Included |
|
||||
|---|---|
|
||||
| Shared hosted fine-tuned cover letter model | ✓ |
|
||||
| Claude API (BYOK — bring your own key) | ✓ |
|
||||
| Company research briefs | ✓ |
|
||||
| Interview prep + practice Q&A | ✓ |
|
||||
| Survey assistant (vision/screenshot) | ✓ |
|
||||
| Search criteria LLM suggestions | ✓ |
|
||||
| Email classifier | ✓ |
|
||||
| Notion sync | ✓ |
|
||||
| Search profiles | 5 |
|
||||
| Support | Email |
|
||||
|
||||
**Purpose:** Primary revenue tier. High margin, low support burden. Targets the individual job seeker who wants "it just works."
|
||||
|
||||
---
|
||||
|
||||
### Premium — $29/mo
|
||||
For power users and career coaches who want best-in-class output and personal model training.
|
||||
|
||||
Includes everything in Paid, plus:
|
||||
|
||||
| Feature | Included |
|
||||
|---|---|
|
||||
| Claude Sonnet (your hosted key, 150 ops/mo included) | ✓ |
|
||||
| Per-user fine-tuned model (trained on their corpus) | ✓ (one-time onboarding) |
|
||||
| Corpus re-training | ✓ (quarterly) |
|
||||
| Search profiles | Unlimited |
|
||||
| Multi-user / coach mode | ✓ (+$15/seat) |
|
||||
| Shared job pool across seats | ✓ |
|
||||
| Priority support + onboarding call | ✓ |
|
||||
|
||||
**Purpose:** Highest LTV tier. Coach accounts at 3+ seats generate $59–$239/mo each. Fine-tuned personal model is a high-perceived-value differentiator that costs ~$0.50 to produce.
|
||||
|
||||
---
|
||||
|
||||
## 6. AI Inference — Claude API Cost Model
|
||||
|
||||
Pricing basis: Haiku 4.5 = $0.80/MTok in · $4/MTok out | Sonnet 4.6 = $3/MTok in · $15/MTok out
|
||||
|
||||
### Per-operation costs
|
||||
|
||||
| Operation | Tokens In | Tokens Out | Haiku | Sonnet |
|
||||
|---|---|---|---|---|
|
||||
| Cover letter generation | ~2,400 | ~400 | $0.0035 | $0.013 |
|
||||
| Company research brief | ~3,000 | ~800 | $0.0056 | $0.021 |
|
||||
| Survey Q&A (5 questions) | ~3,000 | ~1,500 | $0.0084 | $0.031 |
|
||||
| Job description enrichment | ~800 | ~300 | $0.0018 | $0.007 |
|
||||
| Search criteria suggestion | ~400 | ~200 | $0.0010 | $0.004 |
|
||||
|
||||
### Monthly inference cost per active user
|
||||
Assumptions: 12 cover letters, 3 research briefs, 2 surveys, 40 enrichments, 2 search suggestions
|
||||
|
||||
| Backend mix | Cost/user/mo |
|
||||
|---|---|
|
||||
| Haiku only (paid tier) | ~$0.15 |
|
||||
| Sonnet only | ~$0.57 |
|
||||
| Mixed: Sonnet for CL + research, Haiku for rest (premium tier) | ~$0.31 |
|
||||
|
||||
### Per-user fine-tuning cost (premium, one-time)
|
||||
| Provider | Cost |
|
||||
|---|---|
|
||||
| User's local GPU | $0 |
|
||||
| RunPod A100 (~20 min) | $0.25–$0.40 |
|
||||
| Together AI / Replicate | $0.50–$0.75 |
|
||||
| Quarterly re-train | Same as above |
|
||||
|
||||
**Amortized over 12 months:** ~$0.04–$0.06/user/mo
|
||||
|
||||
---
|
||||
|
||||
## 7. Full Infrastructure Cost Model
|
||||
|
||||
Local-first architecture means most compute runs on the user's machine. Your infra is limited to: AI inference API calls, shared model serving, fine-tune jobs, license/auth server, and storage for model artifacts.
|
||||
|
||||
### Monthly infrastructure at 100K users
|
||||
(4% paid conversion = 4,000 paid; 20% of paid premium = 800 premium)
|
||||
|
||||
| Cost center | Detail | Monthly cost |
|
||||
|---|---|---|
|
||||
| Claude API inference (paid tier, Haiku) | 4,000 users × $0.15 | $600 |
|
||||
| Claude API inference (premium tier, mixed) | 800 users × $0.31 | $248 |
|
||||
| Shared model serving (Together AI, 3B model) | 48,000 requests/mo | $27 |
|
||||
| Per-user fine-tune jobs | 800 users / 12mo × $0.50 | $33 |
|
||||
| App hosting (license server, auth API, DB) | VPS + PostgreSQL | $200 |
|
||||
| Model artifact storage (800 × 1.5GB on S3) | 1.2TB | $28 |
|
||||
| **Total** | | **$1,136/mo** |
|
||||
|
||||
---
|
||||
|
||||
## 8. Revenue Model & Unit Economics
|
||||
|
||||
### Monthly revenue at scale
|
||||
|
||||
| Total users | Paid (4%) | Premium (20% of paid) | Revenue/mo | Infra/mo | **Gross margin** |
|
||||
|---|---|---|---|---|---|
|
||||
| 10,000 | 400 | 80 | $7,120 | $196 | **97.2%** |
|
||||
| 100,000 | 4,000 | 800 | $88,250 | $1,136 | **98.7%** |
|
||||
|
||||
### Blended ARPU
|
||||
- Across all users (including free): **~$0.71/user/mo**
|
||||
- Across paying users only: **~$17.30/user/mo**
|
||||
- Coach account (3 seats avg): **~$74/mo**
|
||||
|
||||
### LTV per user segment
|
||||
- Paid individual (4.5mo avg job search): **~$54**
|
||||
- Premium individual (4.5mo avg): **~$130**
|
||||
- Coach account (ongoing, low churn): **$74/mo × 18mo estimated = ~$1,330**
|
||||
- **Note:** Success churn is real — users leave when they get a job. Re-subscription rate on next job search partially offsets this.
|
||||
|
||||
### ARR projections
|
||||
|
||||
| Scale | ARR |
|
||||
|---|---|
|
||||
| 10K users | **~$85K** |
|
||||
| 100K users | **~$1.06M** |
|
||||
| 1M users | **~$10.6M** |
|
||||
|
||||
To reach $10M ARR: ~1M total users **or** meaningful coach/enterprise penetration at lower user counts.
|
||||
|
||||
---
|
||||
|
||||
## 9. VC Pitch Angles
|
||||
|
||||
### The thesis
|
||||
> "GitHub is our distribution channel. Local-first is our privacy moat. Coaches are our revenue engine."
|
||||
|
||||
### Key metrics to hit before Series A
|
||||
- 10K GitHub stars (validates distribution thesis)
|
||||
- 500 paying users (validates willingness to pay)
|
||||
- 20 coach accounts (validates B2B multiplier)
|
||||
- 97%+ gross margin (already proven in model)
|
||||
|
||||
### Competitive differentiation
|
||||
1. **Privacy-first** — job search data never leaves your machine on free/paid tiers
|
||||
2. **Fine-tuned personal model** — no other tool trains a cover letter model on your specific writing voice
|
||||
3. **Full pipeline** — discovery through hired, not just one step (most competitors are point solutions)
|
||||
4. **Open core** — community maintains job board scrapers, which break constantly; competitors pay engineers for this
|
||||
5. **LLM-agnostic** — works with Ollama, Claude, GPT, vLLM; users aren't locked to one provider
|
||||
|
||||
### Risks to address
|
||||
- **Success churn** — mitigated by re-subscription on next job search, coach accounts (persistent), and potential pivot to ongoing career management
|
||||
- **Job board scraping fragility** — mitigated by open core (community patches), multiple board sources, email ingestion fallback
|
||||
- **LLM cost spikes** — mitigated by Haiku-first routing, local model fallback, user BYOK option
|
||||
- **Copying by incumbents** — LinkedIn, Indeed have distribution but not privacy story; fine-tuned personal model is hard to replicate at their scale
|
||||
|
||||
---
|
||||
|
||||
## 10. Roadmap
|
||||
|
||||
### Phase 1 — Local-first launch (now)
|
||||
- Docker Compose installer + setup wizard
|
||||
- License key server (simple, hosted)
|
||||
- Paid tier: shared model endpoint + Notion sync + email classifier
|
||||
- Premium tier: fine-tune pipeline + Claude API routing
|
||||
- Open core GitHub repo (MIT core, BSL premium)
|
||||
|
||||
### Phase 2 — Coach tier validation (3–6 months post-launch)
|
||||
- Multi-user mode with seat management
|
||||
- Coach dashboard: shared job pool, per-candidate pipeline view
|
||||
- Billing portal (Stripe)
|
||||
- Outplacement firm pilot
|
||||
|
||||
### Phase 3 — Cloud Edition (6–12 months, revenue-funded or post-seed)
|
||||
- Hosted SaaS version at a URL (no install)
|
||||
- Same codebase, cloud deployment mode
|
||||
- Converts local-first users who want convenience
|
||||
- Enables mobile access
|
||||
|
||||
### Phase 4 — Enterprise (post-Series A)
|
||||
- SSO / SAML
|
||||
- Admin dashboard + analytics
|
||||
- API for ATS integrations
|
||||
- Custom fine-tune models for outplacement firm's brand voice
|
||||
|
||||
---
|
||||
|
||||
## 11. Competitive Landscape
|
||||
|
||||
### Direct competitors
|
||||
|
||||
| Product | Price | Pipeline | AI CL | Privacy | Fine-tune | Open Source |
|
||||
|---|---|---|---|---|---|---|
|
||||
| **Job Seeker Platform** | Free–$29 | Full (discovery→hired) | Personal fine-tune | Local-first | Per-user | Core (MIT) |
|
||||
| Teal | Free/$29 | Partial (tracker + resume) | Generic AI | Cloud | No | No |
|
||||
| Jobscan | $49.95 | Resume scan only | No | Cloud | No | No |
|
||||
| Huntr | Free/$30 | Tracker only | No | Cloud | No | No |
|
||||
| Rezi | $29 | Resume/CL only | Generic AI | Cloud | No | No |
|
||||
| Kickresume | $19 | Resume/CL only | Generic AI | Cloud | No | No |
|
||||
| LinkedIn Premium | $40 | Job search only | No | Cloud (them) | No | No |
|
||||
| AIHawk | Free | LinkedIn Easy Apply | No | Local | No | Yes (MIT) |
|
||||
| Simplify | Free | Auto-fill only | No | Extension | No | No |
|
||||
|
||||
### Competitive analysis
|
||||
|
||||
**Teal** ($29/mo) is the closest feature competitor — job tracker + resume builder + AI cover letters. Key gaps: cloud-only (privacy risk), no discovery automation, generic AI (not fine-tuned to your voice), no interview prep, no email classifier. Their paid tier costs the same as our premium and delivers substantially less.
|
||||
|
||||
**Jobscan** ($49.95/mo) is the premium ATS-optimization tool. Single-purpose, no pipeline, no cover letters. Overpriced for what it does. Users often use it alongside a tracker — this platform replaces both.
|
||||
|
||||
**AIHawk** (open source) automates LinkedIn Easy Apply but has no pipeline, no AI beyond form filling, no cover letter gen, no tracking. It's a macro, not a platform. We already integrate with it as a downstream action. We're complementary, not competitive at the free tier.
|
||||
|
||||
**LinkedIn Premium** ($40/mo) has distribution but actively works against user privacy and owns the candidate relationship. Users are the product. Our privacy story is a direct counter-positioning.
|
||||
|
||||
### The whitespace
|
||||
|
||||
No competitor offers all three of: **full pipeline automation + privacy-first local storage + personalized fine-tuned AI**. Every existing tool is either a point solution (just resume, just tracker, just auto-apply) or cloud-based SaaS that monetizes user data. The combination is the moat.
|
||||
|
||||
### Indirect competition
|
||||
|
||||
- **Spreadsheets + Notion templates** — free, flexible, no AI. The baseline we replace for free users.
|
||||
- **Recruiting agencies** — human-assisted job search; we're a complement, not a replacement.
|
||||
- **Career coaches** — we sell *to* them, not against them.
|
||||
|
||||
---
|
||||
|
||||
## 12. Go-to-Market Strategy
|
||||
|
||||
### Phase 1: Developer + privacy community launch
|
||||
|
||||
**Channel:** GitHub → Hacker News → Reddit
|
||||
|
||||
The open core model makes GitHub the primary distribution channel. A compelling README, one-command Docker install, and a working free tier are the launch. Target communities:
|
||||
|
||||
- Hacker News "Show HN" — privacy-first self-hosted tools get strong traction
|
||||
- r/cscareerquestions (1.2M members) — active job seekers, technically literate
|
||||
- r/selfhosted (2.8M members) — prime audience for local-first tools
|
||||
- r/ExperiencedDevs, r/remotework — secondary seeding
|
||||
|
||||
**Goal:** 1,000 GitHub stars and 100 free installs in first 30 days.
|
||||
|
||||
**Content hook:** "I built a private job search AI that runs entirely on your machine — no data leaves your computer." Privacy angle resonates deeply post-2024 data breach fatigue.
|
||||
|
||||
### Phase 2: Career coaching channel
|
||||
|
||||
**Channel:** LinkedIn → direct outreach → coach partnerships
|
||||
|
||||
Career coaches are the highest-LTV customer and the most efficient channel to reach many job seekers at once. One coach onboarded = 10–20 active users.
|
||||
|
||||
Tactics:
|
||||
- Identify coaches on LinkedIn who post about job search tools
|
||||
- Offer white-glove onboarding + 60-day free trial of coach seats
|
||||
- Co-create content: "How I run 15 client job searches simultaneously"
|
||||
- Referral program: coach gets 1 free seat per paid client referral
|
||||
|
||||
**Goal:** 20 coach accounts within 90 days of paid tier launch.
|
||||
|
||||
### Phase 3: Content + SEO (SaaS phase)
|
||||
|
||||
Once the hosted Cloud Edition exists, invest in organic content:
|
||||
|
||||
- "Best job tracker apps 2027" (comparison content — we win on privacy + AI)
|
||||
- "How to write a cover letter that sounds like you, not ChatGPT"
|
||||
- "Job search automation without giving LinkedIn your data"
|
||||
- Tutorial videos: full setup walkthrough, fine-tuning demo
|
||||
|
||||
**Goal:** 10K organic monthly visitors driving 2–5% free tier signups.
|
||||
|
||||
### Phase 4: Outplacement firm partnerships (enterprise)
|
||||
|
||||
Target HR consultancies and outplacement firms (Challenger, Gray & Christmas; Right Management; Lee Hecht Harrison). These firms place thousands of candidates per year and pay per-seat enterprise licenses.
|
||||
|
||||
**Goal:** 3 enterprise pilots within 12 months of coach tier validation.
|
||||
|
||||
### Pricing strategy by channel
|
||||
|
||||
| Channel | Entry offer | Conversion lever |
|
||||
|---|---|---|
|
||||
| GitHub / OSS | Free forever | Upgrade friction: GPU setup, no shared model |
|
||||
| Direct / ProductHunt | Free 30-day paid trial | AI quality gap is immediately visible |
|
||||
| Coach outreach | Free 60-day coach trial | Efficiency gain across client base |
|
||||
| Enterprise | Pilot with 10 seats | ROI vs. current manual process |
|
||||
|
||||
### Key metrics by phase
|
||||
|
||||
| Phase | Primary metric | Target |
|
||||
|---|---|---|
|
||||
| Launch | GitHub stars | 1K in 30 days |
|
||||
| Paid validation | Paying users | 500 in 90 days |
|
||||
| Coach validation | Coach accounts | 20 in 90 days |
|
||||
| SaaS launch | Cloud signups | 10K in 6 months |
|
||||
| Enterprise | ARR from enterprise | $100K in 12 months |
|
||||
|
||||
---
|
||||
|
||||
## 13. Pricing Sensitivity Analysis
|
||||
|
||||
### Paid tier sensitivity ($8 / $12 / $15 / $20)
|
||||
|
||||
Assumption: 100K total users, 4% base conversion, gross infra cost $1,136/mo
|
||||
|
||||
| Price | Conversion assumption | Paying users | Revenue/mo | Gross margin |
|
||||
|---|---|---|---|---|
|
||||
| $8 | 5.5% (price-elastic) | 5,500 | $44,000 | 97.4% |
|
||||
| **$12** | **4.0% (base)** | **4,000** | **$48,000** | **97.6%** |
|
||||
| $15 | 3.2% (slight drop) | 3,200 | $48,000 | 97.6% |
|
||||
| $20 | 2.5% (meaningful drop) | 2,500 | $50,000 | 97.7% |
|
||||
|
||||
**Finding:** Revenue is relatively flat between $12 and $20 because conversion drops offset the price increase. $12 is the sweet spot — maximizes paying user count (more data, more referrals, more upgrade candidates) without sacrificing revenue. Going below $10 requires meaningfully higher conversion to justify.
|
||||
|
||||
### Premium tier sensitivity ($19 / $29 / $39 / $49)
|
||||
|
||||
Assumption: 800 base premium users (20% of 4,000 paid), conversion adjusts with price
|
||||
|
||||
| Price | Conversion from paid | Premium users | Revenue/mo | Fine-tune cost | Net/mo |
|
||||
|---|---|---|---|---|---|
|
||||
| $19 | 25% | 1,000 | $19,000 | $42 | $18,958 |
|
||||
| **$29** | **20%** | **800** | **$23,200** | **$33** | **$23,167** |
|
||||
| $39 | 15% | 600 | $23,400 | $25 | $23,375 |
|
||||
| $49 | 10% | 400 | $19,600 | $17 | $19,583 |
|
||||
|
||||
**Finding:** $29–$39 is the revenue-maximizing range. $29 wins on user volume (more fine-tune data, stronger coach acquisition funnel). $39 wins marginally on revenue but shrinks the premium base significantly. Recommend $29 at launch with the option to test $34–$39 once the fine-tuned model quality is demonstrated.
|
||||
|
||||
### Coach seat sensitivity ($10 / $15 / $20 per seat)
|
||||
|
||||
Assumption: 50 coach accounts, 3 seats avg, base $29 already captured above
|
||||
|
||||
| Seat price | Seat revenue/mo | Total coach revenue/mo |
|
||||
|---|---|---|
|
||||
| $10 | $1,500 | $1,500 |
|
||||
| **$15** | **$2,250** | **$2,250** |
|
||||
| $20 | $3,000 | $3,000 |
|
||||
|
||||
**Finding:** Seat pricing is relatively inelastic for coaches — $15–$20 is well within their cost of tools per client. $15 is conservative and easy to raise. $20 is defensible once coach ROI is documented. Consider $15 at launch, $20 after first 20 coach accounts are active.
|
||||
|
||||
### Blended revenue at optimized pricing (100K users)
|
||||
|
||||
| Component | Users | Price | Revenue/mo |
|
||||
|---|---|---|---|
|
||||
| Paid tier | 4,000 | $12 | $48,000 |
|
||||
| Premium individual | 720 | $29 | $20,880 |
|
||||
| Premium coach base | 80 | $29 | $2,320 |
|
||||
| Coach seats (80 accounts × 3 avg) | 240 seats | $15 | $3,600 |
|
||||
| **Total** | | | **$74,800/mo** |
|
||||
| Infrastructure | | | -$1,136/mo |
|
||||
| **Net** | | | **$73,664/mo (~$884K ARR)** |
|
||||
|
||||
### Sensitivity to conversion rate (at $12/$29 pricing, 100K users)
|
||||
|
||||
| Free→Paid conversion | Paid→Premium conversion | Revenue/mo | ARR |
|
||||
|---|---|---|---|
|
||||
| 2% | 15% | $30,720 | $369K |
|
||||
| 3% | 18% | $47,664 | $572K |
|
||||
| **4%** | **20%** | **$65,600** | **$787K** |
|
||||
| 5% | 22% | $84,480 | $1.01M |
|
||||
| 6% | 25% | $104,400 | $1.25M |
|
||||
|
||||
**Key insight:** Conversion rate is the highest-leverage variable. Going from 4% → 5% free-to-paid conversion adds $228K ARR at 100K users. Investment in onboarding quality and the free-tier value proposition has outsized return vs. price adjustments.
|
||||
|
|
@ -1,367 +0,0 @@
|
|||
# CircuitForge License Server — Design Document
|
||||
|
||||
**Date:** 2026-02-25
|
||||
**Status:** Approved — ready for implementation
|
||||
|
||||
---
|
||||
|
||||
## Goal
|
||||
|
||||
Build a self-hosted licensing server for Circuit Forge LLC products. v1 serves Peregrine; schema is multi-product from day one. Enforces free / paid / premium / ultra tier gates with offline-capable JWT validation, 30-day refresh cycle, 7-day grace period, seat tracking, usage telemetry, and a content violation flagging foundation.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ circuitforge-license (Heimdall:8600) │
|
||||
│ FastAPI + SQLite + RS256 JWT │
|
||||
│ │
|
||||
│ Public API (/v1/…): │
|
||||
│ POST /v1/activate → issue JWT │
|
||||
│ POST /v1/refresh → renew JWT │
|
||||
│ POST /v1/deactivate → free a seat │
|
||||
│ POST /v1/usage → record usage event │
|
||||
│ POST /v1/flag → report violation │
|
||||
│ │
|
||||
│ Admin API (/admin/…, bearer token): │
|
||||
│ POST/GET /admin/keys → CRUD keys │
|
||||
│ DELETE /admin/keys/{id} → revoke │
|
||||
│ GET /admin/activations → audit │
|
||||
│ GET /admin/usage → telemetry │
|
||||
│ GET/PATCH /admin/flags → flag review │
|
||||
└─────────────────────────────────────────────────┘
|
||||
↑ HTTPS via Caddy (license.circuitforge.com)
|
||||
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Peregrine (user's machine) │
|
||||
│ scripts/license.py │
|
||||
│ │
|
||||
│ activate(key) → POST /v1/activate │
|
||||
│ writes config/license.json │
|
||||
│ verify_local() → validates JWT offline │
|
||||
│ using embedded public key │
|
||||
│ refresh_if_needed() → called on app startup │
|
||||
│ effective_tier() → tier string for can_use() │
|
||||
│ report_usage(…) → fire-and-forget telemetry │
|
||||
│ report_flag(…) → fire-and-forget violation │
|
||||
└─────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Key properties:**
|
||||
- Peregrine verifies tier **offline** on every check — RS256 public key embedded at build time
|
||||
- Network required only at activation and 30-day refresh
|
||||
- Revoked keys stop working at next refresh cycle (≤30 day lag — acceptable for v1)
|
||||
- `config/license.json` gitignored; missing = free tier
|
||||
|
||||
---
|
||||
|
||||
## Crypto: RS256 (asymmetric JWT)
|
||||
|
||||
- **Private key** — lives only on the license server (`keys/private.pem`, gitignored)
|
||||
- **Public key** — committed to both the license server repo and Peregrine (`scripts/license_public_key.pem`)
|
||||
- Peregrine can verify JWT authenticity without ever knowing the private key
|
||||
- A stolen JWT cannot be forged without the private key
|
||||
- Revocation: the server refuses to refresh; the old JWT remains valid until it expires, after which the grace period runs out and the client falls back to the free tier
|
||||
|
||||
**Key generation (one-time, on Heimdall):**
|
||||
```bash
|
||||
openssl genrsa -out keys/private.pem 2048
|
||||
openssl rsa -in keys/private.pem -pubout -out keys/public.pem
|
||||
# copy keys/public.pem → peregrine/scripts/license_public_key.pem
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
CREATE TABLE license_keys (
|
||||
id TEXT PRIMARY KEY, -- UUID
|
||||
key_display TEXT UNIQUE NOT NULL, -- CFG-PRNG-XXXX-XXXX-XXXX
|
||||
product TEXT NOT NULL, -- peregrine | falcon | osprey | …
|
||||
tier TEXT NOT NULL, -- paid | premium | ultra
|
||||
seats INTEGER DEFAULT 1,
|
||||
valid_until TEXT, -- ISO date or NULL (perpetual)
|
||||
revoked INTEGER DEFAULT 0,
|
||||
customer_email TEXT, -- proper field, not buried in notes
|
||||
source TEXT DEFAULT 'manual', -- manual | beta | promo | stripe
|
||||
trial INTEGER DEFAULT 0, -- 1 = time-limited trial key
|
||||
notes TEXT,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE activations (
|
||||
id TEXT PRIMARY KEY,
|
||||
key_id TEXT NOT NULL REFERENCES license_keys(id),
|
||||
machine_id TEXT NOT NULL, -- sha256(hostname + MAC)
|
||||
app_version TEXT, -- Peregrine version at last refresh
|
||||
platform TEXT, -- linux | macos | windows | docker
|
||||
activated_at TEXT NOT NULL,
|
||||
last_refresh TEXT NOT NULL,
|
||||
deactivated_at TEXT -- NULL = still active
|
||||
);
|
||||
|
||||
CREATE TABLE usage_events (
|
||||
id TEXT PRIMARY KEY,
|
||||
key_id TEXT NOT NULL REFERENCES license_keys(id),
|
||||
machine_id TEXT NOT NULL,
|
||||
product TEXT NOT NULL,
|
||||
event_type TEXT NOT NULL, -- cover_letter_generated |
|
||||
-- company_research | email_sync |
|
||||
-- interview_prep | survey | etc.
|
||||
metadata TEXT, -- JSON blob for context
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE flags (
|
||||
id TEXT PRIMARY KEY,
|
||||
key_id TEXT NOT NULL REFERENCES license_keys(id),
|
||||
machine_id TEXT,
|
||||
product TEXT NOT NULL,
|
||||
flag_type TEXT NOT NULL, -- content_violation | tos_violation |
|
||||
-- abuse | manual
|
||||
details TEXT, -- JSON: prompt snippet, output excerpt
|
||||
status TEXT DEFAULT 'open', -- open | reviewed | dismissed | actioned
|
||||
created_at TEXT NOT NULL,
|
||||
reviewed_at TEXT,
|
||||
action_taken TEXT -- none | warned | revoked
|
||||
);
|
||||
|
||||
CREATE TABLE audit_log (
|
||||
id TEXT PRIMARY KEY,
|
||||
entity_type TEXT NOT NULL, -- key | activation | flag
|
||||
entity_id TEXT NOT NULL,
|
||||
action TEXT NOT NULL, -- created | revoked | activated |
|
||||
-- deactivated | flag_actioned
|
||||
actor TEXT, -- admin identifier (future multi-admin)
|
||||
details TEXT, -- JSON
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
**Flags scope (v1):** Schema and `POST /v1/flag` endpoint capture data. No admin enforcement UI in v1 — query DB directly. Build review UI in v2 when there's data to act on.
|
||||
|
||||
---
|
||||
|
||||
## JWT Payload
|
||||
|
||||
```json
|
||||
{
|
||||
"sub": "CFG-PRNG-A1B2-C3D4-E5F6",
|
||||
"product": "peregrine",
|
||||
"tier": "paid",
|
||||
"seats": 2,
|
||||
"machine": "a3f9c2…",
|
||||
"notice": "Version 1.1 available — see circuitforge.com/update",
|
||||
"iat": 1740000000,
|
||||
"exp": 1742592000
|
||||
}
|
||||
```
|
||||
|
||||
`notice` is optional — set via a server config value; included in refresh responses so Peregrine can surface it as a banner. No DB table needed.
|
||||
|
||||
---
|
||||
|
||||
## Key Format
|
||||
|
||||
`CFG-PRNG-A1B2-C3D4-E5F6`
|
||||
|
||||
- `CFG` — Circuit Forge
|
||||
- `PRNG` / `FLCN` / `OSPY` / … — 4-char product code
|
||||
- Three random 4-char alphanumeric segments
|
||||
- Human-readable, easy to copy/paste into a support email
|
||||
|
||||
---
|
||||
|
||||
## Endpoint Reference
|
||||
|
||||
| Method | Path | Auth | Purpose |
|
||||
|--------|------|------|---------|
|
||||
| POST | `/v1/activate` | none | Issue JWT for key + machine |
|
||||
| POST | `/v1/refresh` | JWT bearer | Renew JWT before expiry |
|
||||
| POST | `/v1/deactivate` | JWT bearer | Free a seat |
|
||||
| POST | `/v1/usage` | JWT bearer | Record usage event (fire-and-forget) |
|
||||
| POST | `/v1/flag` | JWT bearer | Report content/ToS violation |
|
||||
| POST | `/admin/keys` | admin token | Create a new key |
|
||||
| GET | `/admin/keys` | admin token | List all keys + activation counts |
|
||||
| DELETE | `/admin/keys/{id}` | admin token | Revoke a key |
|
||||
| GET | `/admin/activations` | admin token | Full activation audit |
|
||||
| GET | `/admin/usage` | admin token | Usage breakdown per key/product/event |
|
||||
| GET | `/admin/flags` | admin token | List flags (open by default) |
|
||||
| PATCH | `/admin/flags/{id}` | admin token | Update flag status + action |
|
||||
|
||||
---
|
||||
|
||||
## Peregrine Client (`scripts/license.py`)
|
||||
|
||||
**Public API:**
|
||||
```python
|
||||
def activate(key: str) -> dict # POST /v1/activate, writes license.json
|
||||
def verify_local() -> dict | None # validates JWT offline; None = free tier
|
||||
def refresh_if_needed() -> None # silent; called on app startup
|
||||
def effective_tier() -> str # "free"|"paid"|"premium"|"ultra"
|
||||
def report_usage(event_type: str, # fire-and-forget; failures silently dropped
|
||||
                 metadata: dict | None = None) -> None
|
||||
def report_flag(flag_type: str, # fire-and-forget
|
||||
details: dict) -> None
|
||||
```
|
||||
|
||||
**`effective_tier()` decision tree:**
|
||||
```
|
||||
license.json missing or unreadable → "free"
|
||||
JWT signature invalid → "free"
|
||||
JWT product != "peregrine" → "free"
|
||||
JWT not expired → tier from payload
|
||||
JWT expired, within grace period → tier from payload + show banner
|
||||
JWT expired, grace period expired → "free" + show banner
|
||||
```
|
||||
|
||||
**`config/license.json` (gitignored):**
|
||||
```json
|
||||
{
|
||||
"jwt": "eyJ…",
|
||||
"key_display": "CFG-PRNG-A1B2-C3D4-E5F6",
|
||||
"tier": "paid",
|
||||
"valid_until": "2026-03-27",
|
||||
"machine_id": "a3f9c2…",
|
||||
"last_refresh": "2026-02-25T12:00:00Z",
|
||||
"grace_until": null
|
||||
}
|
||||
```
|
||||
|
||||
**Integration point in `tiers.py`:**
|
||||
```python
|
||||
def effective_tier(profile) -> str:
|
||||
from scripts.license import effective_tier as _license_tier
|
||||
if profile.dev_tier_override: # dev override still works in dev mode
|
||||
return profile.dev_tier_override
|
||||
return _license_tier()
|
||||
```
|
||||
|
||||
**Settings License tab** (new tab in `app/pages/2_Settings.py`):
|
||||
- Text input: enter license key → calls `activate()` → shows result
|
||||
- If active: tier badge, key display string, expiry date, seat count
|
||||
- Grace period: amber banner with days remaining
|
||||
- "Deactivate this machine" button → `/v1/deactivate`, deletes `license.json`
|
||||
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
**Repo:** `git.opensourcesolarpunk.com/pyr0ball/circuitforge-license` (private)
|
||||
|
||||
**Repo layout:**
|
||||
```
|
||||
circuitforge-license/
|
||||
├── app/
|
||||
│ ├── main.py # FastAPI app
|
||||
│ ├── db.py # SQLite helpers, schema init
|
||||
│ ├── models.py # Pydantic models
|
||||
│ ├── crypto.py # RSA sign/verify helpers
|
||||
│ └── routes/
|
||||
│ ├── public.py # /v1/* endpoints
|
||||
│ └── admin.py # /admin/* endpoints
|
||||
├── data/ # SQLite DB (named volume)
|
||||
├── keys/
|
||||
│ ├── private.pem # gitignored
|
||||
│ └── public.pem # committed
|
||||
├── scripts/
|
||||
│ └── issue-key.sh # curl wrapper for key issuance
|
||||
├── tests/
|
||||
├── Dockerfile
|
||||
├── docker-compose.yml
|
||||
├── .env.example
|
||||
└── requirements.txt
|
||||
```
|
||||
|
||||
**`docker-compose.yml` (on Heimdall):**
|
||||
```yaml
|
||||
services:
|
||||
license:
|
||||
build: .
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "127.0.0.1:8600:8600"
|
||||
volumes:
|
||||
- license_data:/app/data
|
||||
- ./keys:/app/keys:ro
|
||||
env_file: .env
|
||||
|
||||
volumes:
|
||||
license_data:
|
||||
```
|
||||
|
||||
**`.env` (gitignored):**
|
||||
```
|
||||
ADMIN_TOKEN=<long random string>
|
||||
JWT_PRIVATE_KEY_PATH=/app/keys/private.pem
|
||||
JWT_PUBLIC_KEY_PATH=/app/keys/public.pem
|
||||
JWT_EXPIRY_DAYS=30
|
||||
GRACE_PERIOD_DAYS=7
|
||||
```
|
||||
|
||||
**Caddy block (add to Heimdall Caddyfile):**
|
||||
```caddy
|
||||
license.circuitforge.com {
|
||||
reverse_proxy localhost:8600
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Admin Workflow (v1)
|
||||
|
||||
All operations via `curl` or `scripts/issue-key.sh`:
|
||||
|
||||
```bash
|
||||
# Issue a key
|
||||
./scripts/issue-key.sh --product peregrine --tier paid --seats 2 \
|
||||
--email user@example.com --notes "Beta — manual payment 2026-02-25"
|
||||
# → CFG-PRNG-A1B2-C3D4-E5F6 (email to customer)
|
||||
|
||||
# List all keys
|
||||
curl https://license.circuitforge.com/admin/keys \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN"
|
||||
|
||||
# Revoke a key
|
||||
curl -X DELETE https://license.circuitforge.com/admin/keys/{id} \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
**License server:**
|
||||
- pytest with in-memory SQLite and generated test keypair
|
||||
- All endpoints tested: activate, refresh, deactivate, usage, flag, admin CRUD
|
||||
- Seat limit enforcement, expiry, revocation all unit tested
|
||||
|
||||
**Peregrine client:**
|
||||
- `verify_local()` tested with pre-signed test JWT using test keypair
|
||||
- `activate()` / `refresh()` tested with `httpx` mocks
|
||||
- `effective_tier()` tested across all states: valid, expired, grace, revoked, missing
|
||||
|
||||
**Integration smoke test:**
|
||||
```bash
|
||||
docker compose up -d
|
||||
# create test key via admin API
|
||||
# call /v1/activate with test key
|
||||
# verify JWT signature with public key
|
||||
# verify /v1/refresh extends expiry
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Decisions Log
|
||||
|
||||
| Decision | Rationale |
|
||||
|----------|-----------|
|
||||
| RS256 over HS256 | Public key embeddable in client; private key never leaves server |
|
||||
| SQLite over Postgres | Matches Peregrine's SQLite-first philosophy; trivially backupable |
|
||||
| 30-day JWT lifetime | Standard SaaS pattern; invisible to users in normal operation |
|
||||
| 7-day grace period | Covers travel, network outages, server maintenance |
|
||||
| Flags v1: capture only | No volume to justify review UI yet; add in v2 |
|
||||
| No payment integration | Manual issuance until customer volume justifies automation |
|
||||
| Multi-product schema | Adding a column now vs migrating a live DB later |
|
||||
| Separate repo | License server is infrastructure, not part of Peregrine's BSL scope |
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,106 +0,0 @@
|
|||
# Email Sync — Testing Checklist
|
||||
|
||||
Generated from audit of `scripts/imap_sync.py`.
|
||||
|
||||
## Bugs fixed (2026-02-23)
|
||||
|
||||
- [x] Gmail label with spaces not quoted for IMAP SELECT → `_quote_folder()` added
|
||||
- [x] `_quote_folder` didn't escape internal double-quotes → RFC 3501 escaping added
|
||||
- [x] `signal is None` in `_scan_unmatched_leads` allowed classifier failures through → now skips
|
||||
- [x] Email with no Message-ID re-inserted on every sync → `_parse_message` returns `None` when ID missing
|
||||
- [x] `todo_attached` missing from early-return dict in `sync_all` → added
|
||||
- [x] Body phrase check truncated at 800 chars (rejection footers missed) → bumped to 1500
|
||||
- [x] `_DONT_FORGET_VARIANTS` missing left single quotation mark `\u2018` → added
|
||||
|
||||
---
|
||||
|
||||
## Unit tests — phrase filter
|
||||
|
||||
- [x] `_has_rejection_or_ats_signal` — rejection phrase at char 1501 (boundary)
|
||||
- [x] `_has_rejection_or_ats_signal` — right single quote `\u2019` in "don't forget"
|
||||
- [x] `_has_rejection_or_ats_signal` — left single quote `\u2018` in "don't forget"
|
||||
- [x] `_has_rejection_or_ats_signal` — ATS subject phrase only checked against subject, not body
|
||||
- [x] `_has_rejection_or_ats_signal` — spam subject prefix `@` match
|
||||
- [x] `_has_rejection_or_ats_signal` — `"UNFORTUNATELY"` (uppercase → lowercased correctly)
|
||||
- [x] `_has_rejection_or_ats_signal` — phrase in body quoted thread (beyond 1500 chars) is not blocked
|
||||
|
||||
## Unit tests — folder quoting
|
||||
|
||||
- [x] `_quote_folder("TO DO JOBS")` → `'"TO DO JOBS"'`
|
||||
- [x] `_quote_folder("INBOX")` → `"INBOX"` (no spaces, no quotes added)
|
||||
- [x] `_quote_folder('My "Jobs"')` → `'"My \\"Jobs\\""'`
|
||||
- [x] `_search_folder` — folder doesn't exist → returns `[]`, no exception
|
||||
- [x] `_search_folder` — special folder `"[Gmail]/All Mail"` (brackets + slash)
|
||||
|
||||
## Unit tests — message-ID dedup
|
||||
|
||||
- [x] `_get_existing_message_ids` — NULL message_id in DB excluded from set
|
||||
- [x] `_get_existing_message_ids` — empty string `""` excluded from set
|
||||
- [x] `_get_existing_message_ids` — job with no contacts returns empty set
|
||||
- [x] `_parse_message` — email with no Message-ID header returns `None`
|
||||
- [x] `_parse_message` — email with RFC2047-encoded subject decodes correctly
|
||||
- [x] No email is inserted twice across two sync runs (integration)
|
||||
|
||||
## Unit tests — classifier & signal
|
||||
|
||||
- [x] `classify_stage_signal` — returns one of 5 labels or `None`
|
||||
- [x] `classify_stage_signal` — returns `None` on LLM error
|
||||
- [x] `classify_stage_signal` — returns `"neutral"` when no label matched in LLM output
|
||||
- [x] `classify_stage_signal` — strips `<think>…</think>` blocks
|
||||
- [x] `_scan_unmatched_leads` — skips when `signal is None`
|
||||
- [x] `_scan_unmatched_leads` — skips when `signal == "rejected"`
|
||||
- [x] `_scan_unmatched_leads` — proceeds when `signal == "neutral"`
|
||||
- [x] `extract_lead_info` — returns `(None, None)` on bad JSON
|
||||
- [x] `extract_lead_info` — returns `(None, None)` on LLM error
|
||||
|
||||
## Integration tests — TODO label scan
|
||||
|
||||
- [x] `_scan_todo_label` — `todo_label` empty string → returns 0
|
||||
- [x] `_scan_todo_label` — `todo_label` missing from config → returns 0
|
||||
- [x] `_scan_todo_label` — folder doesn't exist on IMAP server → returns 0, no crash
|
||||
- [x] `_scan_todo_label` — email matches company + action keyword → contact attached
|
||||
- [x] `_scan_todo_label` — email matches company but no action keyword → skipped
|
||||
- [x] `_scan_todo_label` — email matches no company term → skipped
|
||||
- [x] `_scan_todo_label` — duplicate message-ID → not re-inserted
|
||||
- [x] `_scan_todo_label` — stage_signal set when classifier returns non-neutral
|
||||
- [x] `_scan_todo_label` — body fallback (company only in body[:300]) → still matches
|
||||
- [x] `_scan_todo_label` — email handled by `sync_job_emails` first not re-added by label scan
|
||||
|
||||
## Integration tests — unmatched leads
|
||||
|
||||
- [x] `_scan_unmatched_leads` — genuine lead inserted with synthetic URL `email://domain/hash`
|
||||
- [x] `_scan_unmatched_leads` — same email not re-inserted on second sync run
|
||||
- [x] `_scan_unmatched_leads` — duplicate synthetic URL skipped
|
||||
- [x] `_scan_unmatched_leads` — `extract_lead_info` returns `(None, None)` → no insertion
|
||||
- [x] `_scan_unmatched_leads` — rejection phrase in body → blocked before LLM
|
||||
- [x] `_scan_unmatched_leads` — rejection phrase in quoted thread > 1500 chars → passes filter (acceptable)
|
||||
|
||||
## Integration tests — full sync
|
||||
|
||||
- [x] `sync_all` with no active jobs → returns dict with all 6 keys incl. `todo_attached: 0`
|
||||
- [x] `sync_all` return dict shape identical on all code paths
|
||||
- [x] `sync_all` with `job_ids` filter → only syncs those jobs
|
||||
- [x] `sync_all` `dry_run=True` → no DB writes
|
||||
- [x] `sync_all` `on_stage` callback fires: "connecting", "job N/M", "scanning todo label", "scanning leads"
|
||||
- [x] `sync_all` IMAP connection error → caught, returned in `errors` list
|
||||
- [x] `sync_all` per-job exception → other jobs still sync
|
||||
|
||||
## Config / UI
|
||||
|
||||
- [x] Settings UI field for `todo_label` (currently YAML-only)
|
||||
- [x] Warn in sync summary when `todo_label` folder not found on server
|
||||
- [x] Clear error message when `config/email.yaml` is missing
|
||||
- [x] `test_email_classify.py --verbose` shows correct blocking phrase for each BLOCK
|
||||
|
||||
## Backlog — Known issues
|
||||
|
||||
- [x] **The Ladders emails confuse the classifier** — promotional/job alert emails from `@theladders.com` are matching the recruitment keyword filter and being treated as leads. Fix: add a sender-based skip rule in `_scan_unmatched_leads` for known job board senders (similar to how LinkedIn Alert emails are short-circuited before the LLM classifier). Senders to exclude: `@theladders.com`, and audit for others (Glassdoor alerts, Indeed digest, ZipRecruiter, etc.).
|
||||
|
||||
---
|
||||
|
||||
## Performance & edge cases
|
||||
|
||||
- [x] Email with 10 000-char body → truncated to 4000 chars, no crash
|
||||
- [x] Email with binary attachment → `_parse_message` returns valid dict, no crash
|
||||
- [x] Email with multiple `text/plain` MIME parts → first part taken
|
||||
- [x] `get_all_message_ids` with 100 000 rows → completes in < 1s
|
||||
|
|
@ -12,41 +12,53 @@ free < paid < premium
|
|||
|
||||
| Tier | Description |
|
||||
|------|-------------|
|
||||
| `free` | Core discovery pipeline, resume matching, and basic UI — no LLM features |
|
||||
| `paid` | All AI features: cover letters, research, email, integrations, calendar, notifications |
|
||||
| `free` | Core discovery pipeline, resume matching, basic UI. AI features unlock with BYOK. |
|
||||
| `paid` | Managed cloud LLM (no key required), integrations, calendar, notifications |
|
||||
| `premium` | Adds fine-tuning and multi-user support |
|
||||
|
||||
---
|
||||
|
||||
## BYOK — Bring Your Own Key
|
||||
|
||||
If you configure any LLM backend in `config/llm.yaml` — local (ollama, vllm) **or** an external API key (Anthropic, OpenAI, etc.) — **all pure LLM-call features unlock automatically**, regardless of your subscription tier.
|
||||
|
||||
The paid tier gives you access to CircuitForge's managed cloud inference. It does not gate your ability to use AI when you're providing the compute yourself.
|
||||
|
||||
Features that unlock with BYOK are listed in `BYOK_UNLOCKABLE` in `tiers.py`. Features that depend on CircuitForge-operated infrastructure (integrations, email classifier training, fine-tuned models) remain tier-gated.
|
||||
|
||||
---
|
||||
|
||||
## Feature Gate Table
|
||||
|
||||
Features listed here require a minimum tier. Features not in this table are available to all tiers (free by default).
|
||||
|
||||
### Wizard LLM generation
|
||||
|
||||
| Feature key | Minimum tier | Description |
|
||||
|-------------|-------------|-------------|
|
||||
| `llm_career_summary` | paid | LLM-assisted career summary generation in the wizard |
|
||||
| `llm_expand_bullets` | paid | LLM expansion of resume bullet points |
|
||||
| `llm_suggest_skills` | paid | LLM skill suggestions from resume content |
|
||||
| `llm_voice_guidelines` | premium | LLM writing voice and tone guidelines |
|
||||
| `llm_job_titles` | paid | LLM-suggested job title variations for search |
|
||||
| `llm_keywords_blocklist` | paid | LLM-suggested blocklist keywords |
|
||||
| `llm_mission_notes` | paid | LLM-generated mission alignment notes |
|
||||
| Feature key | Minimum tier | BYOK unlocks? | Description |
|
||||
|-------------|-------------|---------------|-------------|
|
||||
| `llm_career_summary` | paid | ✅ yes | LLM-assisted career summary generation in the wizard |
|
||||
| `llm_expand_bullets` | paid | ✅ yes | LLM expansion of resume bullet points |
|
||||
| `llm_suggest_skills` | paid | ✅ yes | LLM skill suggestions from resume content |
|
||||
| `llm_voice_guidelines` | premium | ✅ yes | LLM writing voice and tone guidelines |
|
||||
| `llm_job_titles` | paid | ✅ yes | LLM-suggested job title variations for search |
|
||||
| `llm_mission_notes` | paid | ✅ yes | LLM-generated mission alignment notes |
|
||||
| `llm_keywords_blocklist` | paid | ❌ no | Orchestration pipeline over background keyword data |
|
||||
|
||||
### App features
|
||||
|
||||
| Feature key | Minimum tier | Description |
|
||||
|-------------|-------------|-------------|
|
||||
| `company_research` | paid | Auto-generated company research briefs pre-interview |
|
||||
| `interview_prep` | paid | Live reference sheet and practice Q&A during calls |
|
||||
| `email_classifier` | paid | IMAP email sync with LLM classification |
|
||||
| `survey_assistant` | paid | Culture-fit survey Q&A helper (text + screenshot) |
|
||||
| `model_fine_tuning` | premium | Cover letter model fine-tuning on personal writing |
|
||||
| `shared_cover_writer_model` | paid | Access to shared fine-tuned cover letter model |
|
||||
| `multi_user` | premium | Multiple user profiles on one instance |
|
||||
| Feature key | Minimum tier | BYOK unlocks? | Description |
|
||||
|-------------|-------------|---------------|-------------|
|
||||
| `company_research` | paid | ✅ yes | Auto-generated company research briefs pre-interview |
|
||||
| `interview_prep` | paid | ✅ yes | Live reference sheet and practice Q&A during calls |
|
||||
| `survey_assistant` | paid | ✅ yes | Culture-fit survey Q&A helper (text + screenshot) |
|
||||
| `email_classifier` | paid | ❌ no | IMAP email sync with LLM classification (training pipeline) |
|
||||
| `model_fine_tuning` | premium | ❌ no | Cover letter model fine-tuning on personal writing |
|
||||
| `shared_cover_writer_model` | paid | ❌ no | Access to shared fine-tuned cover letter model (CF infra) |
|
||||
| `multi_user` | premium | ❌ no | Multiple user profiles on one instance |
|
||||
|
||||
### Integrations (paid)
|
||||
### Integrations
|
||||
|
||||
Integrations depend on CircuitForge-operated infrastructure and are **not** BYOK-unlockable.
|
||||
|
||||
| Feature key | Minimum tier | Description |
|
||||
|-------------|-------------|-------------|
|
||||
|
|
@ -73,31 +85,46 @@ The following integrations are free for all tiers and are not in the `FEATURES`
|
|||
|
||||
## API Reference
|
||||
|
||||
### `can_use(tier, feature) -> bool`
|
||||
### `can_use(tier, feature, has_byok=False) -> bool`
|
||||
|
||||
Returns `True` if the given tier has access to the feature.
|
||||
Returns `True` if the given tier has access to the feature. Pass `has_byok=has_configured_llm()` to apply BYOK unlock logic.
|
||||
|
||||
```python
|
||||
from app.wizard.tiers import can_use
|
||||
from app.wizard.tiers import can_use, has_configured_llm
|
||||
|
||||
can_use("free", "company_research") # False
|
||||
can_use("paid", "company_research") # True
|
||||
can_use("premium", "company_research") # True
|
||||
byok = has_configured_llm()
|
||||
|
||||
can_use("free", "unknown_feature") # True — ungated features return True
|
||||
can_use("invalid", "company_research") # False — invalid tier string
|
||||
can_use("free", "company_research") # False — no LLM configured
|
||||
can_use("free", "company_research", has_byok=True) # True — BYOK unlocks it
|
||||
can_use("paid", "company_research") # True
|
||||
|
||||
can_use("free", "notion_sync", has_byok=True) # False — integration, not BYOK-unlockable
|
||||
can_use("free", "unknown_feature") # True — ungated features return True
|
||||
can_use("invalid", "company_research") # False — invalid tier string
|
||||
```
|
||||
|
||||
### `tier_label(feature) -> str`
|
||||
### `has_configured_llm(config_path=None) -> bool`
|
||||
|
||||
Returns a display badge string for locked features, or `""` if the feature is free or unknown.
|
||||
Returns `True` if at least one non-vision LLM backend is enabled in `config/llm.yaml`. Local backends (ollama, vllm) and external API keys both count.
|
||||
|
||||
```python
|
||||
from app.wizard.tiers import has_configured_llm
|
||||
|
||||
has_configured_llm() # True if any backend is enabled and not vision_service
|
||||
```
|
||||
|
||||
### `tier_label(feature, has_byok=False) -> str`
|
||||
|
||||
Returns a display badge string for locked features, or `""` if the feature is free, unlocked, or BYOK-accessible.
|
||||
|
||||
```python
|
||||
from app.wizard.tiers import tier_label
|
||||
|
||||
tier_label("company_research") # "🔒 Paid"
|
||||
tier_label("model_fine_tuning") # "⭐ Premium"
|
||||
tier_label("job_discovery") # "" (ungated)
|
||||
tier_label("company_research") # "🔒 Paid"
|
||||
tier_label("company_research", has_byok=True) # "" (BYOK unlocks, no label shown)
|
||||
tier_label("model_fine_tuning") # "⭐ Premium"
|
||||
tier_label("notion_sync", has_byok=True) # "🔒 Paid" (BYOK doesn't unlock integrations)
|
||||
tier_label("job_discovery") # "" (ungated)
|
||||
```
|
||||
|
||||
---
|
||||
|
|
@ -120,36 +147,42 @@ dev_tier_override: premium # overrides tier locally for testing
|
|||
|
||||
## Adding a New Feature Gate
|
||||
|
||||
1. Add the feature to `FEATURES` in `app/wizard/tiers.py`:
|
||||
1. Add the feature to `FEATURES` in `app/wizard/tiers.py`. If it's a pure LLM call that should unlock with BYOK, also add it to `BYOK_UNLOCKABLE`:
|
||||
|
||||
```python
|
||||
FEATURES: dict[str, str] = {
|
||||
# ...existing entries...
|
||||
"my_new_feature": "paid", # or "free" | "premium"
|
||||
"my_new_llm_feature": "paid",
|
||||
}
|
||||
|
||||
BYOK_UNLOCKABLE: frozenset[str] = frozenset({
|
||||
# ...existing entries...
|
||||
"my_new_llm_feature", # add here if it's a pure LLM call
|
||||
})
|
||||
```
|
||||
|
||||
2. Guard the feature in the UI:
|
||||
2. Guard the feature in the UI, passing `has_byok`:
|
||||
|
||||
```python
|
||||
from app.wizard.tiers import can_use, tier_label
|
||||
from scripts.user_profile import UserProfile
|
||||
from app.wizard.tiers import can_use, tier_label, has_configured_llm
|
||||
|
||||
user = UserProfile()
|
||||
if can_use(user.tier, "my_new_feature"):
|
||||
_byok = has_configured_llm()
|
||||
if can_use(user.tier, "my_new_llm_feature", has_byok=_byok):
|
||||
# show the feature
|
||||
pass
|
||||
else:
|
||||
st.info(f"My New Feature requires a {tier_label('my_new_feature').replace('🔒 ', '').replace('⭐ ', '')} plan.")
|
||||
    st.info("Requires a paid plan or a configured LLM backend.")
|
||||
```
|
||||
|
||||
3. Add a test in `tests/test_tiers.py`:
|
||||
3. Add tests in `tests/test_wizard_tiers.py` covering both the tier gate and BYOK unlock:
|
||||
|
||||
```python
|
||||
def test_my_new_feature_requires_paid():
|
||||
assert can_use("free", "my_new_feature") is False
|
||||
assert can_use("paid", "my_new_feature") is True
|
||||
assert can_use("premium", "my_new_feature") is True
|
||||
def test_my_new_feature_requires_paid_without_byok():
|
||||
assert can_use("free", "my_new_llm_feature") is False
|
||||
assert can_use("paid", "my_new_llm_feature") is True
|
||||
|
||||
def test_my_new_feature_byok_unlocks():
|
||||
assert can_use("free", "my_new_llm_feature", has_byok=True) is True
|
||||
```
|
||||
|
||||
---
|
||||
|
|
|
|||
174
docs/vue-spa-migration.md
Normal file
174
docs/vue-spa-migration.md
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
# Peregrine Vue 3 SPA Migration
|
||||
|
||||
**Branch:** `feature/vue-spa`
|
||||
**Issue:** #8 — Vue 3 SPA frontend (Paid Tier GA milestone)
|
||||
**Worktree:** `.worktrees/feature-vue-spa/`
|
||||
**Reference:** `avocet/docs/vue-port-gotchas.md` (15 battle-tested gotchas)
|
||||
|
||||
---
|
||||
|
||||
## What We're Replacing
|
||||
|
||||
The current Streamlit UI (`app/app.py` + `app/pages/`) is an internal tool built for speed of development. The Vue SPA replaces it with a proper frontend — faster, more accessible, and extensible for the Paid Tier. The FastAPI already exists (partially, from the cloud managed instance work); the Vue SPA will consume it.
|
||||
|
||||
### Pages to Port
|
||||
|
||||
| Streamlit file | Vue view | Route | Notes |
|
||||
|---|---|---|---|
|
||||
| `app/Home.py` | `HomeView.vue` | `/` | Dashboard, discovery trigger, sync status |
|
||||
| `app/pages/1_Job_Review.py` | `JobReviewView.vue` | `/review` | Batch approve/reject; primary daily-driver view |
|
||||
| `app/pages/4_Apply.py` | `ApplyView.vue` | `/apply` | Cover letter gen + PDF + mark applied |
|
||||
| `app/pages/5_Interviews.py` | `InterviewsView.vue` | `/interviews` | Kanban: phone_screen → offer → hired |
|
||||
| `app/pages/6_Interview_Prep.py` | `InterviewPrepView.vue` | `/prep` | Live reference sheet + practice Q&A |
|
||||
| `app/pages/7_Survey.py` | `SurveyView.vue` | `/survey` | Culture-fit survey assist + screenshot |
|
||||
| `app/pages/2_Settings.py` | `SettingsView.vue` | `/settings` | 6 tabs: Profile, Resume, Search, System, Fine-Tune, License |
|
||||
|
||||
---
|
||||
|
||||
## Avocet Lessons Applied — What We Fixed Before Starting
|
||||
|
||||
The avocet SPA was the testbed. These bugs were found and fixed there; Peregrine's scaffold already incorporates all fixes. See `avocet/docs/vue-port-gotchas.md` for the full writeup.
|
||||
|
||||
### Applied at scaffold level (baked in — you don't need to think about these)
|
||||
|
||||
| # | Gotcha | How it's fixed in this scaffold |
|
||||
|---|--------|----------------------------------|
|
||||
| 1 | `id="app"` on App.vue root → nested `#app` elements, broken CSS specificity | `App.vue` root uses `class="app-root"`. `#app` in `index.html` is mount target only. |
|
||||
| 3 | `overflow-x: hidden` on html → creates scroll container → 15px scrollbar jitter on Linux | `peregrine.css`: `html { overflow-x: clip }` |
|
||||
| 4 | UnoCSS `presetAttributify` generates CSS for bare attribute names like `h2` | `uno.config.ts`: `presetAttributify({ prefix: 'un-', prefixedOnly: true })` |
|
||||
| 5 | Theme variable name mismatches cause dark mode to silently fall back to hardcoded colors | `peregrine.css` alias map: `--color-bg → var(--color-surface)`, `--color-text-secondary → var(--color-text-muted)` |
|
||||
| 7 | SPA cache: browser caches `index.html` indefinitely → old asset hashes → 404 on rebuild | FastAPI must register explicit `GET /` with no-cache headers before `StaticFiles` mount (see FastAPI section below) |
|
||||
| 9 | `navigator.vibrate()` not supported on desktop/Safari — throws on call | `useHaptics.ts` guards with `'vibrate' in navigator` |
|
||||
| 10 | Pinia options store = Vue 2 migration path | All stores use setup store form: `defineStore('id', () => { ... })` |
|
||||
| 12 | `matchMedia`, `vibrate`, `ResizeObserver` absent in jsdom → composable tests throw | `test-setup.ts` stubs all three |
|
||||
| 13 | `100vh` ignores mobile browser chrome | `App.vue`: `min-height: 100dvh` |
|
||||
|
||||
### Must actively avoid when writing new components
|
||||
|
||||
| # | Gotcha | Rule |
|
||||
|---|--------|------|
|
||||
| 2 | `transition: all` + spring easing → every CSS property bounces → layout explosion | Always enumerate: `transition: background 200ms ease, transform 250ms cubic-bezier(...)` |
|
||||
| 6 | Keyboard composables called with snapshot arrays → keys don't work after async data loads | Accept `getLabels: () => labels.value` (reactive getter), not `labels: []` (snapshot) |
|
||||
| 8 | Font reflow at ~780ms shifts layout measurements taken in `onMounted` | Measure layout in `document.fonts.ready` promise or after 1s timeout |
|
||||
| 11 | `useSwipe` from `@vueuse/core` fires on desktop trackpad pointer events, not just touch | Add `pointer-type === 'touch'` guard if you need touch-only behavior |
|
||||
| 14 | Rebuild workflow confusion | `cd web && npm run build` → refresh browser. Only restart FastAPI if `app/api.py` changed. |
|
||||
| 15 | `:global(ancestor) .descendant` in `<style scoped>` → Vue drops the descendant entirely | Never use `:global(X) .Y` in scoped CSS. Use JS gate or CSS custom property token. |
|
||||
|
||||
---
|
||||
|
||||
## FastAPI Integration
|
||||
|
||||
### SPA serving (gotcha #7)
|
||||
|
||||
When the Vue SPA is built, FastAPI needs to serve it. Register the explicit `/` route **before** the `StaticFiles` mount, otherwise `index.html` gets cached and old asset hashes cause 404s after rebuild:
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
_DIST = Path(__file__).parent.parent / "web" / "dist"
|
||||
_NO_CACHE = {
|
||||
"Cache-Control": "no-cache, no-store, must-revalidate",
|
||||
"Pragma": "no-cache",
|
||||
}
|
||||
|
||||
@app.get("/")
|
||||
def spa_root():
|
||||
return FileResponse(_DIST / "index.html", headers=_NO_CACHE)
|
||||
|
||||
# Must come after the explicit route above
|
||||
app.mount("/", StaticFiles(directory=str(_DIST), html=True), name="spa")
|
||||
```
|
||||
|
||||
Hashed assets (`/assets/index-abc123.js`) can be cached aggressively — their filenames change with content. Only `index.html` needs no-cache.
|
||||
|
||||
### API prefix
|
||||
|
||||
Vue Router uses HTML5 history mode. All `/api/*` routes must be registered on FastAPI before the `StaticFiles` mount. Vue routes (`/`, `/review`, `/apply`, etc.) are handled client-side; FastAPI's `html=True` on `StaticFiles` serves `index.html` for any unmatched path.
|
||||
|
||||
---
|
||||
|
||||
## Peregrine-Specific Considerations
|
||||
|
||||
### Auth & license gating
|
||||
|
||||
The Streamlit UI uses `app/wizard/tiers.py` for tier gating. In the Vue SPA, tier state should be fetched from a `GET /api/license/status` endpoint on mount and stored in a Pinia store. Components check `licenseStore.tier` to gate features.
|
||||
|
||||
### Discovery trigger
|
||||
|
||||
The "Start Discovery" button on Home triggers `python scripts/discover.py` as a background process. The Vue version should use SSE (same pattern as avocet's finetune SSE) to stream progress back in real-time. The `useApiSSE` composable is already wired for this.
|
||||
|
||||
### Job Review — card stack UX
|
||||
|
||||
This is the daily-driver view. Consider the avocet ASMR bucket pattern here — approve/reject could transform into buckets on drag pickup. The motion tokens (`--transition-spring`, `--transition-dismiss`) are pre-defined in `peregrine.css`. The `useHaptics` composable is ready.
|
||||
|
||||
### Kanban (Interviews view)
|
||||
|
||||
The drag-to-column kanban is a strong candidate for `@vueuse/core`'s `useDraggable`. Watch for the `useSwipe` gotcha #11 — use pointer-type guards if drag behavior differs between touch and mouse.
|
||||
|
||||
### Settings — 6 tabs
|
||||
|
||||
Use a tab component with reactive route query params (`/settings?tab=license`) so direct links work and the page is shareable/bookmarkable.
|
||||
|
||||
---
|
||||
|
||||
## Build & Dev Workflow
|
||||
|
||||
```bash
|
||||
# From worktree root
|
||||
cd web
|
||||
npm install # first time only
|
||||
npm run dev # Vite dev server at :5173 (proxies /api/* to FastAPI at :8502)
|
||||
npm run build # output to web/dist/
|
||||
npm run test # Vitest unit tests
|
||||
```
|
||||
|
||||
FastAPI serves the built `dist/` on the main port. During dev, configure Vite to proxy `/api` to the running FastAPI:
|
||||
|
||||
```ts
|
||||
// vite.config.ts addition for dev proxy
|
||||
server: {
|
||||
proxy: {
|
||||
'/api': 'http://localhost:8502',
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
After `npm run build`, just refresh the browser — no FastAPI restart needed unless `app/api.py` changed (gotcha #14).
|
||||
|
||||
---
|
||||
|
||||
## Implementation Order
|
||||
|
||||
Suggested sequence — validate the full stack before porting complex pages:
|
||||
|
||||
1. **FastAPI SPA endpoint** — serve `web/dist/` with correct cache headers
|
||||
2. **App shell** — nav, routing, hacker mode, motion toggle work end-to-end
|
||||
3. **Home view** — dashboard widgets, discovery trigger with SSE progress
|
||||
4. **Job Review** — most-used view; gets the most polish
|
||||
5. **Settings** — license tab is the blocker for tier gating in other views
|
||||
6. **Apply Workspace** — cover letter gen + PDF export
|
||||
7. **Interviews kanban** — drag-to-column + calendar sync
|
||||
8. **Interview Prep** — reference sheet, practice Q&A
|
||||
9. **Survey Assistant** — screenshot + text paste
|
||||
|
||||
---
|
||||
|
||||
## Checklist
|
||||
|
||||
Copy of the avocet gotchas checklist (all pre-applied at scaffold level are checked):
|
||||
|
||||
- [x] App.vue root element: use `.app-root` class, NOT `id="app"`
|
||||
- [ ] No `transition: all` with spring easings — enumerate properties explicitly
|
||||
- [ ] No `:global(ancestor) .descendant` in scoped CSS — Vue drops the descendant
|
||||
- [x] `overflow-x: clip` on html, `overflow-x: hidden` on body
|
||||
- [x] UnoCSS `presetAttributify`: `prefixedOnly: true`
|
||||
- [x] Product CSS aliases: `--color-bg`, `--color-text-secondary` mapped in `peregrine.css`
|
||||
- [ ] Keyboard composables: accept reactive getters, not snapshot arrays
|
||||
- [x] FastAPI SPA serving pattern documented — apply when wiring FastAPI
|
||||
- [ ] Font reflow: measure layout after `document.fonts.ready` or 1s timeout
|
||||
- [x] Haptics: guard `navigator.vibrate` with feature detection
|
||||
- [x] Pinia: use setup store form (function syntax)
|
||||
- [x] Tests: mock matchMedia, vibrate, ResizeObserver in test-setup.ts
|
||||
- [x] `min-height: 100dvh` on full-height layout containers
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
name: job-seeker
|
||||
name: cf
|
||||
# Recreate: conda env create -f environment.yml
|
||||
# Update pinned snapshot: conda env export --no-builds > environment.yml
|
||||
channels:
|
||||
|
|
@ -48,6 +48,12 @@ dependencies:
|
|||
# ── Notion integration ────────────────────────────────────────────────────
|
||||
- notion-client>=3.0
|
||||
|
||||
# ── Calendar integrations ─────────────────────────────────────────────────
|
||||
- caldav>=1.3
|
||||
- icalendar>=5.0
|
||||
- google-api-python-client>=2.0
|
||||
- google-auth>=2.0
|
||||
|
||||
# ── Document handling ─────────────────────────────────────────────────────
|
||||
- pypdf
|
||||
- pdfminer-six
|
||||
|
|
|
|||
34
manage.sh
34
manage.sh
|
|
@ -32,7 +32,10 @@ usage() {
|
|||
echo -e " ${GREEN}logs [service]${NC} Tail logs (default: app)"
|
||||
echo -e " ${GREEN}update${NC} Pull latest images + rebuild app"
|
||||
echo -e " ${GREEN}preflight${NC} Check ports + resources; write .env"
|
||||
echo -e " ${GREEN}models${NC} Check ollama models in config; pull any missing"
|
||||
echo -e " ${GREEN}test${NC} Run test suite"
|
||||
echo -e " ${GREEN}e2e [mode]${NC} Run E2E tests (mode: demo|cloud|local, default: demo)"
|
||||
echo -e " Set E2E_HEADLESS=false to run headed via Xvfb"
|
||||
echo -e " ${GREEN}prepare-training${NC} Extract cover letters → training JSONL"
|
||||
echo -e " ${GREEN}finetune${NC} Run LoRA fine-tune (needs GPU profile)"
|
||||
echo -e " ${GREEN}clean${NC} Remove containers, images, volumes (DESTRUCTIVE)"
|
||||
|
|
@ -42,7 +45,10 @@ usage() {
|
|||
echo " remote API-only, no local inference (default)"
|
||||
echo " cpu Local Ollama inference on CPU"
|
||||
echo " single-gpu Ollama + Vision on GPU 0"
|
||||
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1"
|
||||
echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
|
||||
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
|
||||
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
|
||||
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
|
||||
echo ""
|
||||
echo " Examples:"
|
||||
echo " ./manage.sh start"
|
||||
|
|
@ -86,6 +92,12 @@ case "$CMD" in
|
|||
make preflight PROFILE="$PROFILE"
|
||||
;;
|
||||
|
||||
models)
|
||||
info "Checking ollama models..."
|
||||
conda run -n cf python scripts/preflight.py --models-only
|
||||
success "Model check complete."
|
||||
;;
|
||||
|
||||
start)
|
||||
info "Starting Peregrine (PROFILE=${PROFILE})..."
|
||||
make start PROFILE="$PROFILE"
|
||||
|
|
@ -128,7 +140,7 @@ case "$CMD" in
|
|||
&& echo "docker compose" \
|
||||
|| (command -v podman >/dev/null 2>&1 && echo "podman compose" || echo "podman-compose"))"
|
||||
$COMPOSE pull searxng ollama 2>/dev/null || true
|
||||
$COMPOSE build app
|
||||
$COMPOSE build app web
|
||||
success "Update complete. Run './manage.sh restart' to apply."
|
||||
;;
|
||||
|
||||
|
|
@ -167,6 +179,24 @@ case "$CMD" in
|
|||
fi
|
||||
;;
|
||||
|
||||
e2e)
|
||||
MODE="${2:-demo}"
|
||||
RESULTS_DIR="tests/e2e/results/${MODE}"
|
||||
mkdir -p "${RESULTS_DIR}"
|
||||
HEADLESS="${E2E_HEADLESS:-true}"
|
||||
if [ "$HEADLESS" = "false" ]; then
|
||||
RUNNER="xvfb-run --auto-servernum --server-args='-screen 0 1280x900x24'"
|
||||
else
|
||||
RUNNER=""
|
||||
fi
|
||||
info "Running E2E tests (mode=${MODE}, headless=${HEADLESS})..."
|
||||
$RUNNER conda run -n cf pytest tests/e2e/ \
|
||||
--mode="${MODE}" \
|
||||
--json-report \
|
||||
--json-report-file="${RESULTS_DIR}/report.json" \
|
||||
-v "${@:3}"
|
||||
;;
|
||||
|
||||
help|--help|-h)
|
||||
usage
|
||||
;;
|
||||
|
|
|
|||
97
migrations/001_baseline.sql
Normal file
97
migrations/001_baseline.sql
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
-- Migration 001: Baseline schema
|
||||
-- Captures the full schema as of v0.8.5 (all columns including those added via ALTER TABLE)
|
||||
|
||||
CREATE TABLE IF NOT EXISTS jobs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
title TEXT,
|
||||
company TEXT,
|
||||
url TEXT UNIQUE,
|
||||
source TEXT,
|
||||
location TEXT,
|
||||
is_remote INTEGER DEFAULT 0,
|
||||
salary TEXT,
|
||||
description TEXT,
|
||||
match_score REAL,
|
||||
keyword_gaps TEXT,
|
||||
date_found TEXT,
|
||||
status TEXT DEFAULT 'pending',
|
||||
notion_page_id TEXT,
|
||||
cover_letter TEXT,
|
||||
applied_at TEXT,
|
||||
interview_date TEXT,
|
||||
rejection_stage TEXT,
|
||||
phone_screen_at TEXT,
|
||||
interviewing_at TEXT,
|
||||
offer_at TEXT,
|
||||
hired_at TEXT,
|
||||
survey_at TEXT,
|
||||
calendar_event_id TEXT,
|
||||
optimized_resume TEXT,
|
||||
ats_gap_report TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS job_contacts (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_id INTEGER,
|
||||
direction TEXT,
|
||||
subject TEXT,
|
||||
from_addr TEXT,
|
||||
to_addr TEXT,
|
||||
body TEXT,
|
||||
received_at TEXT,
|
||||
is_response_needed INTEGER DEFAULT 0,
|
||||
responded_at TEXT,
|
||||
message_id TEXT,
|
||||
stage_signal TEXT,
|
||||
suggestion_dismissed INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS company_research (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_id INTEGER UNIQUE,
|
||||
generated_at TEXT,
|
||||
company_brief TEXT,
|
||||
ceo_brief TEXT,
|
||||
talking_points TEXT,
|
||||
raw_output TEXT,
|
||||
tech_brief TEXT,
|
||||
funding_brief TEXT,
|
||||
competitors_brief TEXT,
|
||||
red_flags TEXT,
|
||||
scrape_used INTEGER DEFAULT 0,
|
||||
accessibility_brief TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS background_tasks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
task_type TEXT,
|
||||
job_id INTEGER,
|
||||
params TEXT,
|
||||
status TEXT DEFAULT 'pending',
|
||||
error TEXT,
|
||||
created_at TEXT,
|
||||
started_at TEXT,
|
||||
finished_at TEXT,
|
||||
stage TEXT,
|
||||
updated_at TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS survey_responses (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_id INTEGER,
|
||||
survey_name TEXT,
|
||||
received_at TEXT,
|
||||
source TEXT,
|
||||
raw_input TEXT,
|
||||
image_path TEXT,
|
||||
mode TEXT,
|
||||
llm_output TEXT,
|
||||
reported_score REAL,
|
||||
created_at TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS digest_queue (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_contact_id INTEGER UNIQUE,
|
||||
created_at TEXT
|
||||
);
|
||||
13
mkdocs.yml
13
mkdocs.yml
|
|
@ -1,9 +1,9 @@
|
|||
site_name: Peregrine
|
||||
site_description: AI-powered job search pipeline
|
||||
site_author: Circuit Forge LLC
|
||||
site_url: https://docs.circuitforge.io/peregrine
|
||||
repo_url: https://git.circuitforge.io/circuitforge/peregrine
|
||||
repo_name: circuitforge/peregrine
|
||||
site_url: https://docs.circuitforge.tech/peregrine
|
||||
repo_url: https://git.opensourcesolarpunk.com/pyr0ball/peregrine
|
||||
repo_name: pyr0ball/peregrine
|
||||
|
||||
theme:
|
||||
name: material
|
||||
|
|
@ -32,7 +32,11 @@ theme:
|
|||
markdown_extensions:
|
||||
- admonition
|
||||
- pymdownx.details
|
||||
- pymdownx.superfences
|
||||
- pymdownx.superfences:
|
||||
custom_fences:
|
||||
- name: mermaid
|
||||
class: mermaid
|
||||
format: !!python/name:pymdownx.superfences.fence_code_format
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
- pymdownx.tabbed:
|
||||
|
|
@ -58,6 +62,7 @@ nav:
|
|||
- Developer Guide:
|
||||
- Contributing: developer-guide/contributing.md
|
||||
- Architecture: developer-guide/architecture.md
|
||||
- Cloud Deployment: developer-guide/cloud-deployment.md
|
||||
- Adding a Scraper: developer-guide/adding-scrapers.md
|
||||
- Adding an Integration: developer-guide/adding-integrations.md
|
||||
- Testing: developer-guide/testing.md
|
||||
|
|
|
|||
|
|
@ -2,6 +2,15 @@
|
|||
# Extracted from environment.yml for Docker pip installs
|
||||
# Keep in sync with environment.yml
|
||||
|
||||
# ── CircuitForge shared core ───────────────────────────────────────────────
|
||||
# Requires circuitforge-core >= 0.8.0 (config.load_env, db, tasks; resources moved to circuitforge-orch).
|
||||
# Local dev / Docker (parent-context build): path install works because
|
||||
# circuitforge-core/ is a sibling directory.
|
||||
# CI / fresh checkouts: falls back to the Forgejo VCS URL below.
|
||||
# To use local editable install run: pip install -e ../circuitforge-core
|
||||
# TODO: pin to @v0.8.0 tag once cf-core cuts a release tag.
|
||||
git+https://git.opensourcesolarpunk.com/Circuit-Forge/circuitforge-core.git@main
|
||||
|
||||
# ── Web UI ────────────────────────────────────────────────────────────────
|
||||
streamlit>=1.35
|
||||
watchdog
|
||||
|
|
@ -12,7 +21,9 @@ streamlit-paste-button>=0.1.0
|
|||
|
||||
# ── Job scraping ──────────────────────────────────────────────────────────
|
||||
python-jobspy>=1.1
|
||||
playwright
|
||||
playwright>=1.40
|
||||
pytest-playwright>=0.4
|
||||
pytest-json-report>=1.5
|
||||
selenium
|
||||
undetected-chromedriver
|
||||
webdriver-manager
|
||||
|
|
@ -37,7 +48,8 @@ tiktoken
|
|||
# ── Resume matching ───────────────────────────────────────────────────────
|
||||
scikit-learn>=1.3
|
||||
rapidfuzz
|
||||
lib-resume-builder-aihawk
|
||||
# lib-resume-builder-aihawk intentionally excluded — pulls torch+CUDA (~7GB).
|
||||
# AIHawk runs in its own conda env (aihawk-env) outside the Docker container.
|
||||
|
||||
# ── Notion integration ────────────────────────────────────────────────────
|
||||
notion-client>=3.0
|
||||
|
|
@ -53,6 +65,10 @@ python-dotenv
|
|||
|
||||
# ── Auth / licensing ──────────────────────────────────────────────────────
|
||||
PyJWT>=2.8
|
||||
pysqlcipher3
|
||||
|
||||
# ── Cloud / telemetry ─────────────────────────────────────────────────────────
|
||||
psycopg2-binary
|
||||
|
||||
# ── Utilities ─────────────────────────────────────────────────────────────
|
||||
sqlalchemy
|
||||
|
|
@ -71,3 +87,10 @@ lxml
|
|||
# ── Documentation ────────────────────────────────────────────────────────
|
||||
mkdocs>=1.5
|
||||
mkdocs-material>=9.5
|
||||
|
||||
# ── Vue SPA API backend ──────────────────────────────────────────────────
|
||||
fastapi>=0.100.0
|
||||
uvicorn[standard]>=0.20.0
|
||||
PyJWT>=2.8.0
|
||||
cryptography>=40.0.0
|
||||
python-multipart>=0.0.6
|
||||
|
|
|
|||
364
scripts/backup.py
Normal file
364
scripts/backup.py
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
"""Config backup / restore / teleport for Peregrine.
|
||||
|
||||
Creates a portable zip of all gitignored configs + optionally the staging DB.
|
||||
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
|
||||
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
|
||||
|
||||
Cloud mode notes
|
||||
----------------
|
||||
In cloud mode (CLOUD_MODE=true), the staging DB is SQLCipher-encrypted.
|
||||
Pass the per-user ``db_key`` to ``create_backup()`` to have it transparently
|
||||
decrypt the DB before archiving — producing a portable, plain SQLite file
|
||||
that works with any local Docker install.
|
||||
|
||||
Pass the same ``db_key`` to ``restore_backup()`` and it will re-encrypt the
|
||||
plain DB on its way in, so the cloud app can open it normally.
|
||||
|
||||
Usage (CLI):
|
||||
    conda run -n cf python scripts/backup.py --create backup.zip
|
||||
    conda run -n cf python scripts/backup.py --create backup.zip --no-db
|
||||
    conda run -n cf python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
|
||||
    conda run -n cf python scripts/backup.py --restore backup.zip
|
||||
    conda run -n cf python scripts/backup.py --list backup.zip
|
||||
|
||||
Usage (programmatic — called from Settings UI):
|
||||
from scripts.backup import create_backup, restore_backup, list_backup_contents
|
||||
zip_bytes = create_backup(base_dir, include_db=True)
|
||||
info = list_backup_contents(zip_bytes)
|
||||
result = restore_backup(zip_bytes, base_dir, include_db=True)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Files included in every backup (relative to repo root)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Gitignored config files that hold secrets / personal data
|
||||
_SECRET_CONFIGS = [
|
||||
"config/notion.yaml",
|
||||
"config/tokens.yaml",
|
||||
"config/email.yaml",
|
||||
"config/adzuna.yaml",
|
||||
"config/craigslist.yaml",
|
||||
"config/user.yaml",
|
||||
"config/plain_text_resume.yaml",
|
||||
"config/license.json",
|
||||
"config/user.yaml.working",
|
||||
]
|
||||
|
||||
# Gitignored integration configs (glob pattern — each matching file is added)
|
||||
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"
|
||||
|
||||
# Non-secret committed configs worth preserving for portability
|
||||
# (also present in the legacy /devl/job-seeker instance)
|
||||
_EXTRA_CONFIGS = [
|
||||
"config/llm.yaml",
|
||||
"config/search_profiles.yaml",
|
||||
"config/resume_keywords.yaml", # personal keyword list — present in both instances
|
||||
"config/skills_suggestions.yaml",
|
||||
"config/blocklist.yaml",
|
||||
"config/server.yaml", # deployment config (base URL path, port) — Peregrine only
|
||||
]
|
||||
|
||||
# Candidate DB paths (first one that exists wins)
|
||||
_DB_CANDIDATES = ["data/staging.db", "staging.db"]
|
||||
|
||||
_MANIFEST_NAME = "backup-manifest.json"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQLCipher helpers (cloud mode only — only called when db_key is set)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _decrypt_db_to_bytes(db_path: Path, db_key: str) -> bytes:
    """Open a SQLCipher-encrypted DB and return plain SQLite bytes.

    Uses SQLCipher's ATTACH + sqlcipher_export() to produce a portable
    unencrypted copy. Only called in cloud mode (db_key non-empty).
    pysqlcipher3 is available in the Docker image (Dockerfile installs
    libsqlcipher-dev); never called in local-mode tests.

    Args:
        db_path: Path to the SQLCipher-encrypted database file.
        db_key: Per-user SQLCipher key.

    Returns:
        The full plain-SQLite file contents as bytes.
    """
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
        tmp_path = tmp.name
    # PRAGMA key / ATTACH cannot take bound parameters, so the key and path
    # must be embedded in SQL text. Double up single quotes so a key (or
    # path) containing ' cannot break out of the SQL string literal.
    safe_key = db_key.replace("'", "''")
    safe_path = tmp_path.replace("'", "''")
    try:
        from pysqlcipher3 import dbapi2 as _sqlcipher  # type: ignore[import]
        conn = _sqlcipher.connect(str(db_path))
        try:
            conn.execute(f"PRAGMA key='{safe_key}'")
            conn.execute(f"ATTACH DATABASE '{safe_path}' AS plaintext KEY ''")
            conn.execute("SELECT sqlcipher_export('plaintext')")
            conn.execute("DETACH DATABASE plaintext")
        finally:
            # Always close, even if an execute fails, so the handle isn't leaked.
            conn.close()
        return Path(tmp_path).read_bytes()
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def _encrypt_db_from_bytes(plain_bytes: bytes, dest_path: Path, db_key: str) -> None:
    """Write plain SQLite bytes as a SQLCipher-encrypted DB at dest_path.

    Used on restore in cloud mode to convert a portable plain backup into
    the per-user encrypted format the app expects.

    Args:
        plain_bytes: Raw contents of an unencrypted SQLite database.
        dest_path: Where to write the encrypted database (parents created).
        db_key: Per-user SQLCipher key for the destination.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
        tmp.write(plain_bytes)
        tmp_path = tmp.name
    # PRAGMA key / ATTACH cannot take bound parameters; escape embedded single
    # quotes so the key or destination path cannot break the SQL literal.
    safe_key = db_key.replace("'", "''")
    safe_dest = str(dest_path).replace("'", "''")
    try:
        from pysqlcipher3 import dbapi2 as _sqlcipher  # type: ignore[import]
        # Open the plain DB (empty key = no encryption in SQLCipher)
        conn = _sqlcipher.connect(tmp_path)
        try:
            conn.execute("PRAGMA key=''")
            # Attach the encrypted destination and export there
            conn.execute(f"ATTACH DATABASE '{safe_dest}' AS encrypted KEY '{safe_key}'")
            conn.execute("SELECT sqlcipher_export('encrypted')")
            conn.execute("DETACH DATABASE encrypted")
        finally:
            # Always close, even if an execute fails, so the handle isn't leaked.
            conn.close()
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Source detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_source_label(base_dir: Path) -> str:
|
||||
"""Return a human-readable label for the instance being backed up.
|
||||
|
||||
Uses the directory name — stable as long as the repo root isn't renamed,
|
||||
which is the normal case for both the Docker install (peregrine/) and the
|
||||
legacy Conda install (job-seeker/).
|
||||
|
||||
Args:
|
||||
base_dir: The root directory being backed up.
|
||||
|
||||
Returns:
|
||||
A short identifier string, e.g. "peregrine" or "job-seeker".
|
||||
"""
|
||||
return base_dir.name
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
    db_key: str = "",
) -> bytes:
    """Return a zip archive as raw bytes.

    Args:
        base_dir: Repo root (parent of config/ and staging.db).
        include_db: If True, include staging.db in the archive.
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected if None.
        db_key: SQLCipher key for the DB (cloud mode). When set, the DB
            is decrypted before archiving so the backup is portable
            to any local Docker install.
    """
    archive = io.BytesIO()
    archived: list[str] = []

    with zipfile.ZipFile(archive, "w", compression=zipfile.ZIP_DEFLATED) as zf:

        def _add_file(path: Path, arcname: str) -> None:
            # Store one on-disk file and record it for the manifest.
            zf.write(path, arcname)
            archived.append(arcname)

        # Gitignored secret configs (only those that exist on disk)
        for rel in _SECRET_CONFIGS:
            src = base_dir / rel
            if src.exists():
                _add_file(src, rel)

        # Integration configs (glob), in deterministic sorted order
        for src in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            _add_file(src, str(src.relative_to(base_dir)))

        # Extra non-secret configs
        for rel in _EXTRA_CONFIGS:
            src = base_dir / rel
            if src.exists():
                _add_file(src, rel)

        # Staging DB — first existing candidate path wins
        if include_db:
            db_rel = next(
                (c for c in _DB_CANDIDATES if (base_dir / c).exists()), None
            )
            if db_rel is not None:
                db_src = base_dir / db_rel
                if db_key:
                    # Cloud mode: decrypt to plain SQLite before archiving
                    zf.writestr(db_rel, _decrypt_db_to_bytes(db_src, db_key))
                    archived.append(db_rel)
                else:
                    _add_file(db_src, db_rel)

        # Manifest describing what was captured and where it came from
        manifest = {
            "created_at": datetime.now().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": archived,
            "includes_db": include_db and any(f.endswith(".db") for f in archived),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))

    return archive.getvalue()
|
||||
|
||||
|
||||
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Summarize a backup zip without extracting anything.

    Returns a dict with the parsed manifest (empty dict when absent), the
    archived file names (manifest entry excluded), per-file uncompressed
    sizes, and the total uncompressed byte count of the listed files.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        all_names = zf.namelist()
        sizes = {entry.filename: entry.file_size for entry in zf.infolist()}
        manifest: dict = (
            json.loads(zf.read(_MANIFEST_NAME))
            if _MANIFEST_NAME in all_names
            else {}
        )
    files = [name for name in all_names if name != _MANIFEST_NAME]
    return {
        "manifest": manifest,
        "files": files,
        "sizes": sizes,
        "total_bytes": sum(sizes[name] for name in files if name in sizes),
    }
|
||||
|
||||
|
||||
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
    db_key: str = "",
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.
        db_key: SQLCipher key (cloud mode). When set, any .db file in the
            zip (plain SQLite) is re-encrypted on the way in so the
            cloud app can open it normally.

    Returns:
        {"restored": [...], "skipped": [...]}
        Entries whose names would escape base_dir (path traversal /
        absolute paths) are never written and are reported as skipped.
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            if name == _MANIFEST_NAME:
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip hardening: a crafted archive entry like "../../x" or an
            # absolute path must not write outside base_dir. Resolve and check
            # containment before touching the filesystem.
            try:
                is_inside = dest.resolve().is_relative_to(root)
            except OSError:
                is_inside = False
            if not is_inside:
                skipped.append(name)
                continue
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            raw = zf.read(name)
            if db_key and name.endswith(".db"):
                # Cloud mode: the zip contains plain SQLite — re-encrypt on restore
                _encrypt_db_from_bytes(raw, dest, db_key)
            else:
                dest.write_bytes(raw)
            restored.append(name)

    return {"restored": restored, "skipped": skipped}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: dispatch --create / --restore / --list actions.

    argparse/sys are imported locally so importing this module from the
    Settings UI (programmatic use) stays side-effect free.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport")
    # Exactly one of the three action flags is required.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()

    # Default target is this repo's root (scripts/ -> parent) unless overridden.
    base_dir = Path(args.base_dir).resolve() if args.base_dir else Path(__file__).parent.parent

    if args.create:
        out = Path(args.create)
        data = create_backup(base_dir, include_db=not args.no_db)
        out.write_bytes(data)
        # Re-read the archive we just built to echo a summary to the user.
        info = list_backup_contents(data)
        m = info["manifest"]
        print(f"Backup created: {out} ({len(data):,} bytes)")
        print(f" Source: {m.get('source', '?')} ({base_dir})")
        print(f" {len(info['files'])} files archived:")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f" {name} ({size:,} bytes)")

    elif args.restore:
        in_path = Path(args.restore)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        # NOTE(review): db_key is not exposed on the CLI — cloud-mode
        # re-encryption is only reachable programmatically; confirm intended.
        result = restore_backup(data, base_dir,
                                include_db=not args.no_db,
                                overwrite=not args.no_overwrite)
        print(f"Restored {len(result['restored'])} files:")
        for name in result["restored"]:
            print(f" ✓ {name}")
        if result["skipped"]:
            print(f"Skipped {len(result['skipped'])} files:")
            for name in result["skipped"]:
                print(f" - {name}")

    elif args.list:
        in_path = Path(args.list)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        info = list_backup_contents(data)
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source: {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB: {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f" {name} ({size:,} bytes)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
347
scripts/benchmark_classifier.py
Normal file
347
scripts/benchmark_classifier.py
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Email classifier benchmark — compare HuggingFace models against our 6 labels.
|
||||
|
||||
Usage:
|
||||
# List available models
|
||||
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --list-models
|
||||
|
||||
# Score against labeled JSONL
|
||||
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score
|
||||
|
||||
# Visual comparison on live IMAP emails
|
||||
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --compare --limit 20
|
||||
|
||||
# Include slow/large models
|
||||
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score --include-slow
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import email as _email_lib
|
||||
import imaplib
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.classifier_adapters import (
|
||||
LABELS,
|
||||
LABEL_DESCRIPTIONS,
|
||||
ClassifierAdapter,
|
||||
GLiClassAdapter,
|
||||
RerankerAdapter,
|
||||
ZeroShotAdapter,
|
||||
compute_metrics,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
MODEL_REGISTRY: dict[str, dict[str, Any]] = {
|
||||
"deberta-zeroshot": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0",
|
||||
"params": "400M",
|
||||
"default": True,
|
||||
},
|
||||
"deberta-small": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "cross-encoder/nli-deberta-v3-small",
|
||||
"params": "100M",
|
||||
"default": True,
|
||||
},
|
||||
"gliclass-large": {
|
||||
"adapter": GLiClassAdapter,
|
||||
"model_id": "knowledgator/gliclass-instruct-large-v1.0",
|
||||
"params": "400M",
|
||||
"default": True,
|
||||
},
|
||||
"bart-mnli": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "facebook/bart-large-mnli",
|
||||
"params": "400M",
|
||||
"default": True,
|
||||
},
|
||||
"bge-m3-zeroshot": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
||||
"params": "600M",
|
||||
"default": True,
|
||||
},
|
||||
"bge-reranker": {
|
||||
"adapter": RerankerAdapter,
|
||||
"model_id": "BAAI/bge-reranker-v2-m3",
|
||||
"params": "600M",
|
||||
"default": False,
|
||||
},
|
||||
"deberta-xlarge": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "microsoft/deberta-xlarge-mnli",
|
||||
"params": "750M",
|
||||
"default": False,
|
||||
},
|
||||
"mdeberta-mnli": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
|
||||
"params": "300M",
|
||||
"default": False,
|
||||
},
|
||||
"xlm-roberta-anli": {
|
||||
"adapter": ZeroShotAdapter,
|
||||
"model_id": "vicgalle/xlm-roberta-large-xnli-anli",
|
||||
"params": "600M",
|
||||
"default": False,
|
||||
},
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
    """Read labeled benchmark examples from a JSONL file.

    Blank lines are ignored; every non-empty line must be a JSON object.

    Args:
        path: Location of the labeled JSONL file.

    Returns:
        Parsed rows in file order.

    Raises:
        FileNotFoundError: If the file is missing, with setup guidance.
    """
    score_path = Path(path)
    if not score_path.exists():
        raise FileNotFoundError(
            f"Scoring file not found: {path}\n"
            f"Copy data/email_score.jsonl.example → data/email_score.jsonl and label your emails."
        )
    with score_path.open() as fh:
        return [
            json.loads(stripped)
            for line in fh
            if (stripped := line.strip())
        ]
|
||||
|
||||
|
||||
def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
    """Select registry entries to benchmark: defaults, plus slow models on request."""
    selected: dict[str, dict[str, Any]] = {}
    for name, entry in MODEL_REGISTRY.items():
        if include_slow or entry["default"]:
            selected[name] = entry
    return selected
|
||||
|
||||
|
||||
def run_scoring(
    adapters: list[ClassifierAdapter],
    score_file: str,
) -> dict[str, Any]:
    """Benchmark each adapter against a labeled JSONL file.

    For each adapter: classify every example, compute metrics against the
    gold labels, record mean per-email latency, then unload the model so
    the next adapter starts with freed resources.

    Args:
        adapters: Instantiated classifier adapters to evaluate.
        score_file: Path to the labeled JSONL (subject/body/label fields).

    Returns:
        Mapping of adapter name -> metrics dict (including "latency_ms").
    """
    examples = load_scoring_jsonl(score_file)
    gold_labels = [ex["label"] for ex in examples]
    results: dict[str, Any] = {}

    for adapter in adapters:
        predictions: list[str] = []
        start = time.monotonic()
        for row in examples:
            try:
                label = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                # Keep the benchmark running on per-email failures; the
                # fallback prediction simply counts against the model.
                print(f" [{adapter.name}] ERROR on '{row['subject'][:40]}': {exc}", flush=True)
                label = "neutral"
            predictions.append(label)
        elapsed_ms = (time.monotonic() - start) * 1000
        scores = compute_metrics(predictions, gold_labels, LABELS)
        scores["latency_ms"] = round(elapsed_ms / len(examples), 1)
        results[adapter.name] = scores
        adapter.unload()

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# IMAP helpers (stdlib only — no imap_sync dependency)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_BROAD_TERMS = [
|
||||
"interview", "opportunity", "offer letter",
|
||||
"job offer", "application", "recruiting",
|
||||
]
|
||||
|
||||
|
||||
def _load_imap_config() -> dict[str, Any]:
    """Load IMAP settings from config/email.yaml at the repo root.

    Keys consumed by _imap_connect: host, username, password, and
    optionally port (defaults to 993 there).
    """
    import yaml
    cfg_path = Path(__file__).parent.parent / "config" / "email.yaml"
    with cfg_path.open() as f:
        return yaml.safe_load(f)
|
||||
|
||||
|
||||
def _imap_connect(cfg: dict[str, Any]) -> imaplib.IMAP4_SSL:
    """Open an authenticated IMAP-over-SSL connection.

    Uses port 993 when the config omits one. The caller is responsible
    for logging out / closing the connection.
    """
    conn = imaplib.IMAP4_SSL(cfg["host"], cfg.get("port", 993))
    conn.login(cfg["username"], cfg["password"])
    return conn
|
||||
|
||||
|
||||
def _decode_part(part: Any) -> str:
|
||||
charset = part.get_content_charset() or "utf-8"
|
||||
try:
|
||||
return part.get_payload(decode=True).decode(charset, errors="replace")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _parse_uid(conn: imaplib.IMAP4_SSL, uid: bytes) -> dict[str, str] | None:
    """Fetch one message by UID and reduce it to {"subject", "body"}.

    For multipart messages the first text/plain part wins. Returns None on
    any fetch/parse failure so callers can simply skip bad messages.
    """
    try:
        _, data = conn.uid("fetch", uid, "(RFC822)")
        msg = _email_lib.message_from_bytes(data[0][1])
        body = ""
        if not msg.is_multipart():
            body = _decode_part(msg)
        else:
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    body = _decode_part(part)
                    break
        return {"subject": str(msg.get("subject", "")).strip(), "body": body}
    except Exception:
        return None
|
||||
|
||||
|
||||
def _fetch_imap_sample(limit: int, days: int) -> list[dict[str, str]]:
    """Fetch up to *limit* job-search-ish emails from INBOX over the last *days* days.

    Runs one subject search per term in ``_BROAD_TERMS``, de-duplicates UIDs
    (insertion order preserved via a dict-as-ordered-set), then parses at most
    *limit* messages into ``{"subject", "body"}`` dicts via ``_parse_uid``.
    The connection is logged out best-effort.
    """
    cfg = _load_imap_config()
    conn = _imap_connect(cfg)
    since = (datetime.now() - timedelta(days=days)).strftime("%d-%b-%Y")
    # readonly: a benchmark must never mutate the user's real mailbox —
    # fetching RFC822 from a read-write mailbox marks messages \Seen.
    conn.select("INBOX", readonly=True)

    seen_uids: dict[bytes, None] = {}  # ordered set of matching UIDs
    for term in _BROAD_TERMS:
        _, data = conn.uid("search", None, f'(SUBJECT "{term}" SINCE {since})')
        for uid in (data[0] or b"").split():
            seen_uids[uid] = None

    emails: list[dict[str, str]] = []
    for uid in list(seen_uids)[:limit]:
        parsed = _parse_uid(conn, uid)
        if parsed:
            emails.append(parsed)

    try:
        conn.logout()
    except Exception:
        pass  # best-effort teardown; nothing useful to do if logout fails
    return emails
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subcommands
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the model registry as an aligned table (name, size, default flag, adapter, HF id)."""
    print(f"\n{'Name':<20} {'Params':<8} {'Default':<20} {'Adapter':<15} Model ID")
    print("-" * 100)
    for name, entry in MODEL_REGISTRY.items():
        flag = "yes" if entry["default"] else "(--include-slow)"
        print(
            f"{name:<20} {entry['params']:<8} {flag:<20} "
            f"{entry['adapter'].__name__:<15} {entry['model_id']}"
        )
    print()
|
||||
|
||||
|
||||
def cmd_score(args: argparse.Namespace) -> None:
    """Score the selected models against a labeled JSONL file and print result tables."""
    active = _active_models(args.include_slow)
    if args.models:
        # Explicit --models list overrides the default/slow selection.
        active = {name: entry for name, entry in active.items() if name in args.models}

    adapters = []
    for name, entry in active.items():
        adapters.append(entry["adapter"](name, entry["model_id"]))

    print(f"\nScoring {len(adapters)} model(s) against {args.score_file} …\n")
    results = run_scoring(adapters, args.score_file)

    # Summary table: one row per model with aggregate metrics.
    col = 12
    print("{:<22}{:>12} {:>12} {:>12}".format("Model", "macro-F1", "Accuracy", "ms/email"))
    print("-" * (22 + col * 3 + 2))
    for name, m in results.items():
        print(
            "{:<22}{:>12.3f}{:>12.3f}{:>12.1f}".format(
                name, m["__macro_f1__"], m["__accuracy__"], m["latency_ms"]
            )
        )

    # Per-label breakdown: one column per model, one row per label.
    print("\nPer-label F1:")
    names = list(results)
    print(f"{'Label':<25}" + "".join("{:>12}".format(n[:11]) for n in names))
    print("-" * (25 + col * len(names)))
    for label in LABELS:
        cells = "".join("{:>12.3f}".format(m[label]["f1"]) for m in results.values())
        print(f"{label:<25}" + cells)
    print()
|
||||
|
||||
|
||||
def cmd_compare(args: argparse.Namespace) -> None:
    """Classify live IMAP emails with each selected model, printing a side-by-side table.

    Rows are flushed as they complete so output streams while slow models run.
    Adapters are loaded up front and unloaded after the table is printed.
    """
    active = _active_models(args.include_slow)
    if args.models:
        active = {k: v for k, v in active.items() if k in args.models}

    print(f"Fetching up to {args.limit} emails from IMAP …")
    emails = _fetch_imap_sample(args.limit, args.days)
    print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")

    adapters = [
        entry["adapter"](name, entry["model_id"])
        for name, entry in active.items()
    ]
    model_names = [a.name for a in adapters]

    col = 22
    subj_w = 50
    print(f"{'Subject':<{subj_w}}" + "".join(f"{n:<{col}}" for n in model_names))
    print("-" * (subj_w + col * len(model_names)))

    for row in emails:
        subj = row["subject"]
        # Truncate at >= subj_w (not just >): a subject of exactly subj_w chars
        # would otherwise fill the field and abut the first model column.
        short_subj = subj[:subj_w - 1] if len(subj) >= subj_w else subj
        line = f"{short_subj:<{subj_w}}"
        for adapter in adapters:
            try:
                label = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                # Show the failure inline and keep the table going.
                label = f"ERR:{str(exc)[:8]}"
            line += f"{label:<{col}}"
        print(line, flush=True)

    for adapter in adapters:
        adapter.unload()
    print()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse flags and dispatch to the chosen subcommand.

    Exactly one of --list-models / --score / --compare runs (first match wins);
    with no flags, the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark HuggingFace email classifiers against our 6 labels."
    )
    add = parser.add_argument
    add("--list-models", action="store_true", help="Show model registry and exit")
    add("--score", action="store_true", help="Score against labeled JSONL")
    add("--compare", action="store_true", help="Visual table on live IMAP emails")
    add("--score-file", default="data/email_score.jsonl", help="Path to labeled JSONL")
    add("--limit", type=int, default=20, help="Max emails for --compare")
    add("--days", type=int, default=90, help="Days back for IMAP search")
    add("--include-slow", action="store_true", help="Include non-default heavy models")
    add("--models", nargs="+", help="Override: run only these model names")

    args = parser.parse_args()

    if args.list_models:
        cmd_list_models(args)
    elif args.score:
        cmd_score(args)
    elif args.compare:
        cmd_compare(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue