avocet/config/label_tool.yaml.example
pyr0ball 9bb88b168f feat(corpus): pipeline log ingest from shared dir (closes #67)
Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs
from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone
logreading model training.

- app/data/log_corpus.py: add ingested_pipeline_files tracking table,
  _pipeline_ingest_dir() config helper, _ingest_one_file() parser, and
  POST /api/corpus/pipeline-ingest endpoint
- source_host = "pipeline_scrape"; source_id from logger field; extra
  dict stored as matched_patterns; batch_type = "pipeline_log"
- Idempotent by filename: skips files already in ingested_pipeline_files
- config/label_tool.yaml.example: add corpus section with pipeline_ingest_dir
  and push sources comment block
- tests/test_log_corpus.py: 8 new tests covering ingest, idempotency,
  non-JSONL filtering, malformed line resilience, incremental runs
2026-05-17 11:28:33 -07:00

144 lines
6.4 KiB
Text

# config/label_tool.yaml — Multi-account IMAP config for the email label tool
# Copy to config/label_tool.yaml and fill in your credentials.
# This file is gitignored.
# IMAP accounts to fetch mail from — one list entry per mailbox.
# Each entry is a mapping; every key below must be indented under its "- name:" item.
accounts:
  - name: "Gmail"
    host: "imap.gmail.com"
    port: 993
    username: "you@gmail.com"
    password: "your-app-password"  # Use an App Password, not your login password
    folder: "INBOX"
    days_back: 90
  - name: "Outlook"
    host: "outlook.office365.com"
    port: 993
    username: "you@outlook.com"
    password: "your-app-password"
    folder: "INBOX"
    days_back: 90
# Optional: maximum number of emails fetched per account per run (0 = unlimited).
max_per_account: 500
# cf-orch SFT candidate import — path to the bench_results/ directory
# produced by circuitforge-orch's benchmark harness.
sft:
  bench_results_dir: /path/to/circuitforge-orch/scripts/bench_results
# cf-orch integration — LLM benchmark harness via cf-orch coordinator.
# All keys here override the corresponding environment variables.
# Omit any key to fall back to the env var (see .env.example).
cforch:
  # Path to cf-orch's benchmark.py script
  bench_script: /path/to/circuitforge-orch/scripts/benchmark.py
  # Task and model definition files (yaml)
  bench_tasks: /path/to/circuitforge-orch/scripts/bench_tasks.yaml
  bench_models: /path/to/circuitforge-orch/scripts/bench_models.yaml
  # Where benchmark results are written (also used for SFT candidate discovery)
  results_dir: /path/to/circuitforge-orch/scripts/bench_results
  # Python interpreter with cf-orch installed
  python_bin: /devl/miniconda3/envs/cf/bin/python
  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
  # coordinator_url: http://localhost:7700
  # license_key: CFG-AVCT-xxxx-xxxx-xxxx
  # ollama_url: http://localhost:11434
  # ollama_model: llama3.2:3b
  # embed_model: nomic-embed-text  # Ollama embedding model for EmbeddingKNNAdapter
  # Uncomment at most ONE judge_url — duplicate keys in a mapping are invalid YAML.
  # judge_url: http://10.1.10.158:8008  # Sif cf-text — LLM-as-judge secondary scorer
  # judge_url: http://10.1.10.71:8008  # Heimdall cf-text (alternative)
  # Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
  # hf_token: hf_xxxxxxxxxxxxxxxxxxxx  # HuggingFace token — required for gated/terms-restricted models
  # Directory containing per-node profile YAMLs (cf-orch node profiles).
  # Default: derived from bench_script location (../../profiles/nodes).
  # profiles_dir: /Library/Development/CircuitForge/circuitforge-orch/circuitforge_orch/profiles/nodes
# Imitate tab — pull real samples from sibling CF product APIs and run them
# through local LLMs to build a corrections dataset.
# ollama_url defaults to cforch.ollama_url if omitted here.
imitate:
  ollama_url: http://localhost:11434  # optional — falls back to cforch.ollama_url
  # One list entry per product. In each prompt_template, {text} is a literal
  # placeholder; the templates are double-quoted so "\n" is read as a newline.
  products:
    - id: peregrine
      name: Peregrine
      icon: "🦅"
      description: Job search assistant — live job listings
      base_url: http://localhost:8601
      health_path: /api/jobs/counts
      sample_endpoint: /api/jobs?status=pending&limit=5
      text_fields: [title, company, description]
      prompt_template: "Analyze this job listing and identify the key requirements, must-have skills, and any culture signals that would help tailor an application:\n\n{text}"
    - id: osprey
      name: Osprey
      icon: "📞"
      description: Gov't hold-line automation — recent call records
      base_url: http://localhost:8520
      health_path: /api/health
      sample_endpoint: /api/calls/recent
      text_fields: [agency, issue, notes]
      prompt_template: "Draft a clear, professional follow-up letter for this government hold-line call. Include what was discussed, what action the agency committed to, and a polite deadline for response:\n\n{text}"
    - id: linnet
      name: Linnet
      icon: "🐦"
      description: Real-time tone annotation — Elcor-style subtext for ND users
      base_url: http://localhost:8522
      health_path: /health
      sample_endpoint: /samples
      text_fields: [text, context]
      prompt_template: "Annotate the emotional tone and subtext of the following text using explicit Elcor-style markers (e.g. [SINCERELY], [UNCERTAIN], [FRUSTRATED]). Identify implied emotions, potential sarcasm, and any ambiguity that might be misread by neurodivergent readers:\n\n{text}"
    - id: kiwi
      name: Kiwi
      icon: "🥝"
      description: Pantry tracker
      base_url: http://localhost:8511
      sample_endpoint: /api/inventory
      text_fields: [name, category, notes]
      prompt_template: "Describe this pantry item and estimate how best to use it:\n\n{text}"
    - id: snipe
      name: Snipe
      icon: "🎯"
      description: eBay trust scoring
      base_url: http://localhost:8509
      sample_endpoint: /api/listings
      text_fields: [title, description, seller_info]
      prompt_template: "Evaluate the trustworthiness of this listing and flag any red flags:\n\n{text}"
    - id: pagepiper
      name: Pagepiper
      icon: "📄"
      description: "PDF/rulebook RAG tool: page-level text chunks"
      # NOTE(review): same host:port as the kiwi entry (8511) — confirm this is intended.
      base_url: http://localhost:8511
      health_path: /api/health
      sample_endpoint: /api/library
      chunk_endpoint: /api/library/sample-chunks?limit=50  # requires pagepiper#6
      text_fields: [title]
      prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
# ── Log corpus (Turnstone training data) ──────────────────────────────────────
corpus:
  # Directory containing pipeline JSONL log files to ingest (pull-side).
  # Files named <script>_<ts>.jsonl; one structured record per line.
  # POST /api/corpus/pipeline-ingest walks this dir and imports new files.
  # NFS-mounted on both Heimdall and Sif at /Library/Assets/
  pipeline_ingest_dir: /Library/Assets/logs/pipeline/
  # Turnstone push sources (consent-gated, token-authenticated).
  # Uncomment the whole template below together; keep the indentation as shown.
  # sources:
  #   - token: "your-bearer-token"
  #     source_host: "node.local"
  #     owner: YourName
  #     consent_date: "2026-05-17"
  #     consent_method: signal_chat
# ── Embedding model comparison harness ────────────────────────────────────────
# With every key below commented out, embed_bench parses as null (not an empty mapping).
embed_bench:
  # ollama_url: http://localhost:11434  # optional; falls back to cforch.ollama_url
  # top_k: 5  # default hits per model per query