Compare commits

..

No commits in common. "2b990a603aeda90b910c02cc5151ade25f6c0075" and "6f9aad126e921071ddd15b6fef169637c17c35d4" have entirely different histories.

19 changed files with 73 additions and 2181 deletions

3
.gitignore vendored
View file

@ -8,9 +8,6 @@ __pycache__/
config/label_tool.yaml config/label_tool.yaml
# Data files (user-generated, not for version control) # Data files (user-generated, not for version control)
data/corpus.db
data/corpus.db-wal
data/corpus.db-shm
data/email_score.jsonl data/email_score.jsonl
data/email_label_queue.jsonl data/email_label_queue.jsonl
data/email_compare_sample.jsonl data/email_compare_sample.jsonl

View file

@ -40,36 +40,6 @@ app.include_router(plans_bench_router, prefix="/api/plans-bench")
# In-memory last-action store (single user, local tool — in-memory is fine) # In-memory last-action store (single user, local tool — in-memory is fine)
_last_action: dict | None = None _last_action: dict | None = None
# -- Backward-compat shims (ClassifierTab still uses old /api/finetune/* paths)
# Remove once ClassifierTab fine-tune section is migrated to TrainJobsView.
from fastapi import Query
from fastapi.responses import StreamingResponse as _StreamingResponse
@app.get("/api/finetune/run")
def finetune_run_compat(model: str = Query(...), epochs: int = Query(5)) -> _StreamingResponse:
"""Shim: create a classifier train job and immediately stream it."""
from app.train.train import create_job, run_job, CreateJobRequest
job = create_job(CreateJobRequest(type="classifier", model_key=model, config_json={"epochs": epochs}))
return run_job(job["id"])
@app.post("/api/finetune/cancel")
def finetune_cancel_compat() -> dict:
"""Shim: cancel the most recent running classifier job."""
from app.train.train import _db, _init_db, cancel_job
from fastapi import HTTPException
_init_db()
with _db() as conn:
row = conn.execute(
"SELECT id FROM jobs WHERE type='classifier' AND status='running' ORDER BY started_at DESC LIMIT 1"
).fetchone()
if row is None:
return {"status": "nothing_running"}
return cancel_job(row["id"])
from app.data.log_corpus import router as log_corpus_router
app.include_router(log_corpus_router, prefix="/api/corpus")
from app.dashboard import router as dashboard_router from app.dashboard import router as dashboard_router
app.include_router(dashboard_router, prefix="/api") app.include_router(dashboard_router, prefix="/api")

View file

@ -16,7 +16,6 @@ import json
import logging import logging
import os import os
import re import re
import select as _select
import subprocess as _subprocess import subprocess as _subprocess
import tempfile import tempfile
from pathlib import Path from pathlib import Path
@ -312,12 +311,8 @@ def run_benchmark(
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events.""" """Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
global _BENCH_RUNNING, _bench_proc global _BENCH_RUNNING, _bench_proc
# Check if the process is actually still alive; reset stale flag if not.
if _BENCH_RUNNING: if _BENCH_RUNNING:
if _bench_proc is not None and _bench_proc.poll() is None: raise HTTPException(409, "A benchmark is already running")
raise HTTPException(409, "A benchmark is already running")
_BENCH_RUNNING = False
_bench_proc = None
cfg = _load_cforch_config() cfg = _load_cforch_config()
bench_script = cfg.get("bench_script", "") bench_script = cfg.get("bench_script", "")
@ -441,23 +436,8 @@ def run_benchmark(
env=proc_env, env=proc_env,
) )
_bench_proc = proc _bench_proc = proc
_IDLE_TIMEOUT_S = 120 # kill if no output for 2 minutes (node crash)
try: try:
while True: for line in proc.stdout:
ready = _select.select([proc.stdout], [], [], _IDLE_TIMEOUT_S)
if not ready[0]:
# No output for IDLE_TIMEOUT_S — node likely crashed
proc.terminate()
try:
proc.wait(timeout=5)
except _subprocess.TimeoutExpired:
proc.kill()
msg = f"Benchmark timed out — no output for {_IDLE_TIMEOUT_S}s (cluster node may have crashed)"
yield f"data: {json.dumps({'type': 'error', 'message': msg})}\n\n"
break
line = proc.stdout.readline()
if not line:
break
line = _strip_ansi(line.rstrip()) line = _strip_ansi(line.rstrip())
if line: if line:
yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n" yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"

View file

@ -1,18 +1,17 @@
"""Avocet -- dashboard aggregate API. """Avocet -- dashboard aggregate API.
GET /api/dashboard returns the current flywheel state: GET /api/dashboard returns the current flywheel state:
labeled_since_last_eval -- items labeled after the most recent bench run labeled_since_last_eval -- items labeled after the most recent eval run
last_eval_timestamp -- ISO timestamp of newest bench_results summary last_eval_timestamp -- ISO timestamp of newest bench_results summary
last_eval_best_score -- best macro_f1 from that summary last_eval_best_score -- best macro_f1 from that summary
active_jobs -- jobs with status queued or running active_jobs -- jobs with status queued or running
corrections_pending -- sft_candidates with status=needs_review corrections_pending -- sft_candidates with status=needs_review
corrections_export_ready -- approved sft candidates with non-blank correction corrections_export_ready -- approved sft candidates with non-blank correction
recent_bench_runs -- most-recent timestamp + score per bench type
signals -- computed booleans for UI nudge indicators signals -- computed booleans for UI nudge indicators
Thresholds in label_tool.yaml pipeline: section: Thresholds in label_tool.yaml pipeline: section:
pipeline: pipeline:
data_eval_threshold: 50 # labeled items since last bench to trigger nudge data_eval_threshold: 50 # labeled items since last eval to trigger nudge
eval_train_threshold: 0.05 # improvement delta needed before retraining (future) eval_train_threshold: 0.05 # improvement delta needed before retraining (future)
""" """
from __future__ import annotations from __future__ import annotations
@ -78,7 +77,7 @@ def _load_score_records() -> list[dict]:
pass pass
return records return records
def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str | None, float | None]: def _find_latest_eval(results_dir_override: str = "") -> tuple[str | None, float | None]:
"""Return (iso_timestamp, best_macro_f1) from the newest bench_results summary. """Return (iso_timestamp, best_macro_f1) from the newest bench_results summary.
Checks results_dir from cforch config if set, then falls back to Checks results_dir from cforch config if set, then falls back to
@ -108,8 +107,6 @@ def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str |
if summary.exists(): if summary.exists():
try: try:
data = json.loads(summary.read_text(encoding="utf-8")) data = json.loads(summary.read_text(encoding="utf-8"))
if not isinstance(data, dict):
continue # cforch LLM-bench summaries are lists; skip
ts = data.get("timestamp") or subdir.name ts = data.get("timestamp") or subdir.name
score = data.get("best_macro_f1") or data.get("macro_f1") score = data.get("best_macro_f1") or data.get("macro_f1")
return ts, (float(score) if isinstance(score, (int, float)) else None) return ts, (float(score) if isinstance(score, (int, float)) else None)
@ -117,10 +114,6 @@ def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str |
logger.warning("Failed to parse summary.json at %s: %s", summary, exc) logger.warning("Failed to parse summary.json at %s: %s", summary, exc)
return None, None return None, None
# Keep old name as alias so existing callers in tests still work.
_find_latest_eval = _find_latest_classifier_bench
def _count_corrections() -> tuple[int, int]: def _count_corrections() -> tuple[int, int]:
"""Return (pending_count, export_ready_count).""" """Return (pending_count, export_ready_count)."""
pending = 0 pending = 0
@ -176,106 +169,22 @@ def _count_labeled_since(since_ts: str | None) -> int:
return sum(1 for r in records if r.get("labeled_at", "") > since_ts) return sum(1 for r in records if r.get("labeled_at", "") > since_ts)
def _get_recent_bench_runs() -> dict:
"""Return most-recent run summary for each bench type.
Each entry: {"timestamp": str|None, "metric": str|None, "score": float|None}
"""
runs: dict[str, dict] = {
"classifier": {"timestamp": None, "metric": "macro_f1", "score": None},
"llm": {"timestamp": None, "metric": None, "score": None},
"style": {"timestamp": None, "metric": None, "score": None},
"plans": {"timestamp": None, "metric": "avg_score", "score": None},
}
# ── Classifier: bench_results/<run>/summary.json ──────────────────────
clf_ts, clf_score = _find_latest_classifier_bench()
if clf_ts:
runs["classifier"]["timestamp"] = clf_ts
runs["classifier"]["score"] = clf_score
# ── LLM bench + Style: benchmark_results/ ─────────────────────────────
f = _config_file()
bench_dir: Path | None = None
if f.exists():
try:
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
rd = (raw.get("cforch", {}) or {}).get("results_dir", "")
if rd:
bench_dir = Path(rd)
except Exception:
pass
if bench_dir is None:
bench_dir = _ROOT / "benchmark_results"
if bench_dir.exists():
llm_files = sorted(
[p for p in bench_dir.glob("*.json") if not p.name.startswith("style_")],
key=lambda p: p.stat().st_mtime, reverse=True,
)
if llm_files:
try:
data = json.loads(llm_files[0].read_text(encoding="utf-8"))
runs["llm"]["timestamp"] = data.get("timestamp") or llm_files[0].stem
except Exception:
pass
style_files = sorted(bench_dir.glob("style_*.json"), reverse=True)
if style_files:
try:
data = json.loads(style_files[0].read_text(encoding="utf-8"))
if isinstance(data, list) and data:
runs["style"]["timestamp"] = data[0].get("timestamp") or style_files[0].stem
except Exception:
pass
# ── Plans bench: data/plans_bench_results/plans_*.json ────────────────
plans_dir = _DATA_DIR / "plans_bench_results"
if plans_dir.exists():
plans_files = sorted(plans_dir.glob("plans_*.json"), reverse=True)
if plans_files:
run_id = plans_files[0].stem
try:
d: dict = json.loads(plans_files[0].read_text(encoding="utf-8"))
all_scores = [
r["total_score"]
for results in d.values()
for r in results
if isinstance(r, dict) and not r.get("error")
]
avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else None
try:
date_part = run_id.removeprefix("plans_")
date, time_part = date_part.split("_")
ts_display = f"{date} {time_part[:2]}:{time_part[2:4]}"
except Exception:
ts_display = run_id
runs["plans"]["timestamp"] = ts_display
runs["plans"]["score"] = avg
except Exception:
pass
return runs
@router.get("/dashboard") @router.get("/dashboard")
def get_dashboard() -> dict: def get_dashboard() -> dict:
data_threshold, _train_threshold = _load_thresholds() data_eval_threshold, eval_train_threshold = _load_thresholds()
last_ts, last_score = _find_latest_classifier_bench() last_eval_ts, last_eval_score = _find_latest_eval()
labeled_since = _count_labeled_since(last_ts) labeled_since = _count_labeled_since(last_eval_ts)
corrections_pending, corrections_export_ready = _count_corrections() corrections_pending, corrections_export_ready = _count_corrections()
active_jobs = _get_active_jobs() active_jobs = _get_active_jobs()
recent_bench = _get_recent_bench_runs()
return { return {
"labeled_since_last_eval": labeled_since, "labeled_since_last_eval": labeled_since,
"last_eval_timestamp": last_ts, "last_eval_timestamp": last_eval_ts,
"last_eval_best_score": last_score, "last_eval_best_score": last_eval_score,
"active_jobs": active_jobs, "active_jobs": active_jobs,
"corrections_pending": corrections_pending, "corrections_pending": corrections_pending,
"corrections_export_ready": corrections_export_ready, "corrections_export_ready": corrections_export_ready,
"recent_bench_runs": recent_bench,
"signals": { "signals": {
"data_to_eval": labeled_since >= data_threshold, "data_to_eval": labeled_since >= data_eval_threshold,
"eval_to_train": False, # future: implement delta-F1 comparison "eval_to_train": False, # future: implement delta-F1 comparison
"train_to_fleet": False, # future: implement fleet sync signal "train_to_fleet": False, # future: implement fleet sync signal
}, },

View file

@ -1,352 +0,0 @@
"""Avocet — Log Corpus receiver and labeling API.
Receives push batches from consented Turnstone nodes, stores entries for labeling,
and exports labeled data as JSONL for the logreading fine-tune pipeline.
DB: data/corpus.db (separate from train_jobs.db different lifecycle)
Auth: Bearer token validated against corpus_sources table (seeded from label_tool.yaml).
All endpoints registered on `router`. api.py includes this with prefix="/api/corpus".
"""
from __future__ import annotations
import json
import logging
import sqlite3
import uuid
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path
from typing import Generator
import yaml
from fastapi import APIRouter, Depends, HTTPException
from fastapi.requests import Request
from fastapi.responses import StreamingResponse
logger = logging.getLogger(__name__)
_ROOT = Path(__file__).parent.parent.parent
_CONFIG_DIR: Path | None = None
_DATA_DIR: Path = _ROOT / "data"
router = APIRouter()
_DB_PATH: Path = _ROOT / "data" / "corpus.db"
_SCHEMA = """
CREATE TABLE IF NOT EXISTS corpus_sources (
token TEXT PRIMARY KEY,
source_host TEXT NOT NULL,
owner TEXT NOT NULL,
consent_date TEXT NOT NULL,
consent_method TEXT NOT NULL,
active INTEGER NOT NULL DEFAULT 1
);
CREATE TABLE IF NOT EXISTS corpus_batches (
id TEXT PRIMARY KEY,
source_host TEXT NOT NULL,
batch_type TEXT NOT NULL,
received_at TEXT NOT NULL,
entry_count INTEGER NOT NULL,
watermark_from TEXT,
watermark_to TEXT,
raw_json TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS corpus_entries (
id TEXT PRIMARY KEY,
batch_id TEXT NOT NULL REFERENCES corpus_batches(id),
source_host TEXT NOT NULL,
origin_entry_id TEXT,
timestamp_iso TEXT,
severity TEXT,
source_id TEXT,
text TEXT NOT NULL,
matched_patterns TEXT DEFAULT '[]',
label_state TEXT NOT NULL DEFAULT 'unlabeled',
failure_type TEXT,
plain_explanation TEXT,
known_pattern TEXT,
labeled_at TEXT,
labeled_by TEXT DEFAULT 'alan',
pii_flagged INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
"""
# ── Testability seams ──────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
global _CONFIG_DIR
_CONFIG_DIR = path
def set_data_dir(path: Path) -> None:
global _DATA_DIR, _DB_PATH
_DATA_DIR = path
_DB_PATH = path / "corpus.db"
# ── Internal helpers ───────────────────────────────────────────────────────────
def _config_file() -> Path:
if _CONFIG_DIR is not None:
return _CONFIG_DIR / "label_tool.yaml"
return _ROOT / "config" / "label_tool.yaml"
@contextmanager
def _db() -> Generator[sqlite3.Connection, None, None]:
conn = sqlite3.connect(str(_DB_PATH))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()
def _init_db() -> None:
with _db() as conn:
conn.executescript(_SCHEMA)
_seed_sources(conn)
def _load_corpus_config() -> list[dict]:
f = _config_file()
if not f.exists():
return []
try:
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse corpus config: %s", exc)
return []
return raw.get("corpus", {}).get("sources", []) or []
def _seed_sources(conn: sqlite3.Connection) -> None:
for src in _load_corpus_config():
conn.execute(
"INSERT OR IGNORE INTO corpus_sources (token, source_host, owner, consent_date, consent_method) "
"VALUES (?, ?, ?, ?, ?)",
(src["token"], src["source_host"], src["owner"],
src["consent_date"], src["consent_method"]),
)
def _validate_token(token: str, conn: sqlite3.Connection) -> str:
"""Return source_host for token, or raise 403."""
row = conn.execute(
"SELECT source_host FROM corpus_sources WHERE token = ? AND active = 1",
(token,),
).fetchone()
if row is None:
raise HTTPException(status_code=403, detail="Unknown or revoked consent token")
return row["source_host"]
def _extract_bearer(request: Request) -> str:
auth = request.headers.get("Authorization", "")
if not auth.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Bearer token required")
return auth.removeprefix("Bearer ").strip()
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
# ── Startup ────────────────────────────────────────────────────────────────────
_init_db()
# ── POST /api/corpus/log-batch ─────────────────────────────────────────────────
@router.post("/log-batch")
def receive_batch(request: Request, payload: dict) -> dict:
"""Accept a push batch from a Turnstone node."""
token = _extract_bearer(request)
batch_type = payload.get("batch_type", "raw_entries")
entries_raw = payload.get("entries", [])
batch_id = payload.get("batch_id") or str(uuid.uuid4())
with _db() as conn:
source_host = _validate_token(token, conn)
conn.execute(
"INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, "
"watermark_from, watermark_to, raw_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
(batch_id, source_host, batch_type, _now_iso(), len(entries_raw),
str(payload.get("watermark_from", "")),
str(payload.get("watermark_to", "")),
json.dumps(payload)),
)
stored = 0
for entry in entries_raw:
text = entry.get("text", "").strip()
if not text:
continue
conn.execute(
"INSERT OR IGNORE INTO corpus_entries "
"(id, batch_id, source_host, origin_entry_id, timestamp_iso, severity, "
"source_id, text, matched_patterns) "
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
(str(uuid.uuid4()), batch_id, source_host,
entry.get("entry_id") or entry.get("id"),
entry.get("timestamp_iso"),
entry.get("severity"),
entry.get("source_id"),
text,
json.dumps(entry.get("matched_patterns", []))),
)
stored += 1
logger.info("Received batch %s from %s: %d/%d entries stored",
batch_id, source_host, stored, len(entries_raw))
return {"received": True, "batch_id": batch_id, "entries_stored": stored}
# ── GET /api/corpus/entries ────────────────────────────────────────────────────
@router.get("/entries")
def list_entries(
state: str = "unlabeled",
source_host: str | None = None,
limit: int = 25,
) -> dict:
"""Return entries for labeling. Default: unlabeled entries, oldest first."""
with _db() as conn:
query = "SELECT * FROM corpus_entries WHERE label_state = ?"
params: list = [state]
if source_host:
query += " AND source_host = ?"
params.append(source_host)
query += " ORDER BY rowid LIMIT ?"
params.append(min(limit, 100))
rows = conn.execute(query, params).fetchall()
return {"entries": [dict(r) for r in rows], "count": len(rows)}
# ── POST /api/corpus/entries/{id}/label ───────────────────────────────────────
@router.post("/entries/{entry_id}/label")
def label_entry(entry_id: str, body: dict) -> dict:
"""Submit a label for a corpus entry."""
failure_type = body.get("failure_type")
plain_explanation = body.get("plain_explanation", "").strip()
known_pattern = body.get("known_pattern")
pii_flagged = int(bool(body.get("pii_flagged", False)))
if not failure_type:
raise HTTPException(status_code=422, detail="failure_type is required")
valid_types = {"hardware", "software", "network", "security", "application", "none", "other"}
if failure_type not in valid_types:
raise HTTPException(status_code=422, detail=f"failure_type must be one of {sorted(valid_types)}")
with _db() as conn:
row = conn.execute("SELECT id FROM corpus_entries WHERE id = ?", (entry_id,)).fetchone()
if row is None:
raise HTTPException(status_code=404, detail="Entry not found")
conn.execute(
"UPDATE corpus_entries SET label_state='labeled', failure_type=?, plain_explanation=?, "
"known_pattern=?, labeled_at=?, pii_flagged=? WHERE id=?",
(failure_type, plain_explanation, known_pattern, _now_iso(), pii_flagged, entry_id),
)
return {"labeled": True, "entry_id": entry_id}
# ── POST /api/corpus/entries/{id}/skip ────────────────────────────────────────
@router.post("/entries/{entry_id}/skip")
def skip_entry(entry_id: str) -> dict:
with _db() as conn:
row = conn.execute("SELECT id FROM corpus_entries WHERE id = ?", (entry_id,)).fetchone()
if row is None:
raise HTTPException(status_code=404, detail="Entry not found")
conn.execute(
"UPDATE corpus_entries SET label_state='skipped' WHERE id=?", (entry_id,)
)
return {"skipped": True, "entry_id": entry_id}
# ── GET /api/corpus/stats ──────────────────────────────────────────────────────
@router.get("/stats")
def get_stats() -> dict:
with _db() as conn:
total = conn.execute("SELECT COUNT(*) FROM corpus_entries").fetchone()[0]
by_state = {
r["label_state"]: r["cnt"]
for r in conn.execute(
"SELECT label_state, COUNT(*) AS cnt FROM corpus_entries GROUP BY label_state"
).fetchall()
}
by_source = {
r["source_host"]: r["cnt"]
for r in conn.execute(
"SELECT source_host, COUNT(*) AS cnt FROM corpus_entries GROUP BY source_host"
).fetchall()
}
by_severity = {
r["severity"]: r["cnt"]
for r in conn.execute(
"SELECT severity, COUNT(*) AS cnt FROM corpus_entries "
"WHERE severity IS NOT NULL GROUP BY severity"
).fetchall()
}
batch_count = conn.execute("SELECT COUNT(*) FROM corpus_batches").fetchone()[0]
return {
"total_entries": total,
"batch_count": batch_count,
"by_label_state": by_state,
"by_source": by_source,
"by_severity": by_severity,
}
# ── GET /api/corpus/export ────────────────────────────────────────────────────
@router.get("/export")
def export_labeled() -> StreamingResponse:
"""Stream labeled, non-PII entries as JSONL for SFT harness."""
with _db() as conn:
rows = conn.execute(
"SELECT source_host, source_id, severity, text, failure_type, plain_explanation, known_pattern "
"FROM corpus_entries "
"WHERE label_state = 'labeled' AND pii_flagged = 0 AND plain_explanation != ''"
"ORDER BY rowid"
).fetchall()
def _generate():
for row in rows:
record = {
"input": row["text"],
"output": row["plain_explanation"],
"metadata": {
"failure_type": row["failure_type"],
"source": row["source_host"],
"source_id": row["source_id"],
"severity": row["severity"],
"known_pattern": row["known_pattern"],
},
}
yield json.dumps(record) + "\n"
return StreamingResponse(
_generate(),
media_type="application/x-ndjson",
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
)

View file

@ -12,33 +12,27 @@ Route prefixes when mounted at /api in api.py:
""" """
from __future__ import annotations from __future__ import annotations
from pathlib import Path
from fastapi import APIRouter from fastapi import APIRouter
from app.cforch import router as _cforch_router from app.cforch import router as _cforch_router
from app.style import router as _style_router from app.style import router as _style_router
from app.voice import router as _voice_router from app.voice import router as _voice_router
from app.plans_bench import router as _plans_router from app.plans_bench import router as _plans_router
from app.eval.embed_bench import router as _embed_router
router = APIRouter() router = APIRouter()
router.include_router(_cforch_router, prefix="/cforch") router.include_router(_cforch_router, prefix="/cforch")
router.include_router(_style_router, prefix="/style") router.include_router(_style_router, prefix="/style")
router.include_router(_voice_router, prefix="/voice") router.include_router(_voice_router, prefix="/voice")
router.include_router(_plans_router, prefix="/plans-bench") router.include_router(_plans_router, prefix="/plans-bench")
router.include_router(_embed_router, prefix="/embed-bench")
def set_config_dir(path: Path | None) -> None: def set_config_dir(path) -> None:
"""Propagate config dir override to all sub-modules -- used by tests.""" """Propagate config dir override to all sub-modules -- used by tests."""
import app.cforch as _cforch_mod import app.cforch as _cforch_mod
import app.style as _style_mod import app.style as _style_mod
import app.voice as _voice_mod import app.voice as _voice_mod
import app.plans_bench as _plans_mod import app.plans_bench as _plans_mod
import app.eval.embed_bench as _embed_mod
_cforch_mod.set_config_dir(path) _cforch_mod.set_config_dir(path)
_style_mod.set_config_dir(path) _style_mod.set_config_dir(path)
_voice_mod.set_config_dir(path) _voice_mod.set_config_dir(path)
_plans_mod.set_config_dir(path) _plans_mod.set_config_dir(path)
_embed_mod.set_config_dir(path)

View file

@ -1,293 +0,0 @@
"""Avocet — embedding model comparison harness.
Exposes FastAPI routes under /api/embed-bench (mounted via app/eval/cforch.py).
All computation is local: no LLM inference, Ollama only. MIT tier throughout.
"""
from __future__ import annotations
import csv
import io
import json
import logging
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
import yaml
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, field_validator
logger = logging.getLogger(__name__)
_ROOT = Path(__file__).parent.parent.parent
_CONFIG_DIR: Path | None = None # override via set_config_dir() in tests
_RUN_ACTIVE: bool = False
_RATINGS_FILE = _ROOT / "data" / "embed_bench_ratings.jsonl"
router = APIRouter()
# ── Testability seam ──────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
global _CONFIG_DIR
_CONFIG_DIR = path
# ── Internal helpers ──────────────────────────────────────────────────────────
def _config_file() -> Path:
if _CONFIG_DIR is not None:
return _CONFIG_DIR / "label_tool.yaml"
return _ROOT / "config" / "label_tool.yaml"
def _load_config() -> dict[str, Any]:
f = _config_file()
if not f.exists():
return {}
try:
return yaml.safe_load(f.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse embed_bench config %s: %s", f, exc)
return {}
def _ollama_url() -> str:
cfg = _load_config()
embed_cfg = cfg.get("embed_bench", {}) or {}
cforch_cfg = cfg.get("cforch", {}) or {}
return (
embed_cfg.get("ollama_url")
or cforch_cfg.get("ollama_url", "http://localhost:11434")
)
def _ratings_path() -> Path:
if _CONFIG_DIR is not None:
return _CONFIG_DIR / "embed_bench_ratings.jsonl"
return _RATINGS_FILE
def _cosine(a: list[float], b: list[float]) -> float:
if len(a) != len(b):
raise ValueError(
f"Embedding dimension mismatch: {len(a)} vs {len(b)}"
)
dot = sum(x * y for x, y in zip(a, b))
mag_a = math.sqrt(sum(x * x for x in a))
mag_b = math.sqrt(sum(x * x for x in b))
if mag_a == 0.0 or mag_b == 0.0:
return 0.0
return dot / (mag_a * mag_b)
# ── GET /models ───────────────────────────────────────────────────────────────
@router.get("/models")
def get_models() -> dict:
"""Return Ollama embedding models available on the configured instance."""
ollama = _ollama_url()
models: list[dict] = []
try:
resp = httpx.get(f"{ollama}/api/tags", timeout=5.0)
resp.raise_for_status()
for entry in resp.json().get("models", []):
models.append({
"name": entry.get("name", ""),
"size": entry.get("size", 0),
})
except httpx.HTTPStatusError as exc:
logger.warning("Ollama /api/tags returned HTTP %s: %s", exc.response.status_code, exc)
except httpx.RequestError as exc:
logger.warning("Failed to reach Ollama for model list: %s", exc)
return {"models": models, "ollama_url": ollama}
# ── POST /run ─────────────────────────────────────────────────────────────────
class RunRequest(BaseModel):
corpus: list[str]
queries: list[str]
models: list[str]
top_k: int = 5
ollama_url: str = ""
@field_validator("corpus")
@classmethod
def corpus_nonempty(cls, v: list[str]) -> list[str]:
if not v:
raise ValueError("corpus must not be empty")
return v
@field_validator("queries")
@classmethod
def queries_nonempty(cls, v: list[str]) -> list[str]:
if not v:
raise ValueError("queries must not be empty")
return v
@field_validator("models")
@classmethod
def models_nonempty(cls, v: list[str]) -> list[str]:
if not v:
raise ValueError("models must contain at least one model name")
return v
def _embed_texts(ollama: str, model: str, texts: list[str]) -> list[list[float]]:
"""Batch-embed texts via Ollama /v1/embeddings. Returns one vector per text."""
resp = httpx.post(
f"{ollama}/v1/embeddings",
json={"model": model, "input": texts},
timeout=120.0,
)
resp.raise_for_status()
data = resp.json().get("data", [])
return [item["embedding"] for item in data]
def _sse(event: dict) -> str:
return f"data: {json.dumps(event)}\n\n"
@router.post("/run")
def run_embed_bench(req: RunRequest) -> StreamingResponse:
"""Embed corpus + queries with each model; stream SSE results."""
global _RUN_ACTIVE
if _RUN_ACTIVE:
raise HTTPException(409, "An embedding benchmark run is already active")
ollama = req.ollama_url or _ollama_url()
def _generate():
global _RUN_ACTIVE
_RUN_ACTIVE = True
try:
for model_idx, model in enumerate(req.models, start=1):
yield _sse({
"type": "progress",
"msg": f"Indexing corpus with {model} ({model_idx}/{len(req.models)})...",
})
try:
corpus_vecs = _embed_texts(ollama, model, req.corpus)
except Exception as exc:
yield _sse({"type": "error", "msg": f"Ollama error for {model}: {exc}"})
continue
yield _sse({
"type": "progress",
"msg": f"Running queries with {model}...",
})
for q_idx, query in enumerate(req.queries):
try:
q_vecs = _embed_texts(ollama, model, [query])
except Exception as exc:
yield _sse({"type": "error", "msg": f"Query embed error ({model}): {exc}"})
continue
q_vec = q_vecs[0]
scored = sorted(
[
{"chunk_idx": i, "text": chunk, "score": round(_cosine(q_vec, cv), 4)}
for i, (chunk, cv) in enumerate(zip(req.corpus, corpus_vecs))
],
key=lambda h: h["score"],
reverse=True,
)[: req.top_k]
yield _sse({
"type": "result",
"query_idx": q_idx,
"query": query,
"model": model,
"hits": scored,
})
yield _sse({"type": "done"})
finally:
_RUN_ACTIVE = False
return StreamingResponse(_generate(), media_type="text/event-stream")
# ── POST /rate ────────────────────────────────────────────────────────────────
_VALID_RATINGS = {"relevant", "not_relevant"}
class RatingRequest(BaseModel):
query: str
model: str
chunk_text: str
chunk_idx: int
rating: str
@field_validator("rating")
@classmethod
def rating_valid(cls, v: str) -> str:
if v not in _VALID_RATINGS:
raise ValueError(f"rating must be one of {_VALID_RATINGS}")
return v
@router.post("/rate")
def rate_result(req: RatingRequest) -> dict:
"""Append one rating to the JSONL ratings file."""
entry = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"query": req.query,
"model": req.model,
"chunk_idx": req.chunk_idx,
"chunk_text": req.chunk_text,
"rating": req.rating,
}
path = _ratings_path()
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("a", encoding="utf-8") as fh:
fh.write(json.dumps(entry) + "\n")
return {"ok": True}
# ── GET /export ───────────────────────────────────────────────────────────────
_CSV_FIELDS = ["timestamp", "query", "model", "chunk_idx", "chunk_text", "rating"]
@router.get("/export")
def export_ratings(format: str = "csv") -> Any:
"""Download ratings as CSV or JSON."""
path = _ratings_path()
rows: list[dict] = []
if path.exists():
for raw in path.read_text(encoding="utf-8").splitlines():
raw = raw.strip()
if raw:
try:
rows.append(json.loads(raw))
except json.JSONDecodeError:
pass
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
if format == "json":
content = json.dumps(rows, ensure_ascii=False, indent=2)
return StreamingResponse(
iter([content]),
media_type="application/json",
headers={"Content-Disposition": f'attachment; filename="embed_comparison_{date_str}.json"'},
)
# Default: CSV
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=_CSV_FIELDS, extrasaction="ignore")
writer.writeheader()
writer.writerows(rows)
return StreamingResponse(
iter([buf.getvalue()]),
media_type="text/csv",
headers={"Content-Disposition": f'attachment; filename="embed_comparison_{date_str}.csv"'},
)

View file

@ -38,17 +38,13 @@ except ImportError: # pragma: no cover
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_ROOT = Path(__file__).parent.parent _ROOT = Path(__file__).parent.parent
_MODELS_DIR: Path = Path( _MODELS_DIR: Path = _ROOT / "models"
os.environ.get("AVOCET_MODELS_DIR", str(_ROOT / "models"))
)
_QUEUE_DIR: Path = _ROOT / "data" _QUEUE_DIR: Path = _ROOT / "data"
# Service-specific model destinations. # Service-specific model destinations.
# cf-text models land on the NFS-mounted shared asset store so every cluster # cf-text models land on the NFS-mounted shared asset store so every cluster
# node can reach them without a separate download. Avocet classifiers default # node can reach them without a separate download. Avocet classifiers stay local
# to a local path but can be redirected via AVOCET_MODELS_DIR — set this to # because they are fine-tuned in-place and are only consumed by avocet itself.
# /Library/Assets/LLM/avocet/models on NFS-connected nodes to keep all model
# weights out of the repo directory.
# Override via CF_TEXT_MODELS_DIR env var (useful for dev / non-NFS setups). # Override via CF_TEXT_MODELS_DIR env var (useful for dev / non-NFS setups).
_CF_TEXT_MODELS_DIR: Path = Path( _CF_TEXT_MODELS_DIR: Path = Path(
os.environ.get("CF_TEXT_MODELS_DIR", "/Library/Assets/LLM/cf-text/models") os.environ.get("CF_TEXT_MODELS_DIR", "/Library/Assets/LLM/cf-text/models")

View file

@ -110,19 +110,3 @@ imitate:
sample_endpoint: /api/listings sample_endpoint: /api/listings
text_fields: [title, description, seller_info] text_fields: [title, description, seller_info]
prompt_template: "Evaluate the trustworthiness of this listing and flag any red flags:\n\n{text}" prompt_template: "Evaluate the trustworthiness of this listing and flag any red flags:\n\n{text}"
- id: pagepiper
name: Pagepiper
icon: "📄"
description: "PDF/rulebook RAG tool: page-level text chunks"
base_url: http://localhost:8511
health_path: /api/health
sample_endpoint: /api/library
chunk_endpoint: /api/library/sample-chunks?limit=50 # requires pagepiper#6
text_fields: [title]
prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
# ── Embedding model comparison harness ────────────────────────────────────────
embed_bench:
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url
# top_k: 5 # default hits per model per query

View file

@ -1,234 +0,0 @@
"""Tests for app/eval/embed_bench.py."""
from __future__ import annotations
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from fastapi.testclient import TestClient
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def reset_embed_bench_globals(tmp_path):
"""Redirect config dir to tmp_path and reset running flag."""
from app.eval import embed_bench as mod
prev_config_dir = mod._CONFIG_DIR
prev_running = mod._RUN_ACTIVE
mod.set_config_dir(tmp_path)
mod._RUN_ACTIVE = False
yield tmp_path
mod.set_config_dir(prev_config_dir)
mod._RUN_ACTIVE = prev_running
@pytest.fixture
def client():
from app.api import app
return TestClient(app)
# ── cosine helper ──────────────────────────────────────────────────────────────
def test_cosine_identical():
from app.eval.embed_bench import _cosine
assert _cosine([1.0, 0.0], [1.0, 0.0]) == pytest.approx(1.0)
def test_cosine_orthogonal():
from app.eval.embed_bench import _cosine
assert _cosine([1.0, 0.0], [0.0, 1.0]) == pytest.approx(0.0)
def test_cosine_opposite():
from app.eval.embed_bench import _cosine
assert _cosine([1.0, 0.0], [-1.0, 0.0]) == pytest.approx(-1.0)
def test_cosine_zero_vector_returns_zero():
from app.eval.embed_bench import _cosine
assert _cosine([0.0, 0.0], [1.0, 0.0]) == pytest.approx(0.0)
# ── models endpoint ────────────────────────────────────────────────────────────
def test_models_returns_list_with_mock(client, tmp_path):
"""GET /api/embed-bench/models returns list from Ollama tags endpoint."""
import yaml
cfg = {"cforch": {"ollama_url": "http://localhost:11434"}}
(tmp_path / "label_tool.yaml").write_text(yaml.dump(cfg))
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = {
"models": [
{"name": "nomic-embed-text", "size": 274302480},
{"name": "mxbai-embed-large", "size": 669000000},
]
}
mock_resp.raise_for_status = MagicMock()
with patch("app.eval.embed_bench.httpx.get", return_value=mock_resp):
r = client.get("/api/embed-bench/models")
assert r.status_code == 200
data = r.json()
assert isinstance(data["models"], list)
assert any(m["name"] == "nomic-embed-text" for m in data["models"])
def test_models_returns_empty_on_ollama_error(client, tmp_path):
"""GET /api/embed-bench/models returns empty list if Ollama unreachable."""
import httpx
with patch("app.eval.embed_bench.httpx.get", side_effect=httpx.ConnectError("refused")):
r = client.get("/api/embed-bench/models")
assert r.status_code == 200
assert r.json()["models"] == []
# ── run endpoint ───────────────────────────────────────────────────────────────
def test_run_empty_corpus_returns_422(client):
r = client.post("/api/embed-bench/run", json={
"corpus": [], "queries": ["test"], "models": ["nomic-embed-text"], "top_k": 3
})
assert r.status_code == 422
def test_run_empty_queries_returns_422(client):
r = client.post("/api/embed-bench/run", json={
"corpus": ["chunk 1"], "queries": [], "models": ["nomic-embed-text"], "top_k": 3
})
assert r.status_code == 422
def test_run_empty_models_returns_422(client):
r = client.post("/api/embed-bench/run", json={
"corpus": ["chunk 1"], "queries": ["test"], "models": [], "top_k": 3
})
assert r.status_code == 422
def _fake_embed_response(texts: list[str]) -> MagicMock:
"""Build a mock httpx.post response returning unit vectors for each text."""
resp = MagicMock()
resp.raise_for_status = MagicMock()
resp.json.return_value = {
"data": [{"embedding": [1.0, 0.0, 0.0] if i % 2 == 0 else [0.0, 1.0, 0.0]}
for i, _ in enumerate(texts)]
}
return resp
def _collect_sse(raw: bytes) -> list[dict]:
"""Parse SSE stream bytes into a list of decoded event dicts."""
events = []
for line in raw.decode().splitlines():
if line.startswith("data: "):
events.append(json.loads(line[6:]))
return events
def test_run_single_model_returns_result_and_done(client, tmp_path):
import yaml
(tmp_path / "label_tool.yaml").write_text(yaml.dump({"cforch": {"ollama_url": "http://localhost:11434"}}))
with patch("app.eval.embed_bench.httpx.post", return_value=_fake_embed_response(["chunk 1", "chunk 2"])):
r = client.post("/api/embed-bench/run", json={
"corpus": ["chunk 1", "chunk 2"],
"queries": ["what is chunk one?"],
"models": ["nomic-embed-text"],
"top_k": 2,
})
assert r.status_code == 200
events = _collect_sse(r.content)
types = [e["type"] for e in events]
assert "result" in types
assert types[-1] == "done"
result_events = [e for e in events if e["type"] == "result"]
assert result_events[0]["model"] == "nomic-embed-text"
assert result_events[0]["query_idx"] == 0
assert len(result_events[0]["hits"]) <= 2
def test_run_two_models_returns_two_result_events_per_query(client, tmp_path):
import yaml
(tmp_path / "label_tool.yaml").write_text(yaml.dump({"cforch": {"ollama_url": "http://localhost:11434"}}))
with patch("app.eval.embed_bench.httpx.post", return_value=_fake_embed_response(["chunk A", "chunk B"])):
r = client.post("/api/embed-bench/run", json={
"corpus": ["chunk A", "chunk B"],
"queries": ["find it"],
"models": ["nomic-embed-text", "mxbai-embed-large"],
"top_k": 2,
})
events = _collect_sse(r.content)
result_events = [e for e in events if e["type"] == "result"]
models_seen = {e["model"] for e in result_events}
assert "nomic-embed-text" in models_seen
assert "mxbai-embed-large" in models_seen
# ── rate + export ──────────────────────────────────────────────────────────────
def test_rate_appends_jsonl_line(client, tmp_path):
r = client.post("/api/embed-bench/rate", json={
"query": "test query",
"model": "nomic-embed-text",
"chunk_text": "some text",
"chunk_idx": 2,
"rating": "relevant",
})
assert r.status_code == 200
assert r.json() == {"ok": True}
ratings_file = tmp_path / "embed_bench_ratings.jsonl"
assert ratings_file.exists()
line = json.loads(ratings_file.read_text().strip())
assert line["query"] == "test query"
assert line["rating"] == "relevant"
assert line["chunk_idx"] == 2
assert "timestamp" in line
def test_export_csv_two_rows(client, tmp_path):
for i in range(2):
client.post("/api/embed-bench/rate", json={
"query": f"q{i}", "model": "nomic-embed-text",
"chunk_text": f"chunk {i}", "chunk_idx": i, "rating": "relevant",
})
r = client.get("/api/embed-bench/export?format=csv")
assert r.status_code == 200
assert "text/csv" in r.headers["content-type"]
lines = r.text.strip().splitlines()
assert len(lines) == 3 # header + 2 rows
assert "query" in lines[0]
def test_export_json_two_entries(client, tmp_path):
for i in range(2):
client.post("/api/embed-bench/rate", json={
"query": f"q{i}", "model": "nomic-embed-text",
"chunk_text": f"chunk {i}", "chunk_idx": i, "rating": "not_relevant",
})
r = client.get("/api/embed-bench/export?format=json")
assert r.status_code == 200
data = r.json()
assert isinstance(data, list)
assert len(data) == 2
assert data[0]["rating"] == "not_relevant"
def test_export_empty_returns_csv_header_only(client):
r = client.get("/api/embed-bench/export?format=csv")
assert r.status_code == 200
lines = r.text.strip().splitlines()
assert len(lines) == 1 # header only
assert "query" in lines[0]

View file

@ -1,272 +0,0 @@
"""Tests for app/data/log_corpus.py — corpus receiver and labeling endpoints."""
from __future__ import annotations
import json
import uuid
from pathlib import Path
import pytest
from fastapi.testclient import TestClient
from app.data import log_corpus as lc
VALID_TOKEN = str(uuid.uuid4())
VALID_HOST = "testnode.local"
@pytest.fixture(autouse=True)
def isolated_db(tmp_path, monkeypatch):
"""Each test gets its own fresh corpus DB and config dir."""
monkeypatch.setattr(lc, "_DATA_DIR", tmp_path)
monkeypatch.setattr(lc, "_DB_PATH", tmp_path / "corpus.db")
# Config dir pointing to a temp yaml with one test source
config_dir = tmp_path / "config"
config_dir.mkdir()
(config_dir / "label_tool.yaml").write_text(
f"corpus:\n sources:\n"
f" - token: \"{VALID_TOKEN}\"\n"
f" source_host: \"{VALID_HOST}\"\n"
f" owner: TestOwner\n"
f" consent_date: \"2026-05-11\"\n"
f" consent_method: signal_chat\n"
)
monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
lc._init_db()
@pytest.fixture()
def client():
from fastapi import FastAPI
app = FastAPI()
app.include_router(lc.router, prefix="/api/corpus")
return TestClient(app)
def _batch(batch_type="raw_entries", entries=None, source_host=VALID_HOST):
return {
"batch_version": 1,
"batch_id": str(uuid.uuid4()),
"pushed_at": "2026-05-11T10:00:00Z",
"source_host": source_host,
"batch_type": batch_type,
"watermark_from": 0,
"watermark_to": 5,
"entries": entries or [
{
"entry_id": str(uuid.uuid4()),
"source_id": "sonarr",
"timestamp_iso": "2026-05-11T09:58:00Z",
"severity": "ERROR",
"text": "Connection refused to indexer",
"matched_patterns": [],
}
],
}
# ── Receive endpoint ───────────────────────────────────────────────────────────
def test_receive_missing_auth(client):
resp = client.post("/api/corpus/log-batch", json=_batch())
assert resp.status_code == 401
def test_receive_invalid_token(client):
resp = client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": "Bearer bad-token"},
)
assert resp.status_code == 403
def test_receive_valid_batch(client):
resp = client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
assert resp.status_code == 200
data = resp.json()
assert data["received"] is True
assert data["entries_stored"] == 1
def test_receive_stores_source_host_from_token_not_payload(client):
"""source_host is always taken from the DB lookup, not the payload."""
payload = _batch(source_host="attacker-injected-host")
resp = client.post(
"/api/corpus/log-batch",
json=payload,
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
assert resp.status_code == 200
entries_resp = client.get("/api/corpus/entries")
entry = entries_resp.json()["entries"][0]
assert entry["source_host"] == VALID_HOST
def test_receive_skips_empty_text_entries(client):
payload = _batch(entries=[
{"entry_id": "e1", "source_id": "svc", "severity": "ERROR", "text": ""},
{"entry_id": "e2", "source_id": "svc", "severity": "ERROR", "text": " "},
{"entry_id": "e3", "source_id": "svc", "severity": "ERROR", "text": "real error"},
])
resp = client.post(
"/api/corpus/log-batch",
json=payload,
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
assert resp.json()["entries_stored"] == 1
def test_receive_incident_bundle(client):
payload = _batch(batch_type="incident_bundles", entries=[
{"id": "inc-1", "label": "plex crash", "issue_type": "plex",
"started_at": "2026-05-11T09:00:00", "ended_at": "2026-05-11T09:30:00",
"notes": "audio dropped", "created_at": "2026-05-11T09:35:00",
"severity": "high", "text": "plex crash"},
])
resp = client.post(
"/api/corpus/log-batch",
json=payload,
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
assert resp.status_code == 200
assert resp.json()["entries_stored"] == 1
# ── Labeling endpoints ─────────────────────────────────────────────────────────
def test_label_entry(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={
"failure_type": "software",
"plain_explanation": "Sonarr lost connection to its indexer — restart the service.",
"known_pattern": "y",
})
assert resp.status_code == 200
assert resp.json()["labeled"] is True
entries = client.get("/api/corpus/entries", params={"state": "labeled"}).json()["entries"]
assert len(entries) == 1
assert entries[0]["failure_type"] == "software"
def test_label_entry_invalid_failure_type(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={"failure_type": "aliens"})
assert resp.status_code == 422
def test_label_entry_missing_failure_type(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={})
assert resp.status_code == 422
def test_label_entry_not_found(client):
resp = client.post("/api/corpus/entries/nonexistent/label", json={"failure_type": "software"})
assert resp.status_code == 404
def test_skip_entry(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
resp = client.post(f"/api/corpus/entries/{entry_id}/skip")
assert resp.status_code == 200
unlabeled = client.get("/api/corpus/entries").json()["entries"]
assert len(unlabeled) == 0
# ── Stats ──────────────────────────────────────────────────────────────────────
def test_stats_empty(client):
stats = client.get("/api/corpus/stats").json()
assert stats["total_entries"] == 0
assert stats["batch_count"] == 0
def test_stats_after_receive(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
stats = client.get("/api/corpus/stats").json()
assert stats["total_entries"] == 1
assert stats["batch_count"] == 1
assert stats["by_label_state"].get("unlabeled", 0) == 1
# ── Export ─────────────────────────────────────────────────────────────────────
def test_export_excludes_unlabeled(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
resp = client.get("/api/corpus/export")
assert resp.status_code == 200
assert resp.text.strip() == ""
def test_export_includes_labeled(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
client.post(f"/api/corpus/entries/{entry_id}/label", json={
"failure_type": "software",
"plain_explanation": "Sonarr lost connection to indexer.",
})
resp = client.get("/api/corpus/export")
assert resp.status_code == 200
lines = [l for l in resp.text.strip().splitlines() if l]
assert len(lines) == 1
record = json.loads(lines[0])
assert record["output"] == "Sonarr lost connection to indexer."
assert record["metadata"]["failure_type"] == "software"
def test_export_excludes_pii_flagged(client):
client.post(
"/api/corpus/log-batch",
json=_batch(),
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
)
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
client.post(f"/api/corpus/entries/{entry_id}/label", json={
"failure_type": "software",
"plain_explanation": "Contains username — should not export.",
"pii_flagged": True,
})
resp = client.get("/api/corpus/export")
assert resp.text.strip() == ""

View file

@ -223,9 +223,8 @@ const dataItems: NavItem[] = [
] ]
const evalItems: NavItem[] = [ const evalItems: NavItem[] = [
{ path: '/eval/benchmark', icon: '📊', label: 'Benchmark' }, { path: '/eval/benchmark', icon: '📊', label: 'Benchmark' },
{ path: '/eval/compare', icon: '🔍', label: 'Compare' }, { path: '/eval/compare', icon: '🔍', label: 'Compare' },
{ path: '/eval/embed-compare', icon: '🧮', label: 'Embed Compare' },
] ]
const trainItems: NavItem[] = [ const trainItems: NavItem[] = [

View file

@ -30,7 +30,6 @@ export const routes = [
// ── Eval domain ────────────────────────────────────────── // ── Eval domain ──────────────────────────────────────────
{ path: '/eval/benchmark', component: BenchmarkView, meta: { title: 'Benchmark' } }, { path: '/eval/benchmark', component: BenchmarkView, meta: { title: 'Benchmark' } },
{ path: '/eval/compare', component: CompareView, meta: { title: 'Compare' } }, { path: '/eval/compare', component: CompareView, meta: { title: 'Compare' } },
{ path: '/eval/embed-compare', component: () => import('../views/EmbedCompareView.vue'), meta: { title: 'Embed Compare' } },
// ── Train domain ───────────────────────────────────────── // ── Train domain ─────────────────────────────────────────
{ path: '/train/jobs', component: TrainJobsView, meta: { title: 'Training Jobs' } }, { path: '/train/jobs', component: TrainJobsView, meta: { title: 'Training Jobs' } },

View file

@ -325,7 +325,7 @@ function toggleCategory(models: AvailableModel[], checked: boolean) {
async function loadModelCategories() { async function loadModelCategories() {
modelsLoading.value = true modelsLoading.value = true
const { data } = await useApiFetch<ModelCategoriesResponse>('/api/cforch/models') const { data } = await useApiFetch<ModelCategoriesResponse>('/api/benchmark/models')
modelsLoading.value = false modelsLoading.value = false
if (data?.categories) { if (data?.categories) {
modelCategories.value = data.categories modelCategories.value = data.categories
@ -342,7 +342,7 @@ const modelCount = computed(() => modelNames.value.length)
const labelNames = computed(() => { const labelNames = computed(() => {
const canonical = Object.keys(LABEL_META) const canonical = Object.keys(LABEL_META)
const inResults = new Set( const inResults = new Set(
modelNames.value.flatMap(n => Object.keys(results.value?.models[n]?.per_label ?? {})) modelNames.value.flatMap(n => Object.keys(results.value!.models[n].per_label))
) )
return [...canonical.filter(l => inResults.has(l)), ...[...inResults].filter(l => !canonical.includes(l))] return [...canonical.filter(l => inResults.has(l)), ...[...inResults].filter(l => !canonical.includes(l))]
}) })
@ -401,16 +401,16 @@ function formatDate(iso: string | null): string {
// Data loading // Data loading
async function loadResults() { async function loadResults() {
loading.value = true loading.value = true
const { data } = await useApiFetch<BenchResults>('/api/cforch/results') const { data } = await useApiFetch<BenchResults>('/api/benchmark/results')
loading.value = false loading.value = false
if (data?.models && Object.keys(data.models).length > 0) { if (data && Object.keys(data.models).length > 0) {
results.value = data results.value = data
} }
} }
async function loadFineTunedModels() { async function loadFineTunedModels() {
const { data } = await useApiFetch<{ results: FineTunedModel[] }>('/api/train/results') const { data } = await useApiFetch<FineTunedModel[]>('/api/finetune/status')
if (Array.isArray(data?.results)) fineTunedModels.value = data.results if (Array.isArray(data)) fineTunedModels.value = data
} }
// Benchmark run // Benchmark run
@ -428,7 +428,7 @@ function startBenchmark() {
params.set('model_names', [...selectedModels.value].join(',')) params.set('model_names', [...selectedModels.value].join(','))
} }
const qs = params.toString() const qs = params.toString()
const url = `/api/cforch/run${qs ? `?${qs}` : ''}` const url = `/api/benchmark/run${qs ? `?${qs}` : ''}`
useApiSSE( useApiSSE(
url, url,
async (event) => { async (event) => {
@ -457,7 +457,7 @@ function startBenchmark() {
} }
async function cancelBenchmark() { async function cancelBenchmark() {
await fetch('/api/cforch/cancel', { method: 'POST' }).catch(() => {}) await fetch('/api/benchmark/cancel', { method: 'POST' }).catch(() => {})
} }
// Fine-tune // Fine-tune

View file

@ -71,35 +71,32 @@
rows="6" rows="6"
/> />
<!-- LLM model picker (ollama + vllm + cf-text) --> <!-- Ollama model picker -->
<details class="model-picker" open> <details class="model-picker" open>
<summary class="picker-summary"> <summary class="picker-summary">
<span class="picker-title">🤖 LLM Models</span> <span class="picker-title">🤖 Ollama Models</span>
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ llmSelectableModels.length }}</span> <span class="picker-badge">{{ cmpSelectedModels.size }} / {{ ollamaLlmModels.length }}</span>
</summary> </summary>
<div class="picker-body"> <div class="picker-body">
<label class="picker-cat-header"> <label class="picker-cat-header">
<input <input
type="checkbox" type="checkbox"
:checked="cmpSelectedModels.size === llmSelectableModels.length" :checked="cmpSelectedModels.size === ollamaLlmModels.length"
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < llmSelectableModels.length" :indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < ollamaLlmModels.length"
@change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)" @change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
/> />
<span class="picker-cat-name">All LLM models</span> <span class="picker-cat-name">All ollama models</span>
</label> </label>
<div v-for="(models, service) in llmModelsByService" :key="service" class="picker-category"> <div class="picker-model-list">
<span class="picker-cat-section">{{ service }}</span> <label v-for="m in ollamaLlmModels" :key="m.id" class="picker-model-row">
<div class="picker-model-list"> <input
<label v-for="m in models" :key="m.id" class="picker-model-row"> type="checkbox"
<input :checked="cmpSelectedModels.has(m.id)"
type="checkbox" @change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
:checked="cmpSelectedModels.has(m.id)" />
@change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)" <span class="picker-model-name">{{ m.name }}</span>
/> <span class="picker-adapter-type">{{ m.tags.slice(0, 3).join(', ') }}</span>
<span class="picker-model-name">{{ m.name }}</span> </label>
<span class="picker-adapter-type">{{ m.tags.slice(0, 2).join(', ') }}</span>
</label>
</div>
</div> </div>
</div> </div>
</details> </details>
@ -235,22 +232,10 @@ const cmpResults = ref<CmpResult[]>([])
const cmpEventSource = ref<EventSource | null>(null) const cmpEventSource = ref<EventSource | null>(null)
// Computed // Computed
const LLM_SERVICES = new Set(['ollama', 'vllm', 'cf-text']) const ollamaLlmModels = computed(() =>
llmModels.value.filter(m => m.service === 'ollama')
const llmSelectableModels = computed(() =>
llmModels.value.filter(m => LLM_SERVICES.has(m.service))
) )
/** Group selectable models by service for the picker UI */
const llmModelsByService = computed((): Record<string, CfOrchModel[]> => {
const groups: Record<string, CfOrchModel[]> = {}
for (const m of llmSelectableModels.value) {
if (!groups[m.service]) groups[m.service] = []
groups[m.service].push(m)
}
return groups
})
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => { const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
const groups: Record<string, CfOrchTask[]> = {} const groups: Record<string, CfOrchTask[]> = {}
for (const t of llmTasks.value) { for (const t of llmTasks.value) {
@ -285,7 +270,7 @@ function toggleCmpModel(id: string, checked: boolean) {
function toggleAllCmpModels(checked: boolean) { function toggleAllCmpModels(checked: boolean) {
cmpSelectedModels.value = checked cmpSelectedModels.value = checked
? new Set(llmSelectableModels.value.map(m => m.id)) ? new Set(ollamaLlmModels.value.map(m => m.id))
: new Set() : new Set()
} }
@ -303,8 +288,9 @@ async function loadLlmModels() {
const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models') const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
if (data?.models) { if (data?.models) {
llmModels.value = data.models llmModels.value = data.models
// Pre-select all ollama models
cmpSelectedModels.value = new Set( cmpSelectedModels.value = new Set(
data.models.filter(m => LLM_SERVICES.has(m.service)).map(m => m.id) data.models.filter(m => m.service === 'ollama').map(m => m.id)
) )
} }
} }

View file

@ -28,6 +28,9 @@
<span class="metric-label"> labeled since last eval</span> <span class="metric-label"> labeled since last eval</span>
</p> </p>
</div> </div>
<div v-if="data.signals.data_to_eval" class="card-cta">
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
</div>
</div> </div>
<!-- Eval card --> <!-- Eval card -->
@ -37,28 +40,18 @@
<h2 class="card-title">Eval</h2> <h2 class="card-title">Eval</h2>
</div> </div>
<div class="card-body"> <div class="card-body">
<div class="bench-run-table"> <p class="card-metric">
<div <span class="metric-label">Last run: </span>
v-for="(run, type) in data.recent_bench_runs" <strong class="metric-value">{{ formattedEvalTime }}</strong>
:key="type" </p>
class="bench-run-row" <p v-if="data.last_eval_best_score != null" class="card-metric">
> <span class="metric-label">Best score: </span>
<span class="bench-type-label">{{ BENCH_LABELS[type as BenchType] ?? type }}</span> <strong class="metric-value">{{ formatScore(data.last_eval_best_score) }}</strong>
<span class="bench-run-time" :class="{ 'metric-muted': !run.timestamp }"> </p>
{{ run.timestamp ? formatBenchTs(run.timestamp) : '—' }}
</span>
<span v-if="run.score != null" class="bench-run-score">
{{ formatScore(run.score) }}
</span>
</div>
</div>
</div> </div>
<div v-if="data.signals.eval_to_train" class="card-cta"> <div v-if="data.signals.eval_to_train" class="card-cta">
<RouterLink to="/train/jobs" class="cta-btn">Queue Finetune</RouterLink> <RouterLink to="/train/jobs" class="cta-btn">Queue Finetune</RouterLink>
</div> </div>
<div v-if="data.signals.data_to_eval" class="card-cta">
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
</div>
</div> </div>
<!-- Train card --> <!-- Train card -->
@ -111,49 +104,33 @@ interface DashboardSignals {
train_to_fleet: boolean train_to_fleet: boolean
} }
interface BenchRun {
timestamp: string | null
metric: string | null
score: number | null
}
type BenchType = 'classifier' | 'llm' | 'style' | 'plans'
interface DashboardData { interface DashboardData {
labeled_since_last_eval: number labeled_since_last_eval: number
last_eval_timestamp: string | null last_eval_timestamp: string | null
last_eval_best_score: number | null last_eval_best_score: number | null
active_jobs: ActiveJob[] active_jobs: ActiveJob[]
corrections_export_ready: number corrections_export_ready: number
recent_bench_runs: Record<BenchType, BenchRun>
signals: DashboardSignals signals: DashboardSignals
} }
const BENCH_LABELS: Record<BenchType, string> = {
classifier: 'Classifier',
llm: 'LLM Eval',
style: 'Style',
plans: 'Planning',
}
const data = ref<DashboardData | null>(null) const data = ref<DashboardData | null>(null)
const loading = ref(false) const loading = ref(false)
const error = ref<string | null>(null) const error = ref<string | null>(null)
function formatBenchTs(ts: string): string { const formattedEvalTime = computed(() => {
const date = new Date(ts) if (!data.value?.last_eval_timestamp) return 'Never'
if (!isNaN(date.getTime())) { const date = new Date(data.value.last_eval_timestamp)
const diff = Date.now() - date.getTime() if (isNaN(date.getTime())) return 'Unknown'
const mins = Math.floor(diff / 60000) const now = Date.now()
if (mins < 1) return 'just now' const diff = now - date.getTime()
if (mins < 60) return `${mins}m ago` const mins = Math.floor(diff / 60000)
const hrs = Math.floor(mins / 60) if (mins < 1) return 'just now'
if (hrs < 24) return `${hrs}h ago` if (mins < 60) return `${mins}m ago`
return `${Math.floor(hrs / 24)}d ago` const hrs = Math.floor(mins / 60)
} if (hrs < 24) return `${hrs}h ago`
// Non-ISO: show as-is (plans bench uses "YYYY-MM-DD HH:MM") const days = Math.floor(hrs / 24)
return ts.length > 16 ? ts.slice(0, 16) : ts return `${days}d ago`
} })
function formatScore(score: number): string { function formatScore(score: number): string {
return `${(score * 100).toFixed(1)}%` return `${(score * 100).toFixed(1)}%`
@ -308,42 +285,6 @@ onMounted(() => load())
.cta-btn:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 85%, black); } .cta-btn:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 85%, black); }
/* ── Bench run table ── */
.bench-run-table {
display: flex;
flex-direction: column;
gap: 0.3rem;
}
.bench-run-row {
display: grid;
grid-template-columns: 6rem 1fr auto;
align-items: center;
gap: 0.5rem;
font-size: 0.82rem;
}
.bench-type-label {
font-weight: 600;
color: var(--color-text, #1a2338);
font-size: 0.78rem;
}
.bench-run-time {
color: var(--color-text-secondary, #6b7a99);
font-size: 0.78rem;
}
.bench-run-score {
font-family: var(--font-mono, monospace);
font-size: 0.75rem;
font-weight: 600;
color: var(--app-primary, #2A6080);
background: color-mix(in srgb, var(--app-primary, #2A6080) 10%, transparent);
padding: 0.1rem 0.35rem;
border-radius: 0.25rem;
}
/* ── Job pills ── */ /* ── Job pills ── */
.job-row { .job-row {
display: flex; display: flex;

View file

@ -1,705 +0,0 @@
<template>
<div class="embed-compare-page">
<!-- Step indicator (non-interactive) -->
<ol class="step-indicator" aria-label="Setup progress">
<li :class="{ complete: corpus.length > 0 }">Corpus</li>
<li :class="{ complete: queries.length > 0 }">Queries</li>
<li :class="{ complete: selectedModels.length > 0 }">Models</li>
<li :class="{ complete: hasResults }">Run &amp; Rate</li>
</ol>
<!-- Persistent aria-live region always in DOM, never v-if -->
<div
ref="liveRegion"
class="sr-live"
aria-live="polite"
aria-atomic="true"
v-text="liveMessage"
/>
<!-- Corpus section -->
<section class="card" aria-labelledby="corpus-heading">
<h2 id="corpus-heading"> Corpus</h2>
<div class="corpus-controls">
<div class="field">
<label for="corpus-paste">Paste chunks (one per line)</label>
<textarea
id="corpus-paste"
v-model="rawCorpus"
rows="6"
placeholder="Paste one chunk per line, or use Import below..."
@change="onCorpusPaste"
/>
</div>
<div class="import-row">
<label for="imitate-product-select">Import from product</label>
<select id="imitate-product-select" v-model="selectedProduct">
<option value="">-- select product --</option>
<option
v-for="p in imitateProducts"
:key="p.id"
:value="p.id"
>{{ p.name }}</option>
</select>
<button
class="btn-secondary"
:disabled="!selectedProduct || importing"
@click="importCorpus"
>
{{ importing ? 'Importing…' : 'Import' }}
</button>
<span v-if="importError" class="error-text" role="alert">{{ importError }}</span>
</div>
<p v-if="corpus.length > 0" class="corpus-count">
{{ corpus.length }} chunk{{ corpus.length === 1 ? '' : 's' }} loaded.
</p>
</div>
</section>
<!-- Queries section -->
<section class="card" aria-labelledby="queries-heading">
<h2 id="queries-heading"> Queries</h2>
<div class="field">
<label for="query-input">Enter queries (one per line)</label>
<textarea
id="query-input"
v-model="rawQueries"
rows="4"
placeholder="One query per line..."
@change="onQueriesChange"
/>
</div>
<p v-if="queries.length > 0" class="query-count">
{{ queries.length }} quer{{ queries.length === 1 ? 'y' : 'ies' }}.
</p>
</section>
<!-- Model selection -->
<section class="card" aria-labelledby="models-heading">
<h2 id="models-heading"> Models</h2>
<p v-if="loadingModels" class="muted">Loading models from Ollama</p>
<p v-else-if="modelsError" class="error-text" role="alert">{{ modelsError }}</p>
<ul v-else class="model-list" role="list">
<li v-for="m in availableModels" :key="m.name">
<label class="model-checkbox">
<input
type="checkbox"
:value="m.name"
v-model="selectedModels"
/>
{{ m.name }}
<span class="model-size muted" aria-label="model size">
{{ formatBytes(m.size) }}
</span>
</label>
</li>
</ul>
<p v-if="availableModels.length === 0 && !loadingModels && !modelsError" class="muted">
No Ollama models found. Pull an embedding model first.
</p>
</section>
<!-- Run controls -->
<section class="card run-controls" aria-labelledby="run-heading">
<h2 id="run-heading"> Run</h2>
<div class="run-row">
<div class="field-inline">
<label for="top-k-input">Results per query</label>
<input
id="top-k-input"
type="number"
v-model.number="topK"
min="1"
max="20"
style="width: 5rem"
/>
</div>
<button
class="btn-primary"
:disabled="!canRun || running"
@click="startRun"
>
{{ running ? 'Running…' : 'Run' }}
</button>
<button
v-if="running"
class="btn-danger"
aria-label="Cancel embedding run"
@click="cancelRun"
>
Cancel
</button>
</div>
<p v-if="!canRun && !running" class="muted">
Fill corpus, at least one query, and select at least one model to run.
</p>
</section>
<!-- Results -->
<section
v-if="hasResults"
class="card results-section"
aria-labelledby="results-heading"
>
<h2 id="results-heading">Results</h2>
<!-- Query pagination -->
<div class="query-nav" role="navigation" aria-label="Query navigation">
<button
class="btn-secondary"
aria-label="Previous query"
:disabled="currentQueryIdx === 0"
@click="currentQueryIdx--"
></button>
<span class="query-counter">
Query {{ currentQueryIdx + 1 }} of {{ uniqueQueries.length }}:
<em>{{ uniqueQueries[currentQueryIdx] }}</em>
</span>
<button
class="btn-secondary"
aria-label="Next query"
:disabled="currentQueryIdx >= uniqueQueries.length - 1"
@click="currentQueryIdx++"
></button>
</div>
<!-- Results table: one column per model -->
<div class="table-wrap">
<table class="results-table">
<thead>
<tr>
<th scope="col" class="rank-col">#</th>
<th
v-for="model in selectedModels"
:key="model"
scope="col"
>{{ model }}</th>
</tr>
</thead>
<tbody>
<tr v-for="rank in topK" :key="rank">
<td class="rank-col muted">{{ rank }}</td>
<td
v-for="model in selectedModels"
:key="model"
class="hit-cell"
>
<template v-if="getHit(currentQueryIdx, model, rank - 1) as hit">
<div class="hit-text">{{ hit.text }}</div>
<!-- Visual score bar: decorative only -->
<div class="score-row">
<div class="score-bar-wrap" aria-hidden="true">
<div class="score-bar" :style="{ width: `${hit.score * 100}%` }" />
</div>
<span class="score-label">{{ hit.score.toFixed(3) }}</span>
</div>
<!-- Rating buttons -->
<div class="rating-row">
<button
class="rate-btn"
:class="{ active: getRating(currentQueryIdx, model, hit.chunk_idx) === 'relevant' }"
:aria-pressed="getRating(currentQueryIdx, model, hit.chunk_idx) === 'relevant'"
aria-label="Mark as relevant"
@click="rate(currentQueryIdx, model, hit, 'relevant')"
>
👍 Relevant
</button>
<button
class="rate-btn rate-btn-neg"
:class="{ active: getRating(currentQueryIdx, model, hit.chunk_idx) === 'not_relevant' }"
:aria-pressed="getRating(currentQueryIdx, model, hit.chunk_idx) === 'not_relevant'"
aria-label="Mark as not relevant"
@click="rate(currentQueryIdx, model, hit, 'not_relevant')"
>
👎 Not relevant
</button>
</div>
</template>
<span v-else class="muted"></span>
</td>
</tr>
</tbody>
</table>
</div>
</section>
<!-- Export -->
<section
v-if="hasResults"
class="card export-section"
aria-labelledby="export-heading"
>
<h2 id="export-heading">Export Ratings</h2>
<div class="export-row">
<fieldset class="export-format-group">
<legend>Format</legend>
<label><input type="radio" v-model="exportFormat" value="csv" /> CSV</label>
<label><input type="radio" v-model="exportFormat" value="json" /> JSON</label>
</fieldset>
<button class="btn-secondary" @click="exportRatings">Export</button>
</div>
</section>
</div>
</template>
<script setup lang="ts">
import { ref, computed, onMounted } from 'vue'
// Types
interface OllamaModel { name: string; size: number }
interface ImitateProduct { id: string; name: string }
interface HitResult { chunk_idx: number; text: string; score: number }
interface ResultEvent {
type: 'result'
query_idx: number
query: string
model: string
hits: HitResult[]
}
// State
const rawCorpus = ref('')
const corpus = ref<string[]>([])
const rawQueries = ref('')
const queries = ref<string[]>([])
const selectedModels = ref<string[]>([])
const topK = ref(5)
const availableModels = ref<OllamaModel[]>([])
const loadingModels = ref(false)
const modelsError = ref('')
const imitateProducts = ref<ImitateProduct[]>([])
const selectedProduct = ref('')
const importing = ref(false)
const importError = ref('')
const running = ref(false)
const liveMessage = ref('')
const resultEvents = ref<ResultEvent[]>([])
const runController = ref<AbortController | null>(null)
const currentQueryIdx = ref(0)
const exportFormat = ref<'csv' | 'json'>('csv')
type RatingMap = Record<string, Record<string, Record<number, 'relevant' | 'not_relevant'>>>
const ratings = ref<RatingMap>({})
const uniqueQueries = computed(() => {
const seen = new Set<string>()
const out: string[] = []
for (const e of resultEvents.value) {
if (!seen.has(e.query)) { seen.add(e.query); out.push(e.query) }
}
return out
})
const hasResults = computed(() => resultEvents.value.length > 0)
const canRun = computed(
() => corpus.value.length > 0 && queries.value.length > 0 && selectedModels.value.length > 0
)
// Corpus helpers
function onCorpusPaste() {
const chunks = rawCorpus.value.split('\n').map(l => l.trim()).filter(Boolean)
corpus.value = chunks
if (chunks.length > 0) {
liveMessage.value = `${chunks.length} chunk${chunks.length === 1 ? '' : 's'} loaded.`
}
}
function onQueriesChange() {
queries.value = rawQueries.value.split('\n').map(l => l.trim()).filter(Boolean)
}
async function importCorpus() {
if (!selectedProduct.value) return
importing.value = true
importError.value = ''
try {
const r = await fetch(`/api/imitate/products/${selectedProduct.value}/sample-chunks`)
if (!r.ok) {
const text = await r.text()
throw new Error(text || `HTTP ${r.status}`)
}
const data = await r.json() as { chunks?: string[] }
const chunks = data.chunks ?? []
corpus.value = chunks
rawCorpus.value = chunks.join('\n')
liveMessage.value = `${chunks.length} chunk${chunks.length === 1 ? '' : 's'} loaded from import.`
} catch (err) {
importError.value = String(err)
} finally {
importing.value = false
}
}
// Model loading
async function loadModels() {
loadingModels.value = true
modelsError.value = ''
try {
const r = await fetch('/api/embed-bench/models')
if (!r.ok) throw new Error(`HTTP ${r.status}`)
const data = await r.json() as { models: OllamaModel[] }
availableModels.value = data.models
} catch (err) {
modelsError.value = `Failed to load models: ${err}`
} finally {
loadingModels.value = false
}
}
// Run
async function startRun() {
if (!canRun.value) return
running.value = true
resultEvents.value = []
liveMessage.value = 'Starting embedding run…'
runController.value = new AbortController()
try {
const resp = await fetch('/api/embed-bench/run', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
corpus: corpus.value,
queries: queries.value,
models: selectedModels.value,
top_k: topK.value,
}),
signal: runController.value.signal,
})
const reader = resp.body!.getReader()
const decoder = new TextDecoder()
let buf = ''
while (true) {
const { done, value } = await reader.read()
if (done) break
buf += decoder.decode(value, { stream: true })
const lines = buf.split('\n')
buf = lines.pop() ?? ''
for (const line of lines) {
if (!line.startsWith('data: ')) continue
const event = JSON.parse(line.slice(6))
if (event.type === 'progress') {
liveMessage.value = event.msg
} else if (event.type === 'result') {
resultEvents.value.push(event as ResultEvent)
} else if (event.type === 'done') {
liveMessage.value = 'Run complete.'
} else if (event.type === 'error') {
liveMessage.value = `Error: ${event.msg}`
}
}
}
} catch (err) {
if ((err as Error).name !== 'AbortError') {
liveMessage.value = `Run failed: ${err}`
}
} finally {
running.value = false
runController.value = null
}
}
function cancelRun() {
runController.value?.abort()
liveMessage.value = 'Run cancelled.'
}
// Utilities
function formatBytes(bytes: number): string {
if (bytes < 1_000_000) return `${(bytes / 1000).toFixed(0)} KB`
if (bytes < 1_000_000_000) return `${(bytes / 1_000_000).toFixed(0)} MB`
return `${(bytes / 1_000_000_000).toFixed(1)} GB`
}
function getHit(queryIdx: number, model: string, rank: number): HitResult | null {
const query = uniqueQueries.value[queryIdx]
if (!query) return null
const ev = resultEvents.value.find(e => e.query === query && e.model === model)
return ev?.hits[rank] ?? null
}
function getRating(queryIdx: number, model: string, chunkIdx: number): string | undefined {
const query = uniqueQueries.value[queryIdx]
return ratings.value[query]?.[model]?.[chunkIdx]
}
async function rate(
queryIdx: number,
model: string,
hit: HitResult,
rating: 'relevant' | 'not_relevant',
) {
const query = uniqueQueries.value[queryIdx]
// Optimistic update
if (!ratings.value[query]) ratings.value[query] = {}
if (!ratings.value[query][model]) ratings.value[query][model] = {}
ratings.value[query][model][hit.chunk_idx] = rating
try {
await fetch('/api/embed-bench/rate', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
query,
model,
chunk_text: hit.text,
chunk_idx: hit.chunk_idx,
rating,
}),
})
liveMessage.value = `Rated chunk ${hit.chunk_idx + 1} as ${rating}.`
} catch (err) {
liveMessage.value = `Rating failed: ${err}`
}
}
async function exportRatings() {
const r = await fetch(`/api/embed-bench/export?format=${exportFormat.value}`)
if (!r.ok) {
liveMessage.value = `Export failed: HTTP ${r.status}`
return
}
const blob = await r.blob()
const disposition = r.headers.get('Content-Disposition') ?? ''
const filenameMatch = disposition.match(/filename="([^"]+)"/)
const filename = filenameMatch ? filenameMatch[1] : `embed_comparison.${exportFormat.value}`
const url = URL.createObjectURL(blob)
const a = document.createElement('a')
a.href = url
a.download = filename
a.click()
URL.revokeObjectURL(url)
liveMessage.value = `Exported ${filename}.`
}
// Lifecycle
onMounted(() => {
loadModels()
})
</script>
<style scoped>
.embed-compare-page {
padding: var(--space-4, 1.5rem);
max-width: 1100px;
}
/* Step indicator */
.step-indicator {
display: flex;
gap: 0;
list-style: none;
margin: 0 0 var(--space-4, 1.5rem);
padding: 0;
border-bottom: 2px solid var(--color-border, #d0d7e8);
}
.step-indicator li {
padding: 0.4rem 1rem;
font-size: 0.8rem;
font-weight: 600;
color: var(--color-text-muted, #4a5c7a);
text-transform: uppercase;
letter-spacing: 0.05em;
border-bottom: 2px solid transparent;
margin-bottom: -2px;
}
.step-indicator li.complete {
color: var(--app-primary, #2A6080);
border-bottom-color: var(--app-primary, #2A6080);
}
/* Accessibility: screen-reader live region — visually hidden but always present */
.sr-live {
position: absolute;
width: 1px; height: 1px;
overflow: hidden;
clip: rect(0 0 0 0);
white-space: nowrap;
}
/* Cards */
.card {
background: var(--color-surface-raised, #e4ebf5);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: var(--radius-md, 0.5rem);
padding: var(--space-4, 1.5rem);
margin-bottom: var(--space-4, 1.5rem);
}
.card h2 {
font-size: 1rem;
font-weight: 700;
margin: 0 0 var(--space-3, 1rem);
color: var(--color-text, #1a2338);
}
.field { display: flex; flex-direction: column; gap: 0.3rem; margin-bottom: 0.75rem; }
.field label { font-size: 0.85rem; font-weight: 600; }
textarea, input[type="number"] {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: var(--radius-sm, 0.25rem);
padding: 0.5rem;
font-size: 0.875rem;
background: var(--color-surface, #f0f4fb);
color: var(--color-text, #1a2338);
resize: vertical;
}
.corpus-controls { display: flex; flex-direction: column; gap: 0.5rem; }
.import-row {
display: flex; flex-wrap: wrap; gap: 0.5rem; align-items: center;
}
.import-row label { font-size: 0.85rem; font-weight: 600; }
.corpus-count, .query-count { font-size: 0.875rem; color: var(--app-primary, #2A6080); margin: 0; }
.model-list { list-style: none; padding: 0; margin: 0; display: flex; flex-wrap: wrap; gap: 0.5rem; }
.model-checkbox {
display: flex; align-items: center; gap: 0.4rem;
font-size: 0.875rem; cursor: pointer;
padding: 0.3rem 0.6rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: var(--radius-sm, 0.25rem);
background: var(--color-surface, #f0f4fb);
}
.model-size { font-size: 0.75rem; }
.run-row { display: flex; flex-wrap: wrap; gap: 0.75rem; align-items: flex-end; }
.field-inline { display: flex; align-items: center; gap: 0.4rem; }
.field-inline label { font-size: 0.85rem; font-weight: 600; white-space: nowrap; }
.btn-primary, .btn-secondary, .btn-danger {
padding: 0.4rem 1rem;
border-radius: var(--radius-sm, 0.25rem);
border: 1px solid transparent;
font-size: 0.875rem;
font-weight: 600;
cursor: pointer;
transition: background 0.15s;
}
.btn-primary { background: var(--app-primary, #2A6080); color: #fff; }
.btn-primary:hover:not(:disabled) { filter: brightness(1.1); }
.btn-primary:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-secondary { background: var(--color-surface, #f0f4fb); color: var(--color-text, #1a2338); border-color: var(--color-border, #d0d7e8); }
.btn-secondary:hover:not(:disabled) { background: var(--color-border, #d0d7e8); }
.btn-secondary:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-danger { background: var(--color-error, #c0392b); color: #fff; }
.muted { color: var(--color-text-muted, #4a5c7a); font-size: 0.875rem; }
.error-text { color: var(--color-error, #c0392b); font-size: 0.875rem; }
@media (max-width: 768px) {
.import-row { flex-direction: column; align-items: flex-start; }
.run-row { flex-direction: column; }
.model-list { flex-direction: column; }
}
/* Results table */
.table-wrap { overflow-x: auto; }
.results-table {
width: 100%;
border-collapse: collapse;
font-size: 0.875rem;
}
.results-table thead th {
position: sticky;
top: 0;
background: var(--color-surface-raised, #e4ebf5);
border-bottom: 2px solid var(--color-border, #d0d7e8);
padding: 0.5rem 0.75rem;
text-align: left;
font-weight: 700;
white-space: nowrap;
z-index: 1;
}
.results-table td {
padding: 0.5rem 0.75rem;
vertical-align: top;
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
.rank-col { width: 2rem; text-align: center; }
.hit-text { margin-bottom: 0.25rem; line-height: 1.4; }
.score-row { display: flex; align-items: center; gap: 0.4rem; margin-bottom: 0.25rem; }
.score-bar-wrap {
flex: 1;
height: 6px;
background: var(--color-border, #d0d7e8);
border-radius: 3px;
overflow: hidden;
}
.score-bar {
height: 100%;
background: var(--app-primary, #2A6080);
border-radius: 3px;
transition: width 0.3s ease;
}
.score-label { font-size: 0.75rem; color: var(--color-text-muted, #4a5c7a); min-width: 3rem; text-align: right; }
.rating-row { display: flex; gap: 0.4rem; flex-wrap: wrap; }
.rate-btn {
padding: 0.2rem 0.5rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: var(--radius-sm, 0.25rem);
background: var(--color-surface, #f0f4fb);
color: var(--color-text, #1a2338);
font-size: 0.75rem;
cursor: pointer;
transition: background 0.15s, border-color 0.15s;
}
.rate-btn.active {
background: color-mix(in srgb, var(--app-primary, #2A6080) 20%, transparent);
border-color: var(--app-primary, #2A6080);
font-weight: 700;
}
.rate-btn-neg.active {
background: color-mix(in srgb, var(--color-error, #c0392b) 15%, transparent);
border-color: var(--color-error, #c0392b);
}
/* Query nav */
.query-nav {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.75rem;
flex-wrap: wrap;
}
.query-counter { font-size: 0.875rem; flex: 1; }
/* Export */
.export-row { display: flex; gap: 1rem; align-items: center; flex-wrap: wrap; }
.export-format-group {
border: none;
padding: 0;
display: flex;
gap: 0.75rem;
}
.export-format-group legend {
font-size: 0.85rem;
font-weight: 600;
margin-bottom: 0.25rem;
float: left;
margin-right: 0.5rem;
}
.export-format-group label { font-size: 0.875rem; display: flex; align-items: center; gap: 0.3rem; }
@media (max-width: 768px) {
.results-table thead th,
.results-table td { padding: 0.35rem 0.4rem; font-size: 0.8rem; }
.query-nav { flex-direction: column; align-items: flex-start; }
}
@media (prefers-reduced-motion: reduce) {
.score-bar { transition: none; }
}
</style>

View file

@ -1,7 +0,0 @@
<template>
<EmbedCompareTab />
</template>
<script setup lang="ts">
import EmbedCompareTab from './EmbedCompareTab.vue'
</script>

View file

@ -302,7 +302,7 @@ const llmModelBadge = computed(() => {
const llmTaskTypeCols = computed(() => { const llmTaskTypeCols = computed(() => {
const types = new Set<string>() const types = new Set<string>()
for (const r of llmResults.value) { for (const r of llmResults.value) {
for (const k of Object.keys(r.quality_by_task_type ?? {})) types.add(k) for (const k of Object.keys(r.quality_by_task_type)) types.add(k)
} }
return [...types].sort() return [...types].sort()
}) })
@ -338,7 +338,7 @@ const llmBestByCol = computed((): Record<string, string> => {
for (const col of llmTaskTypeCols.value) { for (const col of llmTaskTypeCols.value) {
bestId = ''; bestVal = -Infinity bestId = ''; bestVal = -Infinity
for (const r of llmResults.value) { for (const r of llmResults.value) {
const v = r.quality_by_task_type?.[col] const v = r.quality_by_task_type[col]
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id } if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
} }
best[col] = bestId best[col] = bestId