Compare commits
No commits in common. "2b990a603aeda90b910c02cc5151ade25f6c0075" and "6f9aad126e921071ddd15b6fef169637c17c35d4" have entirely different histories.
2b990a603a
...
6f9aad126e
19 changed files with 73 additions and 2181 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -8,9 +8,6 @@ __pycache__/
|
|||
config/label_tool.yaml
|
||||
|
||||
# Data files (user-generated, not for version control)
|
||||
data/corpus.db
|
||||
data/corpus.db-wal
|
||||
data/corpus.db-shm
|
||||
data/email_score.jsonl
|
||||
data/email_label_queue.jsonl
|
||||
data/email_compare_sample.jsonl
|
||||
|
|
|
|||
30
app/api.py
30
app/api.py
|
|
@ -40,36 +40,6 @@ app.include_router(plans_bench_router, prefix="/api/plans-bench")
|
|||
# In-memory last-action store (single user, local tool — in-memory is fine)
|
||||
_last_action: dict | None = None
|
||||
|
||||
# -- Backward-compat shims (ClassifierTab still uses old /api/finetune/* paths)
|
||||
# Remove once ClassifierTab fine-tune section is migrated to TrainJobsView.
|
||||
|
||||
from fastapi import Query
|
||||
from fastapi.responses import StreamingResponse as _StreamingResponse
|
||||
|
||||
@app.get("/api/finetune/run")
|
||||
def finetune_run_compat(model: str = Query(...), epochs: int = Query(5)) -> _StreamingResponse:
|
||||
"""Shim: create a classifier train job and immediately stream it."""
|
||||
from app.train.train import create_job, run_job, CreateJobRequest
|
||||
job = create_job(CreateJobRequest(type="classifier", model_key=model, config_json={"epochs": epochs}))
|
||||
return run_job(job["id"])
|
||||
|
||||
@app.post("/api/finetune/cancel")
|
||||
def finetune_cancel_compat() -> dict:
|
||||
"""Shim: cancel the most recent running classifier job."""
|
||||
from app.train.train import _db, _init_db, cancel_job
|
||||
from fastapi import HTTPException
|
||||
_init_db()
|
||||
with _db() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT id FROM jobs WHERE type='classifier' AND status='running' ORDER BY started_at DESC LIMIT 1"
|
||||
).fetchone()
|
||||
if row is None:
|
||||
return {"status": "nothing_running"}
|
||||
return cancel_job(row["id"])
|
||||
|
||||
from app.data.log_corpus import router as log_corpus_router
|
||||
app.include_router(log_corpus_router, prefix="/api/corpus")
|
||||
|
||||
from app.dashboard import router as dashboard_router
|
||||
app.include_router(dashboard_router, prefix="/api")
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ import json
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import select as _select
|
||||
import subprocess as _subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
|
@ -312,12 +311,8 @@ def run_benchmark(
|
|||
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
|
||||
global _BENCH_RUNNING, _bench_proc
|
||||
|
||||
# Check if the process is actually still alive; reset stale flag if not.
|
||||
if _BENCH_RUNNING:
|
||||
if _bench_proc is not None and _bench_proc.poll() is None:
|
||||
raise HTTPException(409, "A benchmark is already running")
|
||||
_BENCH_RUNNING = False
|
||||
_bench_proc = None
|
||||
|
||||
cfg = _load_cforch_config()
|
||||
bench_script = cfg.get("bench_script", "")
|
||||
|
|
@ -441,23 +436,8 @@ def run_benchmark(
|
|||
env=proc_env,
|
||||
)
|
||||
_bench_proc = proc
|
||||
_IDLE_TIMEOUT_S = 120 # kill if no output for 2 minutes (node crash)
|
||||
try:
|
||||
while True:
|
||||
ready = _select.select([proc.stdout], [], [], _IDLE_TIMEOUT_S)
|
||||
if not ready[0]:
|
||||
# No output for IDLE_TIMEOUT_S — node likely crashed
|
||||
proc.terminate()
|
||||
try:
|
||||
proc.wait(timeout=5)
|
||||
except _subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
msg = f"Benchmark timed out — no output for {_IDLE_TIMEOUT_S}s (cluster node may have crashed)"
|
||||
yield f"data: {json.dumps({'type': 'error', 'message': msg})}\n\n"
|
||||
break
|
||||
line = proc.stdout.readline()
|
||||
if not line:
|
||||
break
|
||||
for line in proc.stdout:
|
||||
line = _strip_ansi(line.rstrip())
|
||||
if line:
|
||||
yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
|
||||
|
|
|
|||
109
app/dashboard.py
109
app/dashboard.py
|
|
@ -1,18 +1,17 @@
|
|||
"""Avocet -- dashboard aggregate API.
|
||||
|
||||
GET /api/dashboard returns the current flywheel state:
|
||||
labeled_since_last_eval -- items labeled after the most recent bench run
|
||||
labeled_since_last_eval -- items labeled after the most recent eval run
|
||||
last_eval_timestamp -- ISO timestamp of newest bench_results summary
|
||||
last_eval_best_score -- best macro_f1 from that summary
|
||||
active_jobs -- jobs with status queued or running
|
||||
corrections_pending -- sft_candidates with status=needs_review
|
||||
corrections_export_ready -- approved sft candidates with non-blank correction
|
||||
recent_bench_runs -- most-recent timestamp + score per bench type
|
||||
signals -- computed booleans for UI nudge indicators
|
||||
|
||||
Thresholds in label_tool.yaml pipeline: section:
|
||||
pipeline:
|
||||
data_eval_threshold: 50 # labeled items since last bench to trigger nudge
|
||||
data_eval_threshold: 50 # labeled items since last eval to trigger nudge
|
||||
eval_train_threshold: 0.05 # improvement delta needed before retraining (future)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
|
@ -78,7 +77,7 @@ def _load_score_records() -> list[dict]:
|
|||
pass
|
||||
return records
|
||||
|
||||
def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str | None, float | None]:
|
||||
def _find_latest_eval(results_dir_override: str = "") -> tuple[str | None, float | None]:
|
||||
"""Return (iso_timestamp, best_macro_f1) from the newest bench_results summary.
|
||||
|
||||
Checks results_dir from cforch config if set, then falls back to
|
||||
|
|
@ -108,8 +107,6 @@ def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str |
|
|||
if summary.exists():
|
||||
try:
|
||||
data = json.loads(summary.read_text(encoding="utf-8"))
|
||||
if not isinstance(data, dict):
|
||||
continue # cforch LLM-bench summaries are lists; skip
|
||||
ts = data.get("timestamp") or subdir.name
|
||||
score = data.get("best_macro_f1") or data.get("macro_f1")
|
||||
return ts, (float(score) if isinstance(score, (int, float)) else None)
|
||||
|
|
@ -117,10 +114,6 @@ def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str |
|
|||
logger.warning("Failed to parse summary.json at %s: %s", summary, exc)
|
||||
return None, None
|
||||
|
||||
# Keep old name as alias so existing callers in tests still work.
|
||||
_find_latest_eval = _find_latest_classifier_bench
|
||||
|
||||
|
||||
def _count_corrections() -> tuple[int, int]:
|
||||
"""Return (pending_count, export_ready_count)."""
|
||||
pending = 0
|
||||
|
|
@ -176,106 +169,22 @@ def _count_labeled_since(since_ts: str | None) -> int:
|
|||
return sum(1 for r in records if r.get("labeled_at", "") > since_ts)
|
||||
|
||||
|
||||
def _get_recent_bench_runs() -> dict:
|
||||
"""Return most-recent run summary for each bench type.
|
||||
|
||||
Each entry: {"timestamp": str|None, "metric": str|None, "score": float|None}
|
||||
"""
|
||||
runs: dict[str, dict] = {
|
||||
"classifier": {"timestamp": None, "metric": "macro_f1", "score": None},
|
||||
"llm": {"timestamp": None, "metric": None, "score": None},
|
||||
"style": {"timestamp": None, "metric": None, "score": None},
|
||||
"plans": {"timestamp": None, "metric": "avg_score", "score": None},
|
||||
}
|
||||
|
||||
# ── Classifier: bench_results/<run>/summary.json ──────────────────────
|
||||
clf_ts, clf_score = _find_latest_classifier_bench()
|
||||
if clf_ts:
|
||||
runs["classifier"]["timestamp"] = clf_ts
|
||||
runs["classifier"]["score"] = clf_score
|
||||
|
||||
# ── LLM bench + Style: benchmark_results/ ─────────────────────────────
|
||||
f = _config_file()
|
||||
bench_dir: Path | None = None
|
||||
if f.exists():
|
||||
try:
|
||||
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
|
||||
rd = (raw.get("cforch", {}) or {}).get("results_dir", "")
|
||||
if rd:
|
||||
bench_dir = Path(rd)
|
||||
except Exception:
|
||||
pass
|
||||
if bench_dir is None:
|
||||
bench_dir = _ROOT / "benchmark_results"
|
||||
|
||||
if bench_dir.exists():
|
||||
llm_files = sorted(
|
||||
[p for p in bench_dir.glob("*.json") if not p.name.startswith("style_")],
|
||||
key=lambda p: p.stat().st_mtime, reverse=True,
|
||||
)
|
||||
if llm_files:
|
||||
try:
|
||||
data = json.loads(llm_files[0].read_text(encoding="utf-8"))
|
||||
runs["llm"]["timestamp"] = data.get("timestamp") or llm_files[0].stem
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
style_files = sorted(bench_dir.glob("style_*.json"), reverse=True)
|
||||
if style_files:
|
||||
try:
|
||||
data = json.loads(style_files[0].read_text(encoding="utf-8"))
|
||||
if isinstance(data, list) and data:
|
||||
runs["style"]["timestamp"] = data[0].get("timestamp") or style_files[0].stem
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ── Plans bench: data/plans_bench_results/plans_*.json ────────────────
|
||||
plans_dir = _DATA_DIR / "plans_bench_results"
|
||||
if plans_dir.exists():
|
||||
plans_files = sorted(plans_dir.glob("plans_*.json"), reverse=True)
|
||||
if plans_files:
|
||||
run_id = plans_files[0].stem
|
||||
try:
|
||||
d: dict = json.loads(plans_files[0].read_text(encoding="utf-8"))
|
||||
all_scores = [
|
||||
r["total_score"]
|
||||
for results in d.values()
|
||||
for r in results
|
||||
if isinstance(r, dict) and not r.get("error")
|
||||
]
|
||||
avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else None
|
||||
try:
|
||||
date_part = run_id.removeprefix("plans_")
|
||||
date, time_part = date_part.split("_")
|
||||
ts_display = f"{date} {time_part[:2]}:{time_part[2:4]}"
|
||||
except Exception:
|
||||
ts_display = run_id
|
||||
runs["plans"]["timestamp"] = ts_display
|
||||
runs["plans"]["score"] = avg
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return runs
|
||||
|
||||
|
||||
@router.get("/dashboard")
|
||||
def get_dashboard() -> dict:
|
||||
data_threshold, _train_threshold = _load_thresholds()
|
||||
last_ts, last_score = _find_latest_classifier_bench()
|
||||
labeled_since = _count_labeled_since(last_ts)
|
||||
data_eval_threshold, eval_train_threshold = _load_thresholds()
|
||||
last_eval_ts, last_eval_score = _find_latest_eval()
|
||||
labeled_since = _count_labeled_since(last_eval_ts)
|
||||
corrections_pending, corrections_export_ready = _count_corrections()
|
||||
active_jobs = _get_active_jobs()
|
||||
recent_bench = _get_recent_bench_runs()
|
||||
return {
|
||||
"labeled_since_last_eval": labeled_since,
|
||||
"last_eval_timestamp": last_ts,
|
||||
"last_eval_best_score": last_score,
|
||||
"last_eval_timestamp": last_eval_ts,
|
||||
"last_eval_best_score": last_eval_score,
|
||||
"active_jobs": active_jobs,
|
||||
"corrections_pending": corrections_pending,
|
||||
"corrections_export_ready": corrections_export_ready,
|
||||
"recent_bench_runs": recent_bench,
|
||||
"signals": {
|
||||
"data_to_eval": labeled_since >= data_threshold,
|
||||
"data_to_eval": labeled_since >= data_eval_threshold,
|
||||
"eval_to_train": False, # future: implement delta-F1 comparison
|
||||
"train_to_fleet": False, # future: implement fleet sync signal
|
||||
},
|
||||
|
|
|
|||
|
|
@ -1,352 +0,0 @@
|
|||
"""Avocet — Log Corpus receiver and labeling API.
|
||||
|
||||
Receives push batches from consented Turnstone nodes, stores entries for labeling,
|
||||
and exports labeled data as JSONL for the logreading fine-tune pipeline.
|
||||
|
||||
DB: data/corpus.db (separate from train_jobs.db — different lifecycle)
|
||||
Auth: Bearer token validated against corpus_sources table (seeded from label_tool.yaml).
|
||||
|
||||
All endpoints registered on `router`. api.py includes this with prefix="/api/corpus".
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import uuid
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
|
||||
import yaml
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.requests import Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
_CONFIG_DIR: Path | None = None
|
||||
_DATA_DIR: Path = _ROOT / "data"
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_DB_PATH: Path = _ROOT / "data" / "corpus.db"
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS corpus_sources (
|
||||
token TEXT PRIMARY KEY,
|
||||
source_host TEXT NOT NULL,
|
||||
owner TEXT NOT NULL,
|
||||
consent_date TEXT NOT NULL,
|
||||
consent_method TEXT NOT NULL,
|
||||
active INTEGER NOT NULL DEFAULT 1
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS corpus_batches (
|
||||
id TEXT PRIMARY KEY,
|
||||
source_host TEXT NOT NULL,
|
||||
batch_type TEXT NOT NULL,
|
||||
received_at TEXT NOT NULL,
|
||||
entry_count INTEGER NOT NULL,
|
||||
watermark_from TEXT,
|
||||
watermark_to TEXT,
|
||||
raw_json TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS corpus_entries (
|
||||
id TEXT PRIMARY KEY,
|
||||
batch_id TEXT NOT NULL REFERENCES corpus_batches(id),
|
||||
source_host TEXT NOT NULL,
|
||||
origin_entry_id TEXT,
|
||||
timestamp_iso TEXT,
|
||||
severity TEXT,
|
||||
source_id TEXT,
|
||||
text TEXT NOT NULL,
|
||||
matched_patterns TEXT DEFAULT '[]',
|
||||
label_state TEXT NOT NULL DEFAULT 'unlabeled',
|
||||
failure_type TEXT,
|
||||
plain_explanation TEXT,
|
||||
known_pattern TEXT,
|
||||
labeled_at TEXT,
|
||||
labeled_by TEXT DEFAULT 'alan',
|
||||
pii_flagged INTEGER NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
|
||||
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
|
||||
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
|
||||
"""
|
||||
|
||||
|
||||
# ── Testability seams ──────────────────────────────────────────────────────────
|
||||
|
||||
def set_config_dir(path: Path | None) -> None:
|
||||
global _CONFIG_DIR
|
||||
_CONFIG_DIR = path
|
||||
|
||||
|
||||
def set_data_dir(path: Path) -> None:
|
||||
global _DATA_DIR, _DB_PATH
|
||||
_DATA_DIR = path
|
||||
_DB_PATH = path / "corpus.db"
|
||||
|
||||
|
||||
# ── Internal helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
def _config_file() -> Path:
|
||||
if _CONFIG_DIR is not None:
|
||||
return _CONFIG_DIR / "label_tool.yaml"
|
||||
return _ROOT / "config" / "label_tool.yaml"
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _db() -> Generator[sqlite3.Connection, None, None]:
|
||||
conn = sqlite3.connect(str(_DB_PATH))
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
try:
|
||||
yield conn
|
||||
conn.commit()
|
||||
except Exception:
|
||||
conn.rollback()
|
||||
raise
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _init_db() -> None:
|
||||
with _db() as conn:
|
||||
conn.executescript(_SCHEMA)
|
||||
_seed_sources(conn)
|
||||
|
||||
|
||||
def _load_corpus_config() -> list[dict]:
|
||||
f = _config_file()
|
||||
if not f.exists():
|
||||
return []
|
||||
try:
|
||||
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
|
||||
except yaml.YAMLError as exc:
|
||||
logger.warning("Failed to parse corpus config: %s", exc)
|
||||
return []
|
||||
return raw.get("corpus", {}).get("sources", []) or []
|
||||
|
||||
|
||||
def _seed_sources(conn: sqlite3.Connection) -> None:
|
||||
for src in _load_corpus_config():
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO corpus_sources (token, source_host, owner, consent_date, consent_method) "
|
||||
"VALUES (?, ?, ?, ?, ?)",
|
||||
(src["token"], src["source_host"], src["owner"],
|
||||
src["consent_date"], src["consent_method"]),
|
||||
)
|
||||
|
||||
|
||||
def _validate_token(token: str, conn: sqlite3.Connection) -> str:
|
||||
"""Return source_host for token, or raise 403."""
|
||||
row = conn.execute(
|
||||
"SELECT source_host FROM corpus_sources WHERE token = ? AND active = 1",
|
||||
(token,),
|
||||
).fetchone()
|
||||
if row is None:
|
||||
raise HTTPException(status_code=403, detail="Unknown or revoked consent token")
|
||||
return row["source_host"]
|
||||
|
||||
|
||||
def _extract_bearer(request: Request) -> str:
|
||||
auth = request.headers.get("Authorization", "")
|
||||
if not auth.startswith("Bearer "):
|
||||
raise HTTPException(status_code=401, detail="Bearer token required")
|
||||
return auth.removeprefix("Bearer ").strip()
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
# ── Startup ────────────────────────────────────────────────────────────────────
|
||||
|
||||
_init_db()
|
||||
|
||||
|
||||
# ── POST /api/corpus/log-batch ─────────────────────────────────────────────────
|
||||
|
||||
@router.post("/log-batch")
|
||||
def receive_batch(request: Request, payload: dict) -> dict:
|
||||
"""Accept a push batch from a Turnstone node."""
|
||||
token = _extract_bearer(request)
|
||||
|
||||
batch_type = payload.get("batch_type", "raw_entries")
|
||||
entries_raw = payload.get("entries", [])
|
||||
batch_id = payload.get("batch_id") or str(uuid.uuid4())
|
||||
|
||||
with _db() as conn:
|
||||
source_host = _validate_token(token, conn)
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, "
|
||||
"watermark_from, watermark_to, raw_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
|
||||
(batch_id, source_host, batch_type, _now_iso(), len(entries_raw),
|
||||
str(payload.get("watermark_from", "")),
|
||||
str(payload.get("watermark_to", "")),
|
||||
json.dumps(payload)),
|
||||
)
|
||||
|
||||
stored = 0
|
||||
for entry in entries_raw:
|
||||
text = entry.get("text", "").strip()
|
||||
if not text:
|
||||
continue
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO corpus_entries "
|
||||
"(id, batch_id, source_host, origin_entry_id, timestamp_iso, severity, "
|
||||
"source_id, text, matched_patterns) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
|
||||
(str(uuid.uuid4()), batch_id, source_host,
|
||||
entry.get("entry_id") or entry.get("id"),
|
||||
entry.get("timestamp_iso"),
|
||||
entry.get("severity"),
|
||||
entry.get("source_id"),
|
||||
text,
|
||||
json.dumps(entry.get("matched_patterns", []))),
|
||||
)
|
||||
stored += 1
|
||||
|
||||
logger.info("Received batch %s from %s: %d/%d entries stored",
|
||||
batch_id, source_host, stored, len(entries_raw))
|
||||
return {"received": True, "batch_id": batch_id, "entries_stored": stored}
|
||||
|
||||
|
||||
# ── GET /api/corpus/entries ────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/entries")
|
||||
def list_entries(
|
||||
state: str = "unlabeled",
|
||||
source_host: str | None = None,
|
||||
limit: int = 25,
|
||||
) -> dict:
|
||||
"""Return entries for labeling. Default: unlabeled entries, oldest first."""
|
||||
with _db() as conn:
|
||||
query = "SELECT * FROM corpus_entries WHERE label_state = ?"
|
||||
params: list = [state]
|
||||
if source_host:
|
||||
query += " AND source_host = ?"
|
||||
params.append(source_host)
|
||||
query += " ORDER BY rowid LIMIT ?"
|
||||
params.append(min(limit, 100))
|
||||
rows = conn.execute(query, params).fetchall()
|
||||
return {"entries": [dict(r) for r in rows], "count": len(rows)}
|
||||
|
||||
|
||||
# ── POST /api/corpus/entries/{id}/label ───────────────────────────────────────
|
||||
|
||||
@router.post("/entries/{entry_id}/label")
|
||||
def label_entry(entry_id: str, body: dict) -> dict:
|
||||
"""Submit a label for a corpus entry."""
|
||||
failure_type = body.get("failure_type")
|
||||
plain_explanation = body.get("plain_explanation", "").strip()
|
||||
known_pattern = body.get("known_pattern")
|
||||
pii_flagged = int(bool(body.get("pii_flagged", False)))
|
||||
|
||||
if not failure_type:
|
||||
raise HTTPException(status_code=422, detail="failure_type is required")
|
||||
valid_types = {"hardware", "software", "network", "security", "application", "none", "other"}
|
||||
if failure_type not in valid_types:
|
||||
raise HTTPException(status_code=422, detail=f"failure_type must be one of {sorted(valid_types)}")
|
||||
|
||||
with _db() as conn:
|
||||
row = conn.execute("SELECT id FROM corpus_entries WHERE id = ?", (entry_id,)).fetchone()
|
||||
if row is None:
|
||||
raise HTTPException(status_code=404, detail="Entry not found")
|
||||
conn.execute(
|
||||
"UPDATE corpus_entries SET label_state='labeled', failure_type=?, plain_explanation=?, "
|
||||
"known_pattern=?, labeled_at=?, pii_flagged=? WHERE id=?",
|
||||
(failure_type, plain_explanation, known_pattern, _now_iso(), pii_flagged, entry_id),
|
||||
)
|
||||
return {"labeled": True, "entry_id": entry_id}
|
||||
|
||||
|
||||
# ── POST /api/corpus/entries/{id}/skip ────────────────────────────────────────
|
||||
|
||||
@router.post("/entries/{entry_id}/skip")
|
||||
def skip_entry(entry_id: str) -> dict:
|
||||
with _db() as conn:
|
||||
row = conn.execute("SELECT id FROM corpus_entries WHERE id = ?", (entry_id,)).fetchone()
|
||||
if row is None:
|
||||
raise HTTPException(status_code=404, detail="Entry not found")
|
||||
conn.execute(
|
||||
"UPDATE corpus_entries SET label_state='skipped' WHERE id=?", (entry_id,)
|
||||
)
|
||||
return {"skipped": True, "entry_id": entry_id}
|
||||
|
||||
|
||||
# ── GET /api/corpus/stats ──────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/stats")
|
||||
def get_stats() -> dict:
|
||||
with _db() as conn:
|
||||
total = conn.execute("SELECT COUNT(*) FROM corpus_entries").fetchone()[0]
|
||||
by_state = {
|
||||
r["label_state"]: r["cnt"]
|
||||
for r in conn.execute(
|
||||
"SELECT label_state, COUNT(*) AS cnt FROM corpus_entries GROUP BY label_state"
|
||||
).fetchall()
|
||||
}
|
||||
by_source = {
|
||||
r["source_host"]: r["cnt"]
|
||||
for r in conn.execute(
|
||||
"SELECT source_host, COUNT(*) AS cnt FROM corpus_entries GROUP BY source_host"
|
||||
).fetchall()
|
||||
}
|
||||
by_severity = {
|
||||
r["severity"]: r["cnt"]
|
||||
for r in conn.execute(
|
||||
"SELECT severity, COUNT(*) AS cnt FROM corpus_entries "
|
||||
"WHERE severity IS NOT NULL GROUP BY severity"
|
||||
).fetchall()
|
||||
}
|
||||
batch_count = conn.execute("SELECT COUNT(*) FROM corpus_batches").fetchone()[0]
|
||||
return {
|
||||
"total_entries": total,
|
||||
"batch_count": batch_count,
|
||||
"by_label_state": by_state,
|
||||
"by_source": by_source,
|
||||
"by_severity": by_severity,
|
||||
}
|
||||
|
||||
|
||||
# ── GET /api/corpus/export ────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/export")
|
||||
def export_labeled() -> StreamingResponse:
|
||||
"""Stream labeled, non-PII entries as JSONL for SFT harness."""
|
||||
with _db() as conn:
|
||||
rows = conn.execute(
|
||||
"SELECT source_host, source_id, severity, text, failure_type, plain_explanation, known_pattern "
|
||||
"FROM corpus_entries "
|
||||
"WHERE label_state = 'labeled' AND pii_flagged = 0 AND plain_explanation != ''"
|
||||
"ORDER BY rowid"
|
||||
).fetchall()
|
||||
|
||||
def _generate():
|
||||
for row in rows:
|
||||
record = {
|
||||
"input": row["text"],
|
||||
"output": row["plain_explanation"],
|
||||
"metadata": {
|
||||
"failure_type": row["failure_type"],
|
||||
"source": row["source_host"],
|
||||
"source_id": row["source_id"],
|
||||
"severity": row["severity"],
|
||||
"known_pattern": row["known_pattern"],
|
||||
},
|
||||
}
|
||||
yield json.dumps(record) + "\n"
|
||||
|
||||
return StreamingResponse(
|
||||
_generate(),
|
||||
media_type="application/x-ndjson",
|
||||
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
|
||||
)
|
||||
|
|
@ -12,33 +12,27 @@ Route prefixes when mounted at /api in api.py:
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.cforch import router as _cforch_router
|
||||
from app.style import router as _style_router
|
||||
from app.voice import router as _voice_router
|
||||
from app.plans_bench import router as _plans_router
|
||||
from app.eval.embed_bench import router as _embed_router
|
||||
|
||||
router = APIRouter()
|
||||
router.include_router(_cforch_router, prefix="/cforch")
|
||||
router.include_router(_style_router, prefix="/style")
|
||||
router.include_router(_voice_router, prefix="/voice")
|
||||
router.include_router(_plans_router, prefix="/plans-bench")
|
||||
router.include_router(_embed_router, prefix="/embed-bench")
|
||||
|
||||
|
||||
def set_config_dir(path: Path | None) -> None:
|
||||
def set_config_dir(path) -> None:
|
||||
"""Propagate config dir override to all sub-modules -- used by tests."""
|
||||
import app.cforch as _cforch_mod
|
||||
import app.style as _style_mod
|
||||
import app.voice as _voice_mod
|
||||
import app.plans_bench as _plans_mod
|
||||
import app.eval.embed_bench as _embed_mod
|
||||
_cforch_mod.set_config_dir(path)
|
||||
_style_mod.set_config_dir(path)
|
||||
_voice_mod.set_config_dir(path)
|
||||
_plans_mod.set_config_dir(path)
|
||||
_embed_mod.set_config_dir(path)
|
||||
|
|
|
|||
|
|
@ -1,293 +0,0 @@
|
|||
"""Avocet — embedding model comparison harness.
|
||||
|
||||
Exposes FastAPI routes under /api/embed-bench (mounted via app/eval/cforch.py).
|
||||
All computation is local: no LLM inference, Ollama only. MIT tier throughout.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import yaml
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, field_validator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
_CONFIG_DIR: Path | None = None # override via set_config_dir() in tests
|
||||
_RUN_ACTIVE: bool = False
|
||||
_RATINGS_FILE = _ROOT / "data" / "embed_bench_ratings.jsonl"
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# ── Testability seam ──────────────────────────────────────────────────────────
|
||||
|
||||
def set_config_dir(path: Path | None) -> None:
|
||||
global _CONFIG_DIR
|
||||
_CONFIG_DIR = path
|
||||
|
||||
|
||||
# ── Internal helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _config_file() -> Path:
|
||||
if _CONFIG_DIR is not None:
|
||||
return _CONFIG_DIR / "label_tool.yaml"
|
||||
return _ROOT / "config" / "label_tool.yaml"
|
||||
|
||||
|
||||
def _load_config() -> dict[str, Any]:
|
||||
f = _config_file()
|
||||
if not f.exists():
|
||||
return {}
|
||||
try:
|
||||
return yaml.safe_load(f.read_text(encoding="utf-8")) or {}
|
||||
except yaml.YAMLError as exc:
|
||||
logger.warning("Failed to parse embed_bench config %s: %s", f, exc)
|
||||
return {}
|
||||
|
||||
|
||||
def _ollama_url() -> str:
|
||||
cfg = _load_config()
|
||||
embed_cfg = cfg.get("embed_bench", {}) or {}
|
||||
cforch_cfg = cfg.get("cforch", {}) or {}
|
||||
return (
|
||||
embed_cfg.get("ollama_url")
|
||||
or cforch_cfg.get("ollama_url", "http://localhost:11434")
|
||||
)
|
||||
|
||||
|
||||
def _ratings_path() -> Path:
|
||||
if _CONFIG_DIR is not None:
|
||||
return _CONFIG_DIR / "embed_bench_ratings.jsonl"
|
||||
return _RATINGS_FILE
|
||||
|
||||
|
||||
def _cosine(a: list[float], b: list[float]) -> float:
|
||||
if len(a) != len(b):
|
||||
raise ValueError(
|
||||
f"Embedding dimension mismatch: {len(a)} vs {len(b)}"
|
||||
)
|
||||
dot = sum(x * y for x, y in zip(a, b))
|
||||
mag_a = math.sqrt(sum(x * x for x in a))
|
||||
mag_b = math.sqrt(sum(x * x for x in b))
|
||||
if mag_a == 0.0 or mag_b == 0.0:
|
||||
return 0.0
|
||||
return dot / (mag_a * mag_b)
|
||||
|
||||
|
||||
# ── GET /models ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/models")
|
||||
def get_models() -> dict:
|
||||
"""Return Ollama embedding models available on the configured instance."""
|
||||
ollama = _ollama_url()
|
||||
models: list[dict] = []
|
||||
try:
|
||||
resp = httpx.get(f"{ollama}/api/tags", timeout=5.0)
|
||||
resp.raise_for_status()
|
||||
for entry in resp.json().get("models", []):
|
||||
models.append({
|
||||
"name": entry.get("name", ""),
|
||||
"size": entry.get("size", 0),
|
||||
})
|
||||
except httpx.HTTPStatusError as exc:
|
||||
logger.warning("Ollama /api/tags returned HTTP %s: %s", exc.response.status_code, exc)
|
||||
except httpx.RequestError as exc:
|
||||
logger.warning("Failed to reach Ollama for model list: %s", exc)
|
||||
return {"models": models, "ollama_url": ollama}
|
||||
|
||||
|
||||
# ── POST /run ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class RunRequest(BaseModel):
|
||||
corpus: list[str]
|
||||
queries: list[str]
|
||||
models: list[str]
|
||||
top_k: int = 5
|
||||
ollama_url: str = ""
|
||||
|
||||
@field_validator("corpus")
|
||||
@classmethod
|
||||
def corpus_nonempty(cls, v: list[str]) -> list[str]:
|
||||
if not v:
|
||||
raise ValueError("corpus must not be empty")
|
||||
return v
|
||||
|
||||
@field_validator("queries")
|
||||
@classmethod
|
||||
def queries_nonempty(cls, v: list[str]) -> list[str]:
|
||||
if not v:
|
||||
raise ValueError("queries must not be empty")
|
||||
return v
|
||||
|
||||
@field_validator("models")
|
||||
@classmethod
|
||||
def models_nonempty(cls, v: list[str]) -> list[str]:
|
||||
if not v:
|
||||
raise ValueError("models must contain at least one model name")
|
||||
return v
|
||||
|
||||
|
||||
def _embed_texts(ollama: str, model: str, texts: list[str]) -> list[list[float]]:
|
||||
"""Batch-embed texts via Ollama /v1/embeddings. Returns one vector per text."""
|
||||
resp = httpx.post(
|
||||
f"{ollama}/v1/embeddings",
|
||||
json={"model": model, "input": texts},
|
||||
timeout=120.0,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json().get("data", [])
|
||||
return [item["embedding"] for item in data]
|
||||
|
||||
|
||||
def _sse(event: dict) -> str:
|
||||
return f"data: {json.dumps(event)}\n\n"
|
||||
|
||||
|
||||
@router.post("/run")
|
||||
def run_embed_bench(req: RunRequest) -> StreamingResponse:
|
||||
"""Embed corpus + queries with each model; stream SSE results."""
|
||||
global _RUN_ACTIVE
|
||||
|
||||
if _RUN_ACTIVE:
|
||||
raise HTTPException(409, "An embedding benchmark run is already active")
|
||||
|
||||
ollama = req.ollama_url or _ollama_url()
|
||||
|
||||
def _generate():
|
||||
global _RUN_ACTIVE
|
||||
_RUN_ACTIVE = True
|
||||
try:
|
||||
for model_idx, model in enumerate(req.models, start=1):
|
||||
yield _sse({
|
||||
"type": "progress",
|
||||
"msg": f"Indexing corpus with {model} ({model_idx}/{len(req.models)})...",
|
||||
})
|
||||
try:
|
||||
corpus_vecs = _embed_texts(ollama, model, req.corpus)
|
||||
except Exception as exc:
|
||||
yield _sse({"type": "error", "msg": f"Ollama error for {model}: {exc}"})
|
||||
continue
|
||||
|
||||
yield _sse({
|
||||
"type": "progress",
|
||||
"msg": f"Running queries with {model}...",
|
||||
})
|
||||
|
||||
for q_idx, query in enumerate(req.queries):
|
||||
try:
|
||||
q_vecs = _embed_texts(ollama, model, [query])
|
||||
except Exception as exc:
|
||||
yield _sse({"type": "error", "msg": f"Query embed error ({model}): {exc}"})
|
||||
continue
|
||||
q_vec = q_vecs[0]
|
||||
scored = sorted(
|
||||
[
|
||||
{"chunk_idx": i, "text": chunk, "score": round(_cosine(q_vec, cv), 4)}
|
||||
for i, (chunk, cv) in enumerate(zip(req.corpus, corpus_vecs))
|
||||
],
|
||||
key=lambda h: h["score"],
|
||||
reverse=True,
|
||||
)[: req.top_k]
|
||||
yield _sse({
|
||||
"type": "result",
|
||||
"query_idx": q_idx,
|
||||
"query": query,
|
||||
"model": model,
|
||||
"hits": scored,
|
||||
})
|
||||
|
||||
yield _sse({"type": "done"})
|
||||
finally:
|
||||
_RUN_ACTIVE = False
|
||||
|
||||
return StreamingResponse(_generate(), media_type="text/event-stream")
|
||||
|
||||
|
||||
# ── POST /rate ────────────────────────────────────────────────────────────────
|
||||
|
||||
_VALID_RATINGS = {"relevant", "not_relevant"}
|
||||
|
||||
|
||||
class RatingRequest(BaseModel):
|
||||
query: str
|
||||
model: str
|
||||
chunk_text: str
|
||||
chunk_idx: int
|
||||
rating: str
|
||||
|
||||
@field_validator("rating")
|
||||
@classmethod
|
||||
def rating_valid(cls, v: str) -> str:
|
||||
if v not in _VALID_RATINGS:
|
||||
raise ValueError(f"rating must be one of {_VALID_RATINGS}")
|
||||
return v
|
||||
|
||||
|
||||
@router.post("/rate")
|
||||
def rate_result(req: RatingRequest) -> dict:
|
||||
"""Append one rating to the JSONL ratings file."""
|
||||
entry = {
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"query": req.query,
|
||||
"model": req.model,
|
||||
"chunk_idx": req.chunk_idx,
|
||||
"chunk_text": req.chunk_text,
|
||||
"rating": req.rating,
|
||||
}
|
||||
path = _ratings_path()
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with path.open("a", encoding="utf-8") as fh:
|
||||
fh.write(json.dumps(entry) + "\n")
|
||||
return {"ok": True}
|
||||
|
||||
|
||||
# ── GET /export ───────────────────────────────────────────────────────────────
|
||||
|
||||
_CSV_FIELDS = ["timestamp", "query", "model", "chunk_idx", "chunk_text", "rating"]
|
||||
|
||||
|
||||
@router.get("/export")
|
||||
def export_ratings(format: str = "csv") -> Any:
|
||||
"""Download ratings as CSV or JSON."""
|
||||
path = _ratings_path()
|
||||
rows: list[dict] = []
|
||||
if path.exists():
|
||||
for raw in path.read_text(encoding="utf-8").splitlines():
|
||||
raw = raw.strip()
|
||||
if raw:
|
||||
try:
|
||||
rows.append(json.loads(raw))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
|
||||
if format == "json":
|
||||
content = json.dumps(rows, ensure_ascii=False, indent=2)
|
||||
return StreamingResponse(
|
||||
iter([content]),
|
||||
media_type="application/json",
|
||||
headers={"Content-Disposition": f'attachment; filename="embed_comparison_{date_str}.json"'},
|
||||
)
|
||||
|
||||
# Default: CSV
|
||||
buf = io.StringIO()
|
||||
writer = csv.DictWriter(buf, fieldnames=_CSV_FIELDS, extrasaction="ignore")
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
return StreamingResponse(
|
||||
iter([buf.getvalue()]),
|
||||
media_type="text/csv",
|
||||
headers={"Content-Disposition": f'attachment; filename="embed_comparison_{date_str}.csv"'},
|
||||
)
|
||||
|
|
@ -38,17 +38,13 @@ except ImportError: # pragma: no cover
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
_ROOT = Path(__file__).parent.parent
|
||||
_MODELS_DIR: Path = Path(
|
||||
os.environ.get("AVOCET_MODELS_DIR", str(_ROOT / "models"))
|
||||
)
|
||||
_MODELS_DIR: Path = _ROOT / "models"
|
||||
_QUEUE_DIR: Path = _ROOT / "data"
|
||||
|
||||
# Service-specific model destinations.
|
||||
# cf-text models land on the NFS-mounted shared asset store so every cluster
|
||||
# node can reach them without a separate download. Avocet classifiers default
|
||||
# to a local path but can be redirected via AVOCET_MODELS_DIR — set this to
|
||||
# /Library/Assets/LLM/avocet/models on NFS-connected nodes to keep all model
|
||||
# weights out of the repo directory.
|
||||
# node can reach them without a separate download. Avocet classifiers stay local
|
||||
# because they are fine-tuned in-place and are only consumed by avocet itself.
|
||||
# Override via CF_TEXT_MODELS_DIR env var (useful for dev / non-NFS setups).
|
||||
_CF_TEXT_MODELS_DIR: Path = Path(
|
||||
os.environ.get("CF_TEXT_MODELS_DIR", "/Library/Assets/LLM/cf-text/models")
|
||||
|
|
|
|||
|
|
@ -110,19 +110,3 @@ imitate:
|
|||
sample_endpoint: /api/listings
|
||||
text_fields: [title, description, seller_info]
|
||||
prompt_template: "Evaluate the trustworthiness of this listing and flag any red flags:\n\n{text}"
|
||||
|
||||
- id: pagepiper
|
||||
name: Pagepiper
|
||||
icon: "📄"
|
||||
description: "PDF/rulebook RAG tool: page-level text chunks"
|
||||
base_url: http://localhost:8511
|
||||
health_path: /api/health
|
||||
sample_endpoint: /api/library
|
||||
chunk_endpoint: /api/library/sample-chunks?limit=50 # requires pagepiper#6
|
||||
text_fields: [title]
|
||||
prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
|
||||
|
||||
# ── Embedding model comparison harness ────────────────────────────────────────
|
||||
embed_bench:
|
||||
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url
|
||||
# top_k: 5 # default hits per model per query
|
||||
|
|
|
|||
|
|
@ -1,234 +0,0 @@
|
|||
"""Tests for app/eval/embed_bench.py."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_embed_bench_globals(tmp_path):
|
||||
"""Redirect config dir to tmp_path and reset running flag."""
|
||||
from app.eval import embed_bench as mod
|
||||
|
||||
prev_config_dir = mod._CONFIG_DIR
|
||||
prev_running = mod._RUN_ACTIVE
|
||||
|
||||
mod.set_config_dir(tmp_path)
|
||||
mod._RUN_ACTIVE = False
|
||||
|
||||
yield tmp_path
|
||||
|
||||
mod.set_config_dir(prev_config_dir)
|
||||
mod._RUN_ACTIVE = prev_running
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client():
|
||||
from app.api import app
|
||||
return TestClient(app)
|
||||
|
||||
|
||||
# ── cosine helper ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_cosine_identical():
|
||||
from app.eval.embed_bench import _cosine
|
||||
assert _cosine([1.0, 0.0], [1.0, 0.0]) == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_cosine_orthogonal():
|
||||
from app.eval.embed_bench import _cosine
|
||||
assert _cosine([1.0, 0.0], [0.0, 1.0]) == pytest.approx(0.0)
|
||||
|
||||
|
||||
def test_cosine_opposite():
|
||||
from app.eval.embed_bench import _cosine
|
||||
assert _cosine([1.0, 0.0], [-1.0, 0.0]) == pytest.approx(-1.0)
|
||||
|
||||
|
||||
def test_cosine_zero_vector_returns_zero():
|
||||
from app.eval.embed_bench import _cosine
|
||||
assert _cosine([0.0, 0.0], [1.0, 0.0]) == pytest.approx(0.0)
|
||||
|
||||
|
||||
# ── models endpoint ────────────────────────────────────────────────────────────
|
||||
|
||||
def test_models_returns_list_with_mock(client, tmp_path):
|
||||
"""GET /api/embed-bench/models returns list from Ollama tags endpoint."""
|
||||
import yaml
|
||||
cfg = {"cforch": {"ollama_url": "http://localhost:11434"}}
|
||||
(tmp_path / "label_tool.yaml").write_text(yaml.dump(cfg))
|
||||
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.json.return_value = {
|
||||
"models": [
|
||||
{"name": "nomic-embed-text", "size": 274302480},
|
||||
{"name": "mxbai-embed-large", "size": 669000000},
|
||||
]
|
||||
}
|
||||
mock_resp.raise_for_status = MagicMock()
|
||||
|
||||
with patch("app.eval.embed_bench.httpx.get", return_value=mock_resp):
|
||||
r = client.get("/api/embed-bench/models")
|
||||
|
||||
assert r.status_code == 200
|
||||
data = r.json()
|
||||
assert isinstance(data["models"], list)
|
||||
assert any(m["name"] == "nomic-embed-text" for m in data["models"])
|
||||
|
||||
|
||||
def test_models_returns_empty_on_ollama_error(client, tmp_path):
|
||||
"""GET /api/embed-bench/models returns empty list if Ollama unreachable."""
|
||||
import httpx
|
||||
with patch("app.eval.embed_bench.httpx.get", side_effect=httpx.ConnectError("refused")):
|
||||
r = client.get("/api/embed-bench/models")
|
||||
assert r.status_code == 200
|
||||
assert r.json()["models"] == []
|
||||
|
||||
|
||||
# ── run endpoint ───────────────────────────────────────────────────────────────
|
||||
|
||||
def test_run_empty_corpus_returns_422(client):
|
||||
r = client.post("/api/embed-bench/run", json={
|
||||
"corpus": [], "queries": ["test"], "models": ["nomic-embed-text"], "top_k": 3
|
||||
})
|
||||
assert r.status_code == 422
|
||||
|
||||
|
||||
def test_run_empty_queries_returns_422(client):
|
||||
r = client.post("/api/embed-bench/run", json={
|
||||
"corpus": ["chunk 1"], "queries": [], "models": ["nomic-embed-text"], "top_k": 3
|
||||
})
|
||||
assert r.status_code == 422
|
||||
|
||||
|
||||
def test_run_empty_models_returns_422(client):
|
||||
r = client.post("/api/embed-bench/run", json={
|
||||
"corpus": ["chunk 1"], "queries": ["test"], "models": [], "top_k": 3
|
||||
})
|
||||
assert r.status_code == 422
|
||||
|
||||
|
||||
def _fake_embed_response(texts: list[str]) -> MagicMock:
|
||||
"""Build a mock httpx.post response returning unit vectors for each text."""
|
||||
resp = MagicMock()
|
||||
resp.raise_for_status = MagicMock()
|
||||
resp.json.return_value = {
|
||||
"data": [{"embedding": [1.0, 0.0, 0.0] if i % 2 == 0 else [0.0, 1.0, 0.0]}
|
||||
for i, _ in enumerate(texts)]
|
||||
}
|
||||
return resp
|
||||
|
||||
|
||||
def _collect_sse(raw: bytes) -> list[dict]:
|
||||
"""Parse SSE stream bytes into a list of decoded event dicts."""
|
||||
events = []
|
||||
for line in raw.decode().splitlines():
|
||||
if line.startswith("data: "):
|
||||
events.append(json.loads(line[6:]))
|
||||
return events
|
||||
|
||||
|
||||
def test_run_single_model_returns_result_and_done(client, tmp_path):
|
||||
import yaml
|
||||
(tmp_path / "label_tool.yaml").write_text(yaml.dump({"cforch": {"ollama_url": "http://localhost:11434"}}))
|
||||
|
||||
with patch("app.eval.embed_bench.httpx.post", return_value=_fake_embed_response(["chunk 1", "chunk 2"])):
|
||||
r = client.post("/api/embed-bench/run", json={
|
||||
"corpus": ["chunk 1", "chunk 2"],
|
||||
"queries": ["what is chunk one?"],
|
||||
"models": ["nomic-embed-text"],
|
||||
"top_k": 2,
|
||||
})
|
||||
|
||||
assert r.status_code == 200
|
||||
events = _collect_sse(r.content)
|
||||
types = [e["type"] for e in events]
|
||||
assert "result" in types
|
||||
assert types[-1] == "done"
|
||||
result_events = [e for e in events if e["type"] == "result"]
|
||||
assert result_events[0]["model"] == "nomic-embed-text"
|
||||
assert result_events[0]["query_idx"] == 0
|
||||
assert len(result_events[0]["hits"]) <= 2
|
||||
|
||||
|
||||
def test_run_two_models_returns_two_result_events_per_query(client, tmp_path):
|
||||
import yaml
|
||||
(tmp_path / "label_tool.yaml").write_text(yaml.dump({"cforch": {"ollama_url": "http://localhost:11434"}}))
|
||||
|
||||
with patch("app.eval.embed_bench.httpx.post", return_value=_fake_embed_response(["chunk A", "chunk B"])):
|
||||
r = client.post("/api/embed-bench/run", json={
|
||||
"corpus": ["chunk A", "chunk B"],
|
||||
"queries": ["find it"],
|
||||
"models": ["nomic-embed-text", "mxbai-embed-large"],
|
||||
"top_k": 2,
|
||||
})
|
||||
|
||||
events = _collect_sse(r.content)
|
||||
result_events = [e for e in events if e["type"] == "result"]
|
||||
models_seen = {e["model"] for e in result_events}
|
||||
assert "nomic-embed-text" in models_seen
|
||||
assert "mxbai-embed-large" in models_seen
|
||||
|
||||
|
||||
# ── rate + export ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_rate_appends_jsonl_line(client, tmp_path):
|
||||
r = client.post("/api/embed-bench/rate", json={
|
||||
"query": "test query",
|
||||
"model": "nomic-embed-text",
|
||||
"chunk_text": "some text",
|
||||
"chunk_idx": 2,
|
||||
"rating": "relevant",
|
||||
})
|
||||
assert r.status_code == 200
|
||||
assert r.json() == {"ok": True}
|
||||
ratings_file = tmp_path / "embed_bench_ratings.jsonl"
|
||||
assert ratings_file.exists()
|
||||
line = json.loads(ratings_file.read_text().strip())
|
||||
assert line["query"] == "test query"
|
||||
assert line["rating"] == "relevant"
|
||||
assert line["chunk_idx"] == 2
|
||||
assert "timestamp" in line
|
||||
|
||||
|
||||
def test_export_csv_two_rows(client, tmp_path):
|
||||
for i in range(2):
|
||||
client.post("/api/embed-bench/rate", json={
|
||||
"query": f"q{i}", "model": "nomic-embed-text",
|
||||
"chunk_text": f"chunk {i}", "chunk_idx": i, "rating": "relevant",
|
||||
})
|
||||
r = client.get("/api/embed-bench/export?format=csv")
|
||||
assert r.status_code == 200
|
||||
assert "text/csv" in r.headers["content-type"]
|
||||
lines = r.text.strip().splitlines()
|
||||
assert len(lines) == 3 # header + 2 rows
|
||||
assert "query" in lines[0]
|
||||
|
||||
|
||||
def test_export_json_two_entries(client, tmp_path):
|
||||
for i in range(2):
|
||||
client.post("/api/embed-bench/rate", json={
|
||||
"query": f"q{i}", "model": "nomic-embed-text",
|
||||
"chunk_text": f"chunk {i}", "chunk_idx": i, "rating": "not_relevant",
|
||||
})
|
||||
r = client.get("/api/embed-bench/export?format=json")
|
||||
assert r.status_code == 200
|
||||
data = r.json()
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 2
|
||||
assert data[0]["rating"] == "not_relevant"
|
||||
|
||||
|
||||
def test_export_empty_returns_csv_header_only(client):
|
||||
r = client.get("/api/embed-bench/export?format=csv")
|
||||
assert r.status_code == 200
|
||||
lines = r.text.strip().splitlines()
|
||||
assert len(lines) == 1 # header only
|
||||
assert "query" in lines[0]
|
||||
|
|
@ -1,272 +0,0 @@
|
|||
"""Tests for app/data/log_corpus.py — corpus receiver and labeling endpoints."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from app.data import log_corpus as lc
|
||||
|
||||
|
||||
VALID_TOKEN = str(uuid.uuid4())
|
||||
VALID_HOST = "testnode.local"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def isolated_db(tmp_path, monkeypatch):
|
||||
"""Each test gets its own fresh corpus DB and config dir."""
|
||||
monkeypatch.setattr(lc, "_DATA_DIR", tmp_path)
|
||||
monkeypatch.setattr(lc, "_DB_PATH", tmp_path / "corpus.db")
|
||||
# Config dir pointing to a temp yaml with one test source
|
||||
config_dir = tmp_path / "config"
|
||||
config_dir.mkdir()
|
||||
(config_dir / "label_tool.yaml").write_text(
|
||||
f"corpus:\n sources:\n"
|
||||
f" - token: \"{VALID_TOKEN}\"\n"
|
||||
f" source_host: \"{VALID_HOST}\"\n"
|
||||
f" owner: TestOwner\n"
|
||||
f" consent_date: \"2026-05-11\"\n"
|
||||
f" consent_method: signal_chat\n"
|
||||
)
|
||||
monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
|
||||
lc._init_db()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def client():
|
||||
from fastapi import FastAPI
|
||||
app = FastAPI()
|
||||
app.include_router(lc.router, prefix="/api/corpus")
|
||||
return TestClient(app)
|
||||
|
||||
|
||||
def _batch(batch_type="raw_entries", entries=None, source_host=VALID_HOST):
|
||||
return {
|
||||
"batch_version": 1,
|
||||
"batch_id": str(uuid.uuid4()),
|
||||
"pushed_at": "2026-05-11T10:00:00Z",
|
||||
"source_host": source_host,
|
||||
"batch_type": batch_type,
|
||||
"watermark_from": 0,
|
||||
"watermark_to": 5,
|
||||
"entries": entries or [
|
||||
{
|
||||
"entry_id": str(uuid.uuid4()),
|
||||
"source_id": "sonarr",
|
||||
"timestamp_iso": "2026-05-11T09:58:00Z",
|
||||
"severity": "ERROR",
|
||||
"text": "Connection refused to indexer",
|
||||
"matched_patterns": [],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# ── Receive endpoint ───────────────────────────────────────────────────────────
|
||||
|
||||
def test_receive_missing_auth(client):
|
||||
resp = client.post("/api/corpus/log-batch", json=_batch())
|
||||
assert resp.status_code == 401
|
||||
|
||||
|
||||
def test_receive_invalid_token(client):
|
||||
resp = client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": "Bearer bad-token"},
|
||||
)
|
||||
assert resp.status_code == 403
|
||||
|
||||
|
||||
def test_receive_valid_batch(client):
|
||||
resp = client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert data["received"] is True
|
||||
assert data["entries_stored"] == 1
|
||||
|
||||
|
||||
def test_receive_stores_source_host_from_token_not_payload(client):
|
||||
"""source_host is always taken from the DB lookup, not the payload."""
|
||||
payload = _batch(source_host="attacker-injected-host")
|
||||
resp = client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=payload,
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
entries_resp = client.get("/api/corpus/entries")
|
||||
entry = entries_resp.json()["entries"][0]
|
||||
assert entry["source_host"] == VALID_HOST
|
||||
|
||||
|
||||
def test_receive_skips_empty_text_entries(client):
|
||||
payload = _batch(entries=[
|
||||
{"entry_id": "e1", "source_id": "svc", "severity": "ERROR", "text": ""},
|
||||
{"entry_id": "e2", "source_id": "svc", "severity": "ERROR", "text": " "},
|
||||
{"entry_id": "e3", "source_id": "svc", "severity": "ERROR", "text": "real error"},
|
||||
])
|
||||
resp = client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=payload,
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
assert resp.json()["entries_stored"] == 1
|
||||
|
||||
|
||||
def test_receive_incident_bundle(client):
|
||||
payload = _batch(batch_type="incident_bundles", entries=[
|
||||
{"id": "inc-1", "label": "plex crash", "issue_type": "plex",
|
||||
"started_at": "2026-05-11T09:00:00", "ended_at": "2026-05-11T09:30:00",
|
||||
"notes": "audio dropped", "created_at": "2026-05-11T09:35:00",
|
||||
"severity": "high", "text": "plex crash"},
|
||||
])
|
||||
resp = client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=payload,
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["entries_stored"] == 1
|
||||
|
||||
|
||||
# ── Labeling endpoints ─────────────────────────────────────────────────────────
|
||||
|
||||
def test_label_entry(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
|
||||
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={
|
||||
"failure_type": "software",
|
||||
"plain_explanation": "Sonarr lost connection to its indexer — restart the service.",
|
||||
"known_pattern": "y",
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["labeled"] is True
|
||||
|
||||
entries = client.get("/api/corpus/entries", params={"state": "labeled"}).json()["entries"]
|
||||
assert len(entries) == 1
|
||||
assert entries[0]["failure_type"] == "software"
|
||||
|
||||
|
||||
def test_label_entry_invalid_failure_type(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={"failure_type": "aliens"})
|
||||
assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_label_entry_missing_failure_type(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={})
|
||||
assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_label_entry_not_found(client):
|
||||
resp = client.post("/api/corpus/entries/nonexistent/label", json={"failure_type": "software"})
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_skip_entry(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
resp = client.post(f"/api/corpus/entries/{entry_id}/skip")
|
||||
assert resp.status_code == 200
|
||||
|
||||
unlabeled = client.get("/api/corpus/entries").json()["entries"]
|
||||
assert len(unlabeled) == 0
|
||||
|
||||
|
||||
# ── Stats ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_stats_empty(client):
|
||||
stats = client.get("/api/corpus/stats").json()
|
||||
assert stats["total_entries"] == 0
|
||||
assert stats["batch_count"] == 0
|
||||
|
||||
|
||||
def test_stats_after_receive(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
stats = client.get("/api/corpus/stats").json()
|
||||
assert stats["total_entries"] == 1
|
||||
assert stats["batch_count"] == 1
|
||||
assert stats["by_label_state"].get("unlabeled", 0) == 1
|
||||
|
||||
|
||||
# ── Export ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_export_excludes_unlabeled(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
resp = client.get("/api/corpus/export")
|
||||
assert resp.status_code == 200
|
||||
assert resp.text.strip() == ""
|
||||
|
||||
|
||||
def test_export_includes_labeled(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
client.post(f"/api/corpus/entries/{entry_id}/label", json={
|
||||
"failure_type": "software",
|
||||
"plain_explanation": "Sonarr lost connection to indexer.",
|
||||
})
|
||||
|
||||
resp = client.get("/api/corpus/export")
|
||||
assert resp.status_code == 200
|
||||
lines = [l for l in resp.text.strip().splitlines() if l]
|
||||
assert len(lines) == 1
|
||||
record = json.loads(lines[0])
|
||||
assert record["output"] == "Sonarr lost connection to indexer."
|
||||
assert record["metadata"]["failure_type"] == "software"
|
||||
|
||||
|
||||
def test_export_excludes_pii_flagged(client):
|
||||
client.post(
|
||||
"/api/corpus/log-batch",
|
||||
json=_batch(),
|
||||
headers={"Authorization": f"Bearer {VALID_TOKEN}"},
|
||||
)
|
||||
entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
|
||||
client.post(f"/api/corpus/entries/{entry_id}/label", json={
|
||||
"failure_type": "software",
|
||||
"plain_explanation": "Contains username — should not export.",
|
||||
"pii_flagged": True,
|
||||
})
|
||||
|
||||
resp = client.get("/api/corpus/export")
|
||||
assert resp.text.strip() == ""
|
||||
|
|
@ -225,7 +225,6 @@ const dataItems: NavItem[] = [
|
|||
const evalItems: NavItem[] = [
|
||||
{ path: '/eval/benchmark', icon: '📊', label: 'Benchmark' },
|
||||
{ path: '/eval/compare', icon: '🔍', label: 'Compare' },
|
||||
{ path: '/eval/embed-compare', icon: '🧮', label: 'Embed Compare' },
|
||||
]
|
||||
|
||||
const trainItems: NavItem[] = [
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ export const routes = [
|
|||
// ── Eval domain ──────────────────────────────────────────
|
||||
{ path: '/eval/benchmark', component: BenchmarkView, meta: { title: 'Benchmark' } },
|
||||
{ path: '/eval/compare', component: CompareView, meta: { title: 'Compare' } },
|
||||
{ path: '/eval/embed-compare', component: () => import('../views/EmbedCompareView.vue'), meta: { title: 'Embed Compare' } },
|
||||
|
||||
// ── Train domain ─────────────────────────────────────────
|
||||
{ path: '/train/jobs', component: TrainJobsView, meta: { title: 'Training Jobs' } },
|
||||
|
|
|
|||
|
|
@ -325,7 +325,7 @@ function toggleCategory(models: AvailableModel[], checked: boolean) {
|
|||
|
||||
async function loadModelCategories() {
|
||||
modelsLoading.value = true
|
||||
const { data } = await useApiFetch<ModelCategoriesResponse>('/api/cforch/models')
|
||||
const { data } = await useApiFetch<ModelCategoriesResponse>('/api/benchmark/models')
|
||||
modelsLoading.value = false
|
||||
if (data?.categories) {
|
||||
modelCategories.value = data.categories
|
||||
|
|
@ -342,7 +342,7 @@ const modelCount = computed(() => modelNames.value.length)
|
|||
const labelNames = computed(() => {
|
||||
const canonical = Object.keys(LABEL_META)
|
||||
const inResults = new Set(
|
||||
modelNames.value.flatMap(n => Object.keys(results.value?.models[n]?.per_label ?? {}))
|
||||
modelNames.value.flatMap(n => Object.keys(results.value!.models[n].per_label))
|
||||
)
|
||||
return [...canonical.filter(l => inResults.has(l)), ...[...inResults].filter(l => !canonical.includes(l))]
|
||||
})
|
||||
|
|
@ -401,16 +401,16 @@ function formatDate(iso: string | null): string {
|
|||
// ── Data loading ─────────────────────────────────────────────────────────────
|
||||
async function loadResults() {
|
||||
loading.value = true
|
||||
const { data } = await useApiFetch<BenchResults>('/api/cforch/results')
|
||||
const { data } = await useApiFetch<BenchResults>('/api/benchmark/results')
|
||||
loading.value = false
|
||||
if (data?.models && Object.keys(data.models).length > 0) {
|
||||
if (data && Object.keys(data.models).length > 0) {
|
||||
results.value = data
|
||||
}
|
||||
}
|
||||
|
||||
async function loadFineTunedModels() {
|
||||
const { data } = await useApiFetch<{ results: FineTunedModel[] }>('/api/train/results')
|
||||
if (Array.isArray(data?.results)) fineTunedModels.value = data.results
|
||||
const { data } = await useApiFetch<FineTunedModel[]>('/api/finetune/status')
|
||||
if (Array.isArray(data)) fineTunedModels.value = data
|
||||
}
|
||||
|
||||
// ── Benchmark run ────────────────────────────────────────────────────────────
|
||||
|
|
@ -428,7 +428,7 @@ function startBenchmark() {
|
|||
params.set('model_names', [...selectedModels.value].join(','))
|
||||
}
|
||||
const qs = params.toString()
|
||||
const url = `/api/cforch/run${qs ? `?${qs}` : ''}`
|
||||
const url = `/api/benchmark/run${qs ? `?${qs}` : ''}`
|
||||
useApiSSE(
|
||||
url,
|
||||
async (event) => {
|
||||
|
|
@ -457,7 +457,7 @@ function startBenchmark() {
|
|||
}
|
||||
|
||||
async function cancelBenchmark() {
|
||||
await fetch('/api/cforch/cancel', { method: 'POST' }).catch(() => {})
|
||||
await fetch('/api/benchmark/cancel', { method: 'POST' }).catch(() => {})
|
||||
}
|
||||
|
||||
// ── Fine-tune ─────────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -71,37 +71,34 @@
|
|||
rows="6"
|
||||
/>
|
||||
|
||||
<!-- LLM model picker (ollama + vllm + cf-text) -->
|
||||
<!-- Ollama model picker -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">🤖 LLM Models</span>
|
||||
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ llmSelectableModels.length }}</span>
|
||||
<span class="picker-title">🤖 Ollama Models</span>
|
||||
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ ollamaLlmModels.length }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<label class="picker-cat-header">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="cmpSelectedModels.size === llmSelectableModels.length"
|
||||
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < llmSelectableModels.length"
|
||||
:checked="cmpSelectedModels.size === ollamaLlmModels.length"
|
||||
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < ollamaLlmModels.length"
|
||||
@change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-cat-name">All LLM models</span>
|
||||
<span class="picker-cat-name">All ollama models</span>
|
||||
</label>
|
||||
<div v-for="(models, service) in llmModelsByService" :key="service" class="picker-category">
|
||||
<span class="picker-cat-section">{{ service }}</span>
|
||||
<div class="picker-model-list">
|
||||
<label v-for="m in models" :key="m.id" class="picker-model-row">
|
||||
<label v-for="m in ollamaLlmModels" :key="m.id" class="picker-model-row">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="cmpSelectedModels.has(m.id)"
|
||||
@change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-model-name">{{ m.name }}</span>
|
||||
<span class="picker-adapter-type">{{ m.tags.slice(0, 2).join(', ') }}</span>
|
||||
<span class="picker-adapter-type">{{ m.tags.slice(0, 3).join(', ') }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Run controls -->
|
||||
|
|
@ -235,22 +232,10 @@ const cmpResults = ref<CmpResult[]>([])
|
|||
const cmpEventSource = ref<EventSource | null>(null)
|
||||
|
||||
// ── Computed ────────────────────────────────────────────────────────────────
|
||||
const LLM_SERVICES = new Set(['ollama', 'vllm', 'cf-text'])
|
||||
|
||||
const llmSelectableModels = computed(() =>
|
||||
llmModels.value.filter(m => LLM_SERVICES.has(m.service))
|
||||
const ollamaLlmModels = computed(() =>
|
||||
llmModels.value.filter(m => m.service === 'ollama')
|
||||
)
|
||||
|
||||
/** Group selectable models by service for the picker UI */
|
||||
const llmModelsByService = computed((): Record<string, CfOrchModel[]> => {
|
||||
const groups: Record<string, CfOrchModel[]> = {}
|
||||
for (const m of llmSelectableModels.value) {
|
||||
if (!groups[m.service]) groups[m.service] = []
|
||||
groups[m.service].push(m)
|
||||
}
|
||||
return groups
|
||||
})
|
||||
|
||||
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
|
||||
const groups: Record<string, CfOrchTask[]> = {}
|
||||
for (const t of llmTasks.value) {
|
||||
|
|
@ -285,7 +270,7 @@ function toggleCmpModel(id: string, checked: boolean) {
|
|||
|
||||
function toggleAllCmpModels(checked: boolean) {
|
||||
cmpSelectedModels.value = checked
|
||||
? new Set(llmSelectableModels.value.map(m => m.id))
|
||||
? new Set(ollamaLlmModels.value.map(m => m.id))
|
||||
: new Set()
|
||||
}
|
||||
|
||||
|
|
@ -303,8 +288,9 @@ async function loadLlmModels() {
|
|||
const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
|
||||
if (data?.models) {
|
||||
llmModels.value = data.models
|
||||
// Pre-select all ollama models
|
||||
cmpSelectedModels.value = new Set(
|
||||
data.models.filter(m => LLM_SERVICES.has(m.service)).map(m => m.id)
|
||||
data.models.filter(m => m.service === 'ollama').map(m => m.id)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,9 @@
|
|||
<span class="metric-label"> labeled since last eval</span>
|
||||
</p>
|
||||
</div>
|
||||
<div v-if="data.signals.data_to_eval" class="card-cta">
|
||||
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- ② Eval card -->
|
||||
|
|
@ -37,28 +40,18 @@
|
|||
<h2 class="card-title">Eval</h2>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="bench-run-table">
|
||||
<div
|
||||
v-for="(run, type) in data.recent_bench_runs"
|
||||
:key="type"
|
||||
class="bench-run-row"
|
||||
>
|
||||
<span class="bench-type-label">{{ BENCH_LABELS[type as BenchType] ?? type }}</span>
|
||||
<span class="bench-run-time" :class="{ 'metric-muted': !run.timestamp }">
|
||||
{{ run.timestamp ? formatBenchTs(run.timestamp) : '—' }}
|
||||
</span>
|
||||
<span v-if="run.score != null" class="bench-run-score">
|
||||
{{ formatScore(run.score) }}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
<p class="card-metric">
|
||||
<span class="metric-label">Last run: </span>
|
||||
<strong class="metric-value">{{ formattedEvalTime }}</strong>
|
||||
</p>
|
||||
<p v-if="data.last_eval_best_score != null" class="card-metric">
|
||||
<span class="metric-label">Best score: </span>
|
||||
<strong class="metric-value">{{ formatScore(data.last_eval_best_score) }}</strong>
|
||||
</p>
|
||||
</div>
|
||||
<div v-if="data.signals.eval_to_train" class="card-cta">
|
||||
<RouterLink to="/train/jobs" class="cta-btn">Queue Finetune</RouterLink>
|
||||
</div>
|
||||
<div v-if="data.signals.data_to_eval" class="card-cta">
|
||||
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- ③ Train card -->
|
||||
|
|
@ -111,49 +104,33 @@ interface DashboardSignals {
|
|||
train_to_fleet: boolean
|
||||
}
|
||||
|
||||
interface BenchRun {
|
||||
timestamp: string | null
|
||||
metric: string | null
|
||||
score: number | null
|
||||
}
|
||||
|
||||
type BenchType = 'classifier' | 'llm' | 'style' | 'plans'
|
||||
|
||||
interface DashboardData {
|
||||
labeled_since_last_eval: number
|
||||
last_eval_timestamp: string | null
|
||||
last_eval_best_score: number | null
|
||||
active_jobs: ActiveJob[]
|
||||
corrections_export_ready: number
|
||||
recent_bench_runs: Record<BenchType, BenchRun>
|
||||
signals: DashboardSignals
|
||||
}
|
||||
|
||||
const BENCH_LABELS: Record<BenchType, string> = {
|
||||
classifier: 'Classifier',
|
||||
llm: 'LLM Eval',
|
||||
style: 'Style',
|
||||
plans: 'Planning',
|
||||
}
|
||||
|
||||
const data = ref<DashboardData | null>(null)
|
||||
const loading = ref(false)
|
||||
const error = ref<string | null>(null)
|
||||
|
||||
function formatBenchTs(ts: string): string {
|
||||
const date = new Date(ts)
|
||||
if (!isNaN(date.getTime())) {
|
||||
const diff = Date.now() - date.getTime()
|
||||
const formattedEvalTime = computed(() => {
|
||||
if (!data.value?.last_eval_timestamp) return 'Never'
|
||||
const date = new Date(data.value.last_eval_timestamp)
|
||||
if (isNaN(date.getTime())) return 'Unknown'
|
||||
const now = Date.now()
|
||||
const diff = now - date.getTime()
|
||||
const mins = Math.floor(diff / 60000)
|
||||
if (mins < 1) return 'just now'
|
||||
if (mins < 60) return `${mins}m ago`
|
||||
const hrs = Math.floor(mins / 60)
|
||||
if (hrs < 24) return `${hrs}h ago`
|
||||
return `${Math.floor(hrs / 24)}d ago`
|
||||
}
|
||||
// Non-ISO: show as-is (plans bench uses "YYYY-MM-DD HH:MM")
|
||||
return ts.length > 16 ? ts.slice(0, 16) : ts
|
||||
}
|
||||
const days = Math.floor(hrs / 24)
|
||||
return `${days}d ago`
|
||||
})
|
||||
|
||||
function formatScore(score: number): string {
|
||||
return `${(score * 100).toFixed(1)}%`
|
||||
|
|
@ -308,42 +285,6 @@ onMounted(() => load())
|
|||
|
||||
.cta-btn:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 85%, black); }
|
||||
|
||||
/* ── Bench run table ── */
|
||||
.bench-run-table {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.3rem;
|
||||
}
|
||||
|
||||
.bench-run-row {
|
||||
display: grid;
|
||||
grid-template-columns: 6rem 1fr auto;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
.bench-type-label {
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.bench-run-time {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.bench-run-score {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.75rem;
|
||||
font-weight: 600;
|
||||
color: var(--app-primary, #2A6080);
|
||||
background: color-mix(in srgb, var(--app-primary, #2A6080) 10%, transparent);
|
||||
padding: 0.1rem 0.35rem;
|
||||
border-radius: 0.25rem;
|
||||
}
|
||||
|
||||
/* ── Job pills ── */
|
||||
.job-row {
|
||||
display: flex;
|
||||
|
|
|
|||
|
|
@ -1,705 +0,0 @@
|
|||
<template>
|
||||
<div class="embed-compare-page">
|
||||
<!-- Step indicator (non-interactive) -->
|
||||
<ol class="step-indicator" aria-label="Setup progress">
|
||||
<li :class="{ complete: corpus.length > 0 }">Corpus</li>
|
||||
<li :class="{ complete: queries.length > 0 }">Queries</li>
|
||||
<li :class="{ complete: selectedModels.length > 0 }">Models</li>
|
||||
<li :class="{ complete: hasResults }">Run & Rate</li>
|
||||
</ol>
|
||||
|
||||
<!-- Persistent aria-live region — always in DOM, never v-if -->
|
||||
<div
|
||||
ref="liveRegion"
|
||||
class="sr-live"
|
||||
aria-live="polite"
|
||||
aria-atomic="true"
|
||||
v-text="liveMessage"
|
||||
/>
|
||||
|
||||
<!-- ① Corpus section -->
|
||||
<section class="card" aria-labelledby="corpus-heading">
|
||||
<h2 id="corpus-heading">① Corpus</h2>
|
||||
<div class="corpus-controls">
|
||||
<div class="field">
|
||||
<label for="corpus-paste">Paste chunks (one per line)</label>
|
||||
<textarea
|
||||
id="corpus-paste"
|
||||
v-model="rawCorpus"
|
||||
rows="6"
|
||||
placeholder="Paste one chunk per line, or use Import below..."
|
||||
@change="onCorpusPaste"
|
||||
/>
|
||||
</div>
|
||||
<div class="import-row">
|
||||
<label for="imitate-product-select">Import from product</label>
|
||||
<select id="imitate-product-select" v-model="selectedProduct">
|
||||
<option value="">-- select product --</option>
|
||||
<option
|
||||
v-for="p in imitateProducts"
|
||||
:key="p.id"
|
||||
:value="p.id"
|
||||
>{{ p.name }}</option>
|
||||
</select>
|
||||
<button
|
||||
class="btn-secondary"
|
||||
:disabled="!selectedProduct || importing"
|
||||
@click="importCorpus"
|
||||
>
|
||||
{{ importing ? 'Importing…' : 'Import' }}
|
||||
</button>
|
||||
<span v-if="importError" class="error-text" role="alert">{{ importError }}</span>
|
||||
</div>
|
||||
<p v-if="corpus.length > 0" class="corpus-count">
|
||||
{{ corpus.length }} chunk{{ corpus.length === 1 ? '' : 's' }} loaded.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- ② Queries section -->
|
||||
<section class="card" aria-labelledby="queries-heading">
|
||||
<h2 id="queries-heading">② Queries</h2>
|
||||
<div class="field">
|
||||
<label for="query-input">Enter queries (one per line)</label>
|
||||
<textarea
|
||||
id="query-input"
|
||||
v-model="rawQueries"
|
||||
rows="4"
|
||||
placeholder="One query per line..."
|
||||
@change="onQueriesChange"
|
||||
/>
|
||||
</div>
|
||||
<p v-if="queries.length > 0" class="query-count">
|
||||
{{ queries.length }} quer{{ queries.length === 1 ? 'y' : 'ies' }}.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<!-- ③ Model selection -->
|
||||
<section class="card" aria-labelledby="models-heading">
|
||||
<h2 id="models-heading">③ Models</h2>
|
||||
<p v-if="loadingModels" class="muted">Loading models from Ollama…</p>
|
||||
<p v-else-if="modelsError" class="error-text" role="alert">{{ modelsError }}</p>
|
||||
<ul v-else class="model-list" role="list">
|
||||
<li v-for="m in availableModels" :key="m.name">
|
||||
<label class="model-checkbox">
|
||||
<input
|
||||
type="checkbox"
|
||||
:value="m.name"
|
||||
v-model="selectedModels"
|
||||
/>
|
||||
{{ m.name }}
|
||||
<span class="model-size muted" aria-label="model size">
|
||||
{{ formatBytes(m.size) }}
|
||||
</span>
|
||||
</label>
|
||||
</li>
|
||||
</ul>
|
||||
<p v-if="availableModels.length === 0 && !loadingModels && !modelsError" class="muted">
|
||||
No Ollama models found. Pull an embedding model first.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<!-- ④ Run controls -->
|
||||
<section class="card run-controls" aria-labelledby="run-heading">
|
||||
<h2 id="run-heading">④ Run</h2>
|
||||
<div class="run-row">
|
||||
<div class="field-inline">
|
||||
<label for="top-k-input">Results per query</label>
|
||||
<input
|
||||
id="top-k-input"
|
||||
type="number"
|
||||
v-model.number="topK"
|
||||
min="1"
|
||||
max="20"
|
||||
style="width: 5rem"
|
||||
/>
|
||||
</div>
|
||||
<button
|
||||
class="btn-primary"
|
||||
:disabled="!canRun || running"
|
||||
@click="startRun"
|
||||
>
|
||||
{{ running ? 'Running…' : 'Run' }}
|
||||
</button>
|
||||
<button
|
||||
v-if="running"
|
||||
class="btn-danger"
|
||||
aria-label="Cancel embedding run"
|
||||
@click="cancelRun"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
</div>
|
||||
<p v-if="!canRun && !running" class="muted">
|
||||
Fill corpus, at least one query, and select at least one model to run.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<!-- Results -->
|
||||
<section
|
||||
v-if="hasResults"
|
||||
class="card results-section"
|
||||
aria-labelledby="results-heading"
|
||||
>
|
||||
<h2 id="results-heading">Results</h2>
|
||||
|
||||
<!-- Query pagination -->
|
||||
<div class="query-nav" role="navigation" aria-label="Query navigation">
|
||||
<button
|
||||
class="btn-secondary"
|
||||
aria-label="Previous query"
|
||||
:disabled="currentQueryIdx === 0"
|
||||
@click="currentQueryIdx--"
|
||||
>‹</button>
|
||||
<span class="query-counter">
|
||||
Query {{ currentQueryIdx + 1 }} of {{ uniqueQueries.length }}:
|
||||
<em>{{ uniqueQueries[currentQueryIdx] }}</em>
|
||||
</span>
|
||||
<button
|
||||
class="btn-secondary"
|
||||
aria-label="Next query"
|
||||
:disabled="currentQueryIdx >= uniqueQueries.length - 1"
|
||||
@click="currentQueryIdx++"
|
||||
>›</button>
|
||||
</div>
|
||||
|
||||
<!-- Results table: one column per model -->
|
||||
<div class="table-wrap">
|
||||
<table class="results-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th scope="col" class="rank-col">#</th>
|
||||
<th
|
||||
v-for="model in selectedModels"
|
||||
:key="model"
|
||||
scope="col"
|
||||
>{{ model }}</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr v-for="rank in topK" :key="rank">
|
||||
<td class="rank-col muted">{{ rank }}</td>
|
||||
<td
|
||||
v-for="model in selectedModels"
|
||||
:key="model"
|
||||
class="hit-cell"
|
||||
>
|
||||
<template v-if="getHit(currentQueryIdx, model, rank - 1) as hit">
|
||||
<div class="hit-text">{{ hit.text }}</div>
|
||||
<!-- Visual score bar: decorative only -->
|
||||
<div class="score-row">
|
||||
<div class="score-bar-wrap" aria-hidden="true">
|
||||
<div class="score-bar" :style="{ width: `${hit.score * 100}%` }" />
|
||||
</div>
|
||||
<span class="score-label">{{ hit.score.toFixed(3) }}</span>
|
||||
</div>
|
||||
<!-- Rating buttons -->
|
||||
<div class="rating-row">
|
||||
<button
|
||||
class="rate-btn"
|
||||
:class="{ active: getRating(currentQueryIdx, model, hit.chunk_idx) === 'relevant' }"
|
||||
:aria-pressed="getRating(currentQueryIdx, model, hit.chunk_idx) === 'relevant'"
|
||||
aria-label="Mark as relevant"
|
||||
@click="rate(currentQueryIdx, model, hit, 'relevant')"
|
||||
>
|
||||
👍 Relevant
|
||||
</button>
|
||||
<button
|
||||
class="rate-btn rate-btn-neg"
|
||||
:class="{ active: getRating(currentQueryIdx, model, hit.chunk_idx) === 'not_relevant' }"
|
||||
:aria-pressed="getRating(currentQueryIdx, model, hit.chunk_idx) === 'not_relevant'"
|
||||
aria-label="Mark as not relevant"
|
||||
@click="rate(currentQueryIdx, model, hit, 'not_relevant')"
|
||||
>
|
||||
👎 Not relevant
|
||||
</button>
|
||||
</div>
|
||||
</template>
|
||||
<span v-else class="muted">—</span>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Export -->
|
||||
<section
|
||||
v-if="hasResults"
|
||||
class="card export-section"
|
||||
aria-labelledby="export-heading"
|
||||
>
|
||||
<h2 id="export-heading">Export Ratings</h2>
|
||||
<div class="export-row">
|
||||
<fieldset class="export-format-group">
|
||||
<legend>Format</legend>
|
||||
<label><input type="radio" v-model="exportFormat" value="csv" /> CSV</label>
|
||||
<label><input type="radio" v-model="exportFormat" value="json" /> JSON</label>
|
||||
</fieldset>
|
||||
<button class="btn-secondary" @click="exportRatings">Export</button>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted } from 'vue'
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
interface OllamaModel { name: string; size: number }
|
||||
interface ImitateProduct { id: string; name: string }
|
||||
interface HitResult { chunk_idx: number; text: string; score: number }
|
||||
interface ResultEvent {
|
||||
type: 'result'
|
||||
query_idx: number
|
||||
query: string
|
||||
model: string
|
||||
hits: HitResult[]
|
||||
}
|
||||
|
||||
// ── State ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
const rawCorpus = ref('')
|
||||
const corpus = ref<string[]>([])
|
||||
const rawQueries = ref('')
|
||||
const queries = ref<string[]>([])
|
||||
const selectedModels = ref<string[]>([])
|
||||
const topK = ref(5)
|
||||
const availableModels = ref<OllamaModel[]>([])
|
||||
const loadingModels = ref(false)
|
||||
const modelsError = ref('')
|
||||
const imitateProducts = ref<ImitateProduct[]>([])
|
||||
const selectedProduct = ref('')
|
||||
const importing = ref(false)
|
||||
const importError = ref('')
|
||||
const running = ref(false)
|
||||
const liveMessage = ref('')
|
||||
const resultEvents = ref<ResultEvent[]>([])
|
||||
const runController = ref<AbortController | null>(null)
|
||||
|
||||
const currentQueryIdx = ref(0)
|
||||
const exportFormat = ref<'csv' | 'json'>('csv')
|
||||
|
||||
type RatingMap = Record<string, Record<string, Record<number, 'relevant' | 'not_relevant'>>>
|
||||
const ratings = ref<RatingMap>({})
|
||||
|
||||
const uniqueQueries = computed(() => {
|
||||
const seen = new Set<string>()
|
||||
const out: string[] = []
|
||||
for (const e of resultEvents.value) {
|
||||
if (!seen.has(e.query)) { seen.add(e.query); out.push(e.query) }
|
||||
}
|
||||
return out
|
||||
})
|
||||
|
||||
const hasResults = computed(() => resultEvents.value.length > 0)
|
||||
const canRun = computed(
|
||||
() => corpus.value.length > 0 && queries.value.length > 0 && selectedModels.value.length > 0
|
||||
)
|
||||
|
||||
// ── Corpus helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
function onCorpusPaste() {
|
||||
const chunks = rawCorpus.value.split('\n').map(l => l.trim()).filter(Boolean)
|
||||
corpus.value = chunks
|
||||
if (chunks.length > 0) {
|
||||
liveMessage.value = `${chunks.length} chunk${chunks.length === 1 ? '' : 's'} loaded.`
|
||||
}
|
||||
}
|
||||
|
||||
function onQueriesChange() {
|
||||
queries.value = rawQueries.value.split('\n').map(l => l.trim()).filter(Boolean)
|
||||
}
|
||||
|
||||
async function importCorpus() {
|
||||
if (!selectedProduct.value) return
|
||||
importing.value = true
|
||||
importError.value = ''
|
||||
try {
|
||||
const r = await fetch(`/api/imitate/products/${selectedProduct.value}/sample-chunks`)
|
||||
if (!r.ok) {
|
||||
const text = await r.text()
|
||||
throw new Error(text || `HTTP ${r.status}`)
|
||||
}
|
||||
const data = await r.json() as { chunks?: string[] }
|
||||
const chunks = data.chunks ?? []
|
||||
corpus.value = chunks
|
||||
rawCorpus.value = chunks.join('\n')
|
||||
liveMessage.value = `${chunks.length} chunk${chunks.length === 1 ? '' : 's'} loaded from import.`
|
||||
} catch (err) {
|
||||
importError.value = String(err)
|
||||
} finally {
|
||||
importing.value = false
|
||||
}
|
||||
}
|
||||
|
||||
// ── Model loading ─────────────────────────────────────────────────────────────
|
||||
|
||||
async function loadModels() {
|
||||
loadingModels.value = true
|
||||
modelsError.value = ''
|
||||
try {
|
||||
const r = await fetch('/api/embed-bench/models')
|
||||
if (!r.ok) throw new Error(`HTTP ${r.status}`)
|
||||
const data = await r.json() as { models: OllamaModel[] }
|
||||
availableModels.value = data.models
|
||||
} catch (err) {
|
||||
modelsError.value = `Failed to load models: ${err}`
|
||||
} finally {
|
||||
loadingModels.value = false
|
||||
}
|
||||
}
|
||||
|
||||
// ── Run ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
async function startRun() {
|
||||
if (!canRun.value) return
|
||||
running.value = true
|
||||
resultEvents.value = []
|
||||
liveMessage.value = 'Starting embedding run…'
|
||||
runController.value = new AbortController()
|
||||
|
||||
try {
|
||||
const resp = await fetch('/api/embed-bench/run', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
corpus: corpus.value,
|
||||
queries: queries.value,
|
||||
models: selectedModels.value,
|
||||
top_k: topK.value,
|
||||
}),
|
||||
signal: runController.value.signal,
|
||||
})
|
||||
|
||||
const reader = resp.body!.getReader()
|
||||
const decoder = new TextDecoder()
|
||||
let buf = ''
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read()
|
||||
if (done) break
|
||||
buf += decoder.decode(value, { stream: true })
|
||||
const lines = buf.split('\n')
|
||||
buf = lines.pop() ?? ''
|
||||
for (const line of lines) {
|
||||
if (!line.startsWith('data: ')) continue
|
||||
const event = JSON.parse(line.slice(6))
|
||||
if (event.type === 'progress') {
|
||||
liveMessage.value = event.msg
|
||||
} else if (event.type === 'result') {
|
||||
resultEvents.value.push(event as ResultEvent)
|
||||
} else if (event.type === 'done') {
|
||||
liveMessage.value = 'Run complete.'
|
||||
} else if (event.type === 'error') {
|
||||
liveMessage.value = `Error: ${event.msg}`
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
if ((err as Error).name !== 'AbortError') {
|
||||
liveMessage.value = `Run failed: ${err}`
|
||||
}
|
||||
} finally {
|
||||
running.value = false
|
||||
runController.value = null
|
||||
}
|
||||
}
|
||||
|
||||
function cancelRun() {
|
||||
runController.value?.abort()
|
||||
liveMessage.value = 'Run cancelled.'
|
||||
}
|
||||
|
||||
// ── Utilities ─────────────────────────────────────────────────────────────────
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes < 1_000_000) return `${(bytes / 1000).toFixed(0)} KB`
|
||||
if (bytes < 1_000_000_000) return `${(bytes / 1_000_000).toFixed(0)} MB`
|
||||
return `${(bytes / 1_000_000_000).toFixed(1)} GB`
|
||||
}
|
||||
|
||||
function getHit(queryIdx: number, model: string, rank: number): HitResult | null {
|
||||
const query = uniqueQueries.value[queryIdx]
|
||||
if (!query) return null
|
||||
const ev = resultEvents.value.find(e => e.query === query && e.model === model)
|
||||
return ev?.hits[rank] ?? null
|
||||
}
|
||||
|
||||
function getRating(queryIdx: number, model: string, chunkIdx: number): string | undefined {
|
||||
const query = uniqueQueries.value[queryIdx]
|
||||
return ratings.value[query]?.[model]?.[chunkIdx]
|
||||
}
|
||||
|
||||
async function rate(
|
||||
queryIdx: number,
|
||||
model: string,
|
||||
hit: HitResult,
|
||||
rating: 'relevant' | 'not_relevant',
|
||||
) {
|
||||
const query = uniqueQueries.value[queryIdx]
|
||||
// Optimistic update
|
||||
if (!ratings.value[query]) ratings.value[query] = {}
|
||||
if (!ratings.value[query][model]) ratings.value[query][model] = {}
|
||||
ratings.value[query][model][hit.chunk_idx] = rating
|
||||
|
||||
try {
|
||||
await fetch('/api/embed-bench/rate', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
query,
|
||||
model,
|
||||
chunk_text: hit.text,
|
||||
chunk_idx: hit.chunk_idx,
|
||||
rating,
|
||||
}),
|
||||
})
|
||||
liveMessage.value = `Rated chunk ${hit.chunk_idx + 1} as ${rating}.`
|
||||
} catch (err) {
|
||||
liveMessage.value = `Rating failed: ${err}`
|
||||
}
|
||||
}
|
||||
|
||||
async function exportRatings() {
|
||||
const r = await fetch(`/api/embed-bench/export?format=${exportFormat.value}`)
|
||||
if (!r.ok) {
|
||||
liveMessage.value = `Export failed: HTTP ${r.status}`
|
||||
return
|
||||
}
|
||||
const blob = await r.blob()
|
||||
const disposition = r.headers.get('Content-Disposition') ?? ''
|
||||
const filenameMatch = disposition.match(/filename="([^"]+)"/)
|
||||
const filename = filenameMatch ? filenameMatch[1] : `embed_comparison.${exportFormat.value}`
|
||||
const url = URL.createObjectURL(blob)
|
||||
const a = document.createElement('a')
|
||||
a.href = url
|
||||
a.download = filename
|
||||
a.click()
|
||||
URL.revokeObjectURL(url)
|
||||
liveMessage.value = `Exported ${filename}.`
|
||||
}
|
||||
|
||||
// ── Lifecycle ─────────────────────────────────────────────────────────────────
|
||||
|
||||
onMounted(() => {
|
||||
loadModels()
|
||||
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.embed-compare-page {
|
||||
padding: var(--space-4, 1.5rem);
|
||||
max-width: 1100px;
|
||||
}
|
||||
|
||||
/* Step indicator */
|
||||
.step-indicator {
|
||||
display: flex;
|
||||
gap: 0;
|
||||
list-style: none;
|
||||
margin: 0 0 var(--space-4, 1.5rem);
|
||||
padding: 0;
|
||||
border-bottom: 2px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.step-indicator li {
|
||||
padding: 0.4rem 1rem;
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text-muted, #4a5c7a);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
border-bottom: 2px solid transparent;
|
||||
margin-bottom: -2px;
|
||||
}
|
||||
.step-indicator li.complete {
|
||||
color: var(--app-primary, #2A6080);
|
||||
border-bottom-color: var(--app-primary, #2A6080);
|
||||
}
|
||||
|
||||
/* Accessibility: screen-reader live region — visually hidden but always present */
|
||||
.sr-live {
|
||||
position: absolute;
|
||||
width: 1px; height: 1px;
|
||||
overflow: hidden;
|
||||
clip: rect(0 0 0 0);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* Cards */
|
||||
.card {
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: var(--radius-md, 0.5rem);
|
||||
padding: var(--space-4, 1.5rem);
|
||||
margin-bottom: var(--space-4, 1.5rem);
|
||||
}
|
||||
.card h2 {
|
||||
font-size: 1rem;
|
||||
font-weight: 700;
|
||||
margin: 0 0 var(--space-3, 1rem);
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.field { display: flex; flex-direction: column; gap: 0.3rem; margin-bottom: 0.75rem; }
|
||||
.field label { font-size: 0.85rem; font-weight: 600; }
|
||||
textarea, input[type="number"] {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: var(--radius-sm, 0.25rem);
|
||||
padding: 0.5rem;
|
||||
font-size: 0.875rem;
|
||||
background: var(--color-surface, #f0f4fb);
|
||||
color: var(--color-text, #1a2338);
|
||||
resize: vertical;
|
||||
}
|
||||
|
||||
.corpus-controls { display: flex; flex-direction: column; gap: 0.5rem; }
|
||||
.import-row {
|
||||
display: flex; flex-wrap: wrap; gap: 0.5rem; align-items: center;
|
||||
}
|
||||
.import-row label { font-size: 0.85rem; font-weight: 600; }
|
||||
.corpus-count, .query-count { font-size: 0.875rem; color: var(--app-primary, #2A6080); margin: 0; }
|
||||
|
||||
.model-list { list-style: none; padding: 0; margin: 0; display: flex; flex-wrap: wrap; gap: 0.5rem; }
|
||||
.model-checkbox {
|
||||
display: flex; align-items: center; gap: 0.4rem;
|
||||
font-size: 0.875rem; cursor: pointer;
|
||||
padding: 0.3rem 0.6rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: var(--radius-sm, 0.25rem);
|
||||
background: var(--color-surface, #f0f4fb);
|
||||
}
|
||||
.model-size { font-size: 0.75rem; }
|
||||
|
||||
.run-row { display: flex; flex-wrap: wrap; gap: 0.75rem; align-items: flex-end; }
|
||||
.field-inline { display: flex; align-items: center; gap: 0.4rem; }
|
||||
.field-inline label { font-size: 0.85rem; font-weight: 600; white-space: nowrap; }
|
||||
|
||||
.btn-primary, .btn-secondary, .btn-danger {
|
||||
padding: 0.4rem 1rem;
|
||||
border-radius: var(--radius-sm, 0.25rem);
|
||||
border: 1px solid transparent;
|
||||
font-size: 0.875rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-primary { background: var(--app-primary, #2A6080); color: #fff; }
|
||||
.btn-primary:hover:not(:disabled) { filter: brightness(1.1); }
|
||||
.btn-primary:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
.btn-secondary { background: var(--color-surface, #f0f4fb); color: var(--color-text, #1a2338); border-color: var(--color-border, #d0d7e8); }
|
||||
.btn-secondary:hover:not(:disabled) { background: var(--color-border, #d0d7e8); }
|
||||
.btn-secondary:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
.btn-danger { background: var(--color-error, #c0392b); color: #fff; }
|
||||
|
||||
.muted { color: var(--color-text-muted, #4a5c7a); font-size: 0.875rem; }
|
||||
.error-text { color: var(--color-error, #c0392b); font-size: 0.875rem; }
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.import-row { flex-direction: column; align-items: flex-start; }
|
||||
.run-row { flex-direction: column; }
|
||||
.model-list { flex-direction: column; }
|
||||
}
|
||||
|
||||
/* Results table */
|
||||
.table-wrap { overflow-x: auto; }
|
||||
.results-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
.results-table thead th {
|
||||
position: sticky;
|
||||
top: 0;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 2px solid var(--color-border, #d0d7e8);
|
||||
padding: 0.5rem 0.75rem;
|
||||
text-align: left;
|
||||
font-weight: 700;
|
||||
white-space: nowrap;
|
||||
z-index: 1;
|
||||
}
|
||||
.results-table td {
|
||||
padding: 0.5rem 0.75rem;
|
||||
vertical-align: top;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.rank-col { width: 2rem; text-align: center; }
|
||||
|
||||
.hit-text { margin-bottom: 0.25rem; line-height: 1.4; }
|
||||
|
||||
.score-row { display: flex; align-items: center; gap: 0.4rem; margin-bottom: 0.25rem; }
|
||||
.score-bar-wrap {
|
||||
flex: 1;
|
||||
height: 6px;
|
||||
background: var(--color-border, #d0d7e8);
|
||||
border-radius: 3px;
|
||||
overflow: hidden;
|
||||
}
|
||||
.score-bar {
|
||||
height: 100%;
|
||||
background: var(--app-primary, #2A6080);
|
||||
border-radius: 3px;
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
.score-label { font-size: 0.75rem; color: var(--color-text-muted, #4a5c7a); min-width: 3rem; text-align: right; }
|
||||
|
||||
.rating-row { display: flex; gap: 0.4rem; flex-wrap: wrap; }
|
||||
.rate-btn {
|
||||
padding: 0.2rem 0.5rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: var(--radius-sm, 0.25rem);
|
||||
background: var(--color-surface, #f0f4fb);
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.75rem;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s, border-color 0.15s;
|
||||
}
|
||||
.rate-btn.active {
|
||||
background: color-mix(in srgb, var(--app-primary, #2A6080) 20%, transparent);
|
||||
border-color: var(--app-primary, #2A6080);
|
||||
font-weight: 700;
|
||||
}
|
||||
.rate-btn-neg.active {
|
||||
background: color-mix(in srgb, var(--color-error, #c0392b) 15%, transparent);
|
||||
border-color: var(--color-error, #c0392b);
|
||||
}
|
||||
|
||||
/* Query nav */
|
||||
.query-nav {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.75rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.query-counter { font-size: 0.875rem; flex: 1; }
|
||||
|
||||
/* Export */
|
||||
.export-row { display: flex; gap: 1rem; align-items: center; flex-wrap: wrap; }
|
||||
.export-format-group {
|
||||
border: none;
|
||||
padding: 0;
|
||||
display: flex;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
.export-format-group legend {
|
||||
font-size: 0.85rem;
|
||||
font-weight: 600;
|
||||
margin-bottom: 0.25rem;
|
||||
float: left;
|
||||
margin-right: 0.5rem;
|
||||
}
|
||||
.export-format-group label { font-size: 0.875rem; display: flex; align-items: center; gap: 0.3rem; }
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.results-table thead th,
|
||||
.results-table td { padding: 0.35rem 0.4rem; font-size: 0.8rem; }
|
||||
.query-nav { flex-direction: column; align-items: flex-start; }
|
||||
}
|
||||
|
||||
@media (prefers-reduced-motion: reduce) {
|
||||
.score-bar { transition: none; }
|
||||
}
|
||||
</style>
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
<template>
|
||||
<EmbedCompareTab />
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import EmbedCompareTab from './EmbedCompareTab.vue'
|
||||
</script>
|
||||
|
|
@ -302,7 +302,7 @@ const llmModelBadge = computed(() => {
|
|||
const llmTaskTypeCols = computed(() => {
|
||||
const types = new Set<string>()
|
||||
for (const r of llmResults.value) {
|
||||
for (const k of Object.keys(r.quality_by_task_type ?? {})) types.add(k)
|
||||
for (const k of Object.keys(r.quality_by_task_type)) types.add(k)
|
||||
}
|
||||
return [...types].sort()
|
||||
})
|
||||
|
|
@ -338,7 +338,7 @@ const llmBestByCol = computed((): Record<string, string> => {
|
|||
for (const col of llmTaskTypeCols.value) {
|
||||
bestId = ''; bestVal = -Infinity
|
||||
for (const r of llmResults.value) {
|
||||
const v = r.quality_by_task_type?.[col]
|
||||
const v = r.quality_by_task_type[col]
|
||||
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
|
||||
}
|
||||
best[col] = bestId
|
||||
|
|
|
|||
Loading…
Reference in a new issue