85 lines
2.6 KiB
Python
85 lines
2.6 KiB
Python
"""Avocet — embedding model comparison harness.
|
|
|
|
Exposes FastAPI routes under /api/embed-bench (mounted via app/eval/cforch.py).
|
|
All computation is local: no LLM inference, Ollama only. MIT tier throughout.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import io
|
|
import json
|
|
import logging
|
|
import math
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import httpx
|
|
import yaml
|
|
from fastapi import APIRouter, HTTPException
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel, field_validator
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Directory three levels above this file; presumably the repository root — TODO confirm.
_ROOT = Path(__file__).parent.parent.parent
# When not None, _config_file() and _ratings_path() resolve inside this directory.
_CONFIG_DIR: Path | None = None  # override via set_config_dir() in tests
# NOTE(review): not referenced in the visible part of the file; presumably a
# guard against overlapping benchmark runs — confirm against the route handlers.
_RUN_ACTIVE: bool = False
# Default ratings file, returned by _ratings_path() when no override directory is set.
_RATINGS_FILE = _ROOT / "data" / "embed_bench_ratings.jsonl"

# FastAPI router for this module; per the module docstring it is mounted
# under /api/embed-bench. Route handlers are outside the visible portion.
router = APIRouter()
|
|
|
|
|
|
# ── Testability seam ──────────────────────────────────────────────────────────
|
|
|
|
def set_config_dir(path: Path | None) -> None:
|
|
global _CONFIG_DIR
|
|
_CONFIG_DIR = path
|
|
|
|
|
|
# ── Internal helpers ──────────────────────────────────────────────────────────
|
|
|
|
def _config_file() -> Path:
    """Return the location of ``label_tool.yaml``.

    The file is looked up in the override directory when one has been set
    via ``set_config_dir()``; otherwise in the ``config`` directory under
    ``_ROOT``.
    """
    # Conditional expression keeps the default path unevaluated when overridden.
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
|
|
|
|
|
|
def _load_config() -> dict[str, Any]:
    """Load the YAML file named by ``_config_file()`` as a dictionary.

    Loading is best-effort so that callers can always chain ``.get()`` on
    the result:

    * missing file -> ``{}``
    * empty / falsy document -> ``{}``
    * malformed YAML -> ``{}`` (logged as a warning)
    * top level is not a mapping, e.g. a list or a bare string -> ``{}``
      (logged as a warning)

    Returns:
        The parsed top-level mapping, or an empty dict in the cases above.
    """
    f = _config_file()
    if not f.exists():
        return {}
    try:
        loaded = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError as exc:
        logger.warning("Failed to parse embed_bench config %s: %s", f, exc)
        return {}
    # A syntactically valid document can still have a list or scalar at the
    # top level; handing that back would break every caller's `.get()`.
    if not isinstance(loaded, dict):
        logger.warning(
            "Ignoring embed_bench config %s: top level is %s, expected a mapping",
            f,
            type(loaded).__name__,
        )
        return {}
    return loaded
|
|
|
|
|
|
def _ollama_url() -> str:
    """Resolve the Ollama base URL from the loaded config.

    Precedence (first non-empty value wins):

    1. ``embed_bench.ollama_url``
    2. ``cforch.ollama_url``
    3. ``"http://localhost:11434"``

    Returns:
        The URL to use. A key that is present but null or empty is treated
        the same as a missing key, so the built-in default is still reached.
    """
    cfg = _load_config()
    # `or {}` covers both a missing section and a section written as `key:` (null).
    embed_cfg = cfg.get("embed_bench") or {}
    cforch_cfg = cfg.get("cforch") or {}
    return (
        embed_cfg.get("ollama_url")
        # Chain with `or` rather than a .get() default: a default only applies
        # when the key is absent, so a null value would otherwise leak out.
        or cforch_cfg.get("ollama_url")
        or "http://localhost:11434"
    )
|
|
|
|
|
|
def _ratings_path() -> Path:
    """Return the path of the ratings JSONL file.

    Without an override directory this is ``_RATINGS_FILE``; with one (see
    ``set_config_dir()``) it is ``embed_bench_ratings.jsonl`` inside that
    directory.
    """
    if _CONFIG_DIR is None:
        return _RATINGS_FILE
    return _CONFIG_DIR / "embed_bench_ratings.jsonl"
|
|
|
|
|
|
def _cosine(a: list[float], b: list[float]) -> float:
|
|
if len(a) != len(b):
|
|
raise ValueError(
|
|
f"Embedding dimension mismatch: {len(a)} vs {len(b)}"
|
|
)
|
|
dot = sum(x * y for x, y in zip(a, b))
|
|
mag_a = math.sqrt(sum(x * x for x in a))
|
|
mag_b = math.sqrt(sum(x * x for x in b))
|
|
if mag_a == 0.0 or mag_b == 0.0:
|
|
return 0.0
|
|
return dot / (mag_a * mag_b)
|