feat: add embed_bench module scaffold and _cosine() helper

This commit is contained in:
pyr0ball 2026-05-07 08:37:18 -07:00
parent 6f9aad126e
commit 276bdadb92
2 changed files with 131 additions and 0 deletions

81
app/eval/embed_bench.py Normal file
View file

@ -0,0 +1,81 @@
"""Avocet — embedding model comparison harness.
Exposes FastAPI routes under /api/embed-bench (mounted via app/eval/cforch.py).
All computation is local: no LLM inference, Ollama only. MIT tier throughout.
"""
from __future__ import annotations
import csv
import io
import json
import logging
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
import yaml
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, field_validator
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Directory three `.parent` hops up from this file; base for the config/ and data/ paths below.
_ROOT = Path(__file__).parent.parent.parent
_CONFIG_DIR: Path | None = None # override via set_config_dir() in tests
# NOTE(review): presumably marks a benchmark run as in progress — nothing in the
# visible code reads it (the test fixture only resets it to False); confirm usage.
_RUN_ACTIVE: bool = False
# Default ratings file; _ratings_path() returns it when _CONFIG_DIR is None.
_RATINGS_FILE = _ROOT / "data" / "embed_bench_ratings.jsonl"
# NOTE(review): no routes are registered on this router in the visible code; the
# module docstring says it is mounted under /api/embed-bench via app/eval/cforch.py.
router = APIRouter()
# ── Testability seam ──────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
global _CONFIG_DIR
_CONFIG_DIR = path
# ── Internal helpers ──────────────────────────────────────────────────────────
def _config_file() -> Path:
    """Return the path of ``label_tool.yaml``.

    The file lives directly in ``_CONFIG_DIR`` when an override is set,
    otherwise in ``<_ROOT>/config``.
    """
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
def _load_config() -> dict:
    """Load the YAML config file and return its top-level mapping.

    Returns:
        The parsed mapping, or ``{}`` when the file is missing, unreadable,
        empty, not valid YAML, or does not contain a mapping at the top
        level. Every failure except a missing file is logged as a warning.
    """
    path = _config_file()
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # A missing config is the normal "use defaults" case; stay quiet.
        return {}
    except (OSError, UnicodeDecodeError) as exc:
        logger.warning("Failed to read embed_bench config %s: %s", path, exc)
        return {}

    try:
        parsed = yaml.safe_load(raw)
    except yaml.YAMLError as exc:
        logger.warning("Failed to parse embed_bench config %s: %s", path, exc)
        return {}

    if parsed is None:
        # Empty document.
        return {}
    if not isinstance(parsed, dict):
        # A top-level list or scalar is valid YAML, but callers use .get() on
        # the result, so anything other than a mapping counts as no config.
        logger.warning(
            "Ignoring embed_bench config %s: expected a mapping, got %s",
            path,
            type(parsed).__name__,
        )
        return {}
    return parsed
def _ollama_url() -> str:
    """Return the configured Ollama base URL.

    Lookup order: ``embed_bench.ollama_url``, then ``cforch.ollama_url``,
    then the built-in default ``http://localhost:11434``. A key that is
    present but null or empty counts as unset, so the default still applies.
    """
    cfg = _load_config()
    if not isinstance(cfg, dict):
        cfg = {}
    for section_name in ("embed_bench", "cforch"):
        section = cfg.get(section_name)
        # A section may be absent, null, or not a mapping; skip it rather
        # than fail on .get().
        if not isinstance(section, dict):
            continue
        url = section.get("ollama_url")
        if url:
            return url
    return "http://localhost:11434"
def _ratings_path() -> Path:
    """Return the path of the ratings JSONL file.

    Uses ``_RATINGS_FILE`` unless a config-dir override is set, in which
    case the file is placed directly inside that directory.
    """
    if _CONFIG_DIR is None:
        return _RATINGS_FILE
    return _CONFIG_DIR / "embed_bench_ratings.jsonl"
def _cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector.
        b: Second vector; must have the same number of components as *a*.

    Returns:
        The dot product divided by the product of the Euclidean norms,
        clamped to ``[-1.0, 1.0]``. Returns ``0.0`` when either vector has
        zero magnitude, which includes two empty vectors.

    Raises:
        ValueError: If *a* and *b* differ in length.
    """
    if len(a) != len(b):
        # zip() would silently drop the tail of the longer vector while the
        # norms below still cover every component, giving a meaningless score.
        raise ValueError(
            f"cosine similarity needs equal-length vectors, got {len(a)} and {len(b)}"
        )
    dot = sum(x * y for x, y in zip(a, b))
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(x * x for x in b))
    if mag_a == 0.0 or mag_b == 0.0:
        return 0.0
    similarity = dot / (mag_a * mag_b)
    # Rounding can push the quotient a hair outside [-1, 1]. Explicit
    # comparisons are used so a NaN result passes through unchanged instead
    # of being turned into a bound.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity

50
tests/test_embed_bench.py Normal file
View file

@ -0,0 +1,50 @@
"""Tests for app/eval/embed_bench.py."""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from fastapi.testclient import TestClient
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def reset_embed_bench_globals(tmp_path):
    """Point embed_bench at a per-test temp dir and clear the running flag.

    Yields ``tmp_path``. After the test, both module globals are restored to
    the values they held before the fixture ran.
    """
    from app.eval import embed_bench

    saved_dir, saved_flag = embed_bench._CONFIG_DIR, embed_bench._RUN_ACTIVE
    embed_bench.set_config_dir(tmp_path)
    embed_bench._RUN_ACTIVE = False

    yield tmp_path

    embed_bench.set_config_dir(saved_dir)
    embed_bench._RUN_ACTIVE = saved_flag
@pytest.fixture
def client():
    """Return a TestClient wrapping the ``app`` object exported by ``app.api``."""
    from app.api import app as api_app

    return TestClient(api_app)
# ── cosine helper ──────────────────────────────────────────────────────────────
def test_cosine_identical():
    """Two equal vectors have similarity 1."""
    from app.eval.embed_bench import _cosine

    left = [1.0, 0.0]
    right = [1.0, 0.0]
    assert _cosine(left, right) == pytest.approx(1.0)
def test_cosine_orthogonal():
    """Perpendicular vectors have similarity 0."""
    from app.eval.embed_bench import _cosine

    x_axis = [1.0, 0.0]
    y_axis = [0.0, 1.0]
    assert _cosine(x_axis, y_axis) == pytest.approx(0.0)
def test_cosine_opposite():
    """Vectors pointing in opposite directions have similarity -1."""
    from app.eval.embed_bench import _cosine

    forward = [1.0, 0.0]
    backward = [-1.0, 0.0]
    assert _cosine(forward, backward) == pytest.approx(-1.0)