diff --git a/app/api.py b/app/api.py index eca231f..70c889c 100644 --- a/app/api.py +++ b/app/api.py @@ -70,6 +70,9 @@ def finetune_cancel_compat() -> dict: from app.data.log_corpus import router as log_corpus_router app.include_router(log_corpus_router, prefix="/api/corpus") +from app.data.recipe_scan import router as recipe_scan_router +app.include_router(recipe_scan_router, prefix="/api/recipe-scan") + from app.dashboard import router as dashboard_router app.include_router(dashboard_router, prefix="/api") diff --git a/app/data/recipe_scan.py b/app/data/recipe_scan.py new file mode 100644 index 0000000..93fcda9 --- /dev/null +++ b/app/data/recipe_scan.py @@ -0,0 +1,313 @@ +"""Avocet — Recipe scan labeling API (avocet#65). + +Receives recipe scan items from the Kiwi pipeline (scanner/phone image + +docuvision OCR extraction + ground-truth structured recipe), presents them +for human review, and exports approved/edited pairs in the messages chat +format for the vision fine-tune harness. + +DB: data/recipe_scan.db (separate from corpus.db — different lifecycle) +No auth required — local admin tool, not a push endpoint. + +All endpoints registered on `router`. api.py includes this with +prefix="/api/recipe-scan". 
@contextmanager
def _db() -> Generator[sqlite3.Connection, None, None]:
    """Yield a SQLite connection to ``_DB_PATH`` wrapped in one transaction.

    The connection uses ``sqlite3.Row`` rows and WAL journaling. The
    transaction is committed when the ``with`` body finishes normally and
    rolled back when it raises; the connection is closed in every case.

    Yields:
        An open ``sqlite3.Connection``.

    Raises:
        Exception: whatever the ``with`` body (or connection setup) raised,
            re-raised after the rollback.
    """
    # sqlite3.connect() creates the database file but not its directory, so
    # make sure the parent exists before the first connection is opened.
    _DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(_DB_PATH))
    try:
        # Setup lives inside the try so a failing PRAGMA still reaches the
        # finally clause instead of leaking the connection.
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
class ImportItem(BaseModel):
    """One scan item submitted to POST /import.

    ``id`` is optional: when it is omitted, empty, or null, a fresh UUID4
    string is generated so the item gets its own primary key.
    """

    # Pydantic v2 skips field validators for default values unless
    # validate_default is enabled. Without it an omitted ``id`` stayed "" and
    # every id-less item collided on the same primary key, so all but the
    # first were silently dropped by INSERT OR IGNORE.
    model_config = {"validate_default": True}

    id: str = ""
    image_path: str
    modality: Literal["scanner", "phone", "handwritten"] = "scanner"
    source: str = "purple_carrot"
    extracted: dict
    ground_truth: dict

    @field_validator("id", mode="before")
    @classmethod
    def default_id(cls, v: str) -> str:
        """Return the supplied id, or a new UUID4 string when it is falsy."""
        return v or str(uuid.uuid4())
@router.post("/items/{item_id}/approve")
def approve_item(item_id: str) -> dict:
    """Mark an item as approved, accepting its ground truth as the target.

    Any correction or rejection reason left over from an earlier edit or
    reject of the same item is cleared. The exporter prefers a non-empty
    ``corrected`` column over ``ground_truth``, so a stale correction would
    otherwise be exported for an item whose status says the ground truth
    was accepted as-is.

    Args:
        item_id: Primary key of the item to approve.

    Returns:
        ``{"status": "approved", "id": item_id}``.

    Raises:
        HTTPException: 404 when no item has the given id.
    """
    with _db() as conn:
        # A single UPDATE doubles as the existence check via rowcount.
        updated = conn.execute(
            "UPDATE recipe_scan_items "
            "SET status='approved', corrected=NULL, rejected_reason=NULL, labeled_at=? "
            "WHERE id=?",
            (_now_iso(), item_id),
        ).rowcount
        if updated == 0:
            raise HTTPException(404, "Item not found")
    return {"status": "approved", "id": item_id}
class RejectBody(BaseModel):
    # Optional JSON body for POST /items/{item_id}/reject.
    # reason: free-text explanation for the rejection. reject_item passes
    # `body.reason or None`, so an empty string is stored as NULL in the
    # rejected_reason column.
    reason: str = ""
@router.get("/image")
def serve_image(path: str) -> StreamingResponse:
    """Stream a scan image stored under ``_IMAGE_ROOT``.

    Args:
        path: Filesystem path of the image. It is resolved (``..`` segments
            and symlinks collapsed) before being checked against the root,
            so traversal attempts are rejected.

    Returns:
        A streaming response with the file bytes. The media type is chosen
        from the file suffix (``application/octet-stream`` for unknown
        suffixes) and the response is cacheable for one day.

    Raises:
        HTTPException: 403 when the resolved path is outside ``_IMAGE_ROOT``;
            404 when it is not a readable regular file.
    """
    try:
        root = _IMAGE_ROOT.resolve()
        resolved = Path(path).resolve()
        resolved.relative_to(root)
    except (ValueError, OSError):
        raise HTTPException(403, "Path outside allowed image directory") from None

    # exists() is also true for directories, which would then blow up in
    # open() and surface as a 500 instead of a 404.
    if not resolved.is_file():
        raise HTTPException(404, "Image not found")

    media_types = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".webp": "image/webp"}
    media_type = media_types.get(resolved.suffix.lower(), "application/octet-stream")

    # Open eagerly so a file that vanished or is unreadable becomes a clean
    # 404 rather than an error in the middle of the response.
    try:
        handle = resolved.open("rb")
    except OSError:
        raise HTTPException(404, "Image not found") from None

    def _chunks() -> Generator[bytes, None, None]:
        # Iterating a binary file object directly splits on b"\n", giving
        # arbitrary chunk sizes for image data, and never closes the handle.
        # Fixed-size reads inside `with` fix both.
        with handle:
            while chunk := handle.read(64 * 1024):
                yield chunk

    return StreamingResponse(
        _chunks(),
        media_type=media_type,
        headers={"Cache-Control": "public, max-age=86400"},
    )
def test_next_returns_pending_item(client):
    """GET /next returns the imported pending item with decoded JSON payloads."""
    item = _item()
    _import(client, [item])
    resp = client.get("/api/recipe-scan/next")
    assert resp.status_code == 200
    data = resp.json()
    assert data["id"] == item["id"]
    assert data["status"] == "pending"
    # Key presence alone would also pass if the endpoint returned the raw
    # JSON strings stored in SQLite; compare against the imported objects.
    assert data["extracted"] == EXTRACTED
    assert data["ground_truth"] == GROUND_TRUTH
def test_edit_stores_corrected_json(client):
    """POST /edit marks the item edited and persists the corrected JSON."""
    item = _item()
    _import(client, [item])
    corrected = {**GROUND_TRUTH, "servings": 4}
    resp = client.post(
        f"/api/recipe-scan/items/{item['id']}/edit",
        json={"corrected": corrected},
    )
    assert resp.status_code == 200
    assert resp.json()["status"] == "edited"
    stats = client.get("/api/recipe-scan/stats").json()
    assert stats["by_status"]["edited"] == 1
    # The status alone does not prove the correction was stored; read it back
    # through the export, where it becomes the assistant message.
    exported = client.get("/api/recipe-scan/export").text.strip().splitlines()
    assert len(exported) == 1
    pair = json.loads(exported[0])
    assert json.loads(pair["messages"][1]["content"]) == corrected
def test_export_includes_approved_item(client):
    """An approved item is exported as one pair targeting its ground truth."""
    item = _item()
    _import(client, [item])
    client.post(f"/api/recipe-scan/items/{item['id']}/approve")
    resp = client.get("/api/recipe-scan/export")
    assert resp.status_code == 200
    lines = [line for line in resp.text.strip().splitlines() if line]
    assert len(lines) == 1
    pair = json.loads(lines[0])
    assert pair["id"] == item["id"]
    assert pair["modality"] == "scanner"
    assert "messages" in pair
    assert len(pair["messages"]) == 2
    assert pair["messages"][0]["role"] == "user"
    assert pair["messages"][1]["role"] == "assistant"
    # With no human correction, the assistant target is the ground truth.
    assert json.loads(pair["messages"][1]["content"]) == GROUND_TRUTH
== 1 + assert stats["by_status"]["edited"] == 1 + assert stats["by_status"]["rejected"] == 1 + assert stats["export_ready"] == 2 # approved + edited diff --git a/web/src/components/AppSidebar.vue b/web/src/components/AppSidebar.vue index 3f9fd64..9db9671 100644 --- a/web/src/components/AppSidebar.vue +++ b/web/src/components/AppSidebar.vue @@ -220,6 +220,7 @@ const dataItems: NavItem[] = [ { path: '/data/fetch', icon: '📬', label: 'Fetch' }, { path: '/data/corrections', icon: '✏️', label: 'Corrections' }, { path: '/data/imitate', icon: '🪞', label: 'Imitate' }, + { path: '/data/recipe-scan', icon: '📷', label: 'Recipe Scan' }, ] const evalItems: NavItem[] = [ diff --git a/web/src/router/index.ts b/web/src/router/index.ts index 0665b9c..36a45d2 100644 --- a/web/src/router/index.ts +++ b/web/src/router/index.ts @@ -26,6 +26,7 @@ export const routes = [ { path: '/data/fetch', component: FetchView, meta: { title: 'Fetch' } }, { path: '/data/corrections', component: CorrectionsView, meta: { title: 'Corrections' } }, { path: '/data/imitate', component: ImitateView, meta: { title: 'Imitate' } }, + { path: '/data/recipe-scan', component: () => import('../views/RecipeScanView.vue'), meta: { title: 'Recipe Scan' } }, // ── Eval domain ────────────────────────────────────────── { path: '/eval/benchmark', component: BenchmarkView, meta: { title: 'Benchmark' } }, diff --git a/web/src/views/RecipeScanView.vue b/web/src/views/RecipeScanView.vue new file mode 100644 index 0000000..8a7a218 --- /dev/null +++ b/web/src/views/RecipeScanView.vue @@ -0,0 +1,536 @@ + + + + + Recipe Scan Review + + {{ stats.by_status?.pending ?? 0 }} pending + {{ stats.by_status?.approved ?? 0 }} approved + {{ stats.by_status?.edited ?? 0 }} edited + {{ stats.by_status?.rejected ?? 0 }} rejected + + ⬇ Export {{ stats.export_ready }} pairs + + + + + + + + + + + + {{ apiError }} + Retry + + + + + Queue is empty — all items reviewed. + Import items from the Kiwi pipeline to continue. 
+ + + + + + + + {{ item.modality }} + {{ item.source }} + + + + + Image not available + {{ item.image_path }} + + + + + + + + + + Ground truth reference + {{ prettyJson(item.ground_truth) }} + + + + + + Extracted + edit before approving + + + {{ jsonError }} + + + + + + ✓ Approve + + + ✎ Approve edited + + + ✕ Reject + + + + + + + + + + {{ toast }} + + + + + + + +
{{ apiError }}
Queue is empty — all items reviewed.
Import items from the Kiwi pipeline to continue.
{{ item.image_path }}
{{ prettyJson(item.ground_truth) }}
{{ jsonError }}