From bcd321367e0cc060c850b00ae604992ffc99a96d Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Wed, 13 May 2026 23:01:16 -0700
Subject: [PATCH] feat: GET /api/library/sample-chunks for Avocet embed bench
 (closes #6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns up to N randomly sampled page chunks (default 50, max 200) with
chunk_id, doc_id, page_number, and text fields. No tier gate — internal
tool endpoint for same-host corpus benchmarking. Returns [] on empty library.
---
 app/api/library.py        | 23 +++++++++++++++++++++++
 tests/test_library_api.py | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/app/api/library.py b/app/api/library.py
index 2ce57d0..98c396d 100644
--- a/app/api/library.py
+++ b/app/api/library.py
@@ -90,6 +90,29 @@ def _run_ingest_background(
         _task_registry[task_id] = {"status": "error", "error": str(exc)}
 
 
+@router.get("/sample-chunks")
+def sample_chunks(
+    limit: int = 50,
+    db: sqlite3.Connection = Depends(get_db),
+) -> list[dict]:
+    """Sample random page chunks for corpus benchmarking.
+
+    `limit` is clamped into 1..200 rather than rejected. The endpoint has
+    no tier gate (internal tool, same-host access only) and returns []
+    when the query matches no rows.
+    """
+    capped = max(1, min(limit, 200))
+    cursor = db.execute(
+        "SELECT id, doc_id, page_number, text FROM page_chunks ORDER BY RANDOM() LIMIT ?",
+        [capped],
+    )
+    key_to_column = {"chunk_id": "id", "doc_id": "doc_id", "page_number": "page_number", "text": "text"}
+    return [
+        {key: row[column] for key, column in key_to_column.items()}
+        for row in cursor.fetchall()
+    ]
+
+
 @router.get("")
 def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
     rows = db.execute(
diff --git a/tests/test_library_api.py b/tests/test_library_api.py
index b1a16a5..7171406 100644
--- a/tests/test_library_api.py
+++ b/tests/test_library_api.py
@@ -66,3 +66,40 @@ def test_reingest_updates_status_to_processing(client, test_db, tmp_path):
     # Document should be in processing state (or beyond if stub ingest ran instantly)
     status_resp = client.get(f"/api/library/{doc_id}/status")
     assert status_resp.json()["status"] in ("processing", "error", "ready")
+
+
+def _add_chunks(db_path: str, doc_id: str, count: int) -> None:
+    """Insert `count` page chunks (pages 1..count) for `doc_id`; the connection is closed even on error."""
+    pages = [(doc_id, n, f"Page {n} text content.", "text_layer", 4) for n in range(1, count + 1)]
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.executemany("INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)", pages)
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def test_sample_chunks_empty_returns_empty(client):
+    """Without any chunks added by this test, the endpoint answers 200 with []."""
+    response = client.get("/api/library/sample-chunks")
+    assert response.status_code == 200 and response.json() == []
+
+
+def test_sample_chunks_returns_fields(client, test_db):
+    """Each sampled chunk carries exactly the four expected keys and the inserted doc_id."""
+    book_id = _add_doc(test_db, "Monster Manual", "/books/mm.pdf")
+    _add_chunks(test_db, book_id, 5)
+    response = client.get("/api/library/sample-chunks?limit=3")
+    assert response.status_code == 200
+    sampled = response.json()
+    assert len(sampled) == 3
+    for chunk in sampled:
+        assert set(chunk) == {"chunk_id", "doc_id", "page_number", "text"} and chunk["doc_id"] == book_id
+
+
+def test_sample_chunks_limit_capped_at_200(client, test_db):
+    """An oversized `limit` is clamped to 200 rows even when more chunks exist."""
+    doc_id = _add_doc(test_db, "Big Book", "/books/big.pdf")
+    _add_chunks(test_db, doc_id, 201)  # one past the cap; with fewer rows the check passes even without a cap
+    resp = client.get("/api/library/sample-chunks?limit=9999")
+    assert resp.status_code == 200 and len(resp.json()) == 200