From bcd321367e0cc060c850b00ae604992ffc99a96d Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Wed, 13 May 2026 23:01:16 -0700
Subject: [PATCH] feat: GET /api/library/sample-chunks for Avocet embed bench (closes #6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns up to N randomly sampled page chunks (default 50, max 200) with
chunk_id, doc_id, page_number, and text fields. A limit outside 1..200 is
clamped into that range rather than rejected. No tier gate — internal tool
endpoint for same-host corpus benchmarking. Returns [] on empty library.
---
Reviewer note: hunk line counts and the diffstat below were recomputed for
the revised content. The post-image blob ids on the `index` lines are stale
and the leading whitespace of context lines was reconstructed; regenerate
this patch with `git format-patch` after applying.

 app/api/library.py        | 33 +++++++++++++++++++++++++++++++++
 tests/test_library_api.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/app/api/library.py b/app/api/library.py
index 2ce57d0..98c396d 100644
--- a/app/api/library.py
+++ b/app/api/library.py
@@ -90,6 +90,39 @@ def _run_ingest_background(
         _task_registry[task_id] = {"status": "error", "error": str(exc)}
 
 
+# Hard ceiling on rows returned by /sample-chunks, whatever the caller asks for.
+_SAMPLE_CHUNKS_MAX_LIMIT = 200
+
+
+@router.get("/sample-chunks")
+def sample_chunks(
+    limit: int = 50,
+    db: sqlite3.Connection = Depends(get_db),
+) -> list[dict]:
+    """Return up to `limit` randomly sampled page chunks for corpus benchmarking.
+
+    No tier gate — internal tool, same-host access only. Returns [] if empty.
+
+    Args:
+        limit: Requested sample size. Out-of-range values are clamped to
+            1..200 rather than rejected.
+        db: SQLite connection supplied by the `get_db` dependency.
+
+    Returns:
+        One dict per sampled row with keys chunk_id, doc_id, page_number, text.
+    """
+    limit = max(1, min(limit, _SAMPLE_CHUNKS_MAX_LIMIT))
+    # NOTE(review): name-based row access assumes get_db sets a Row-like row_factory — confirm.
+    rows = db.execute(
+        "SELECT id, doc_id, page_number, text FROM page_chunks ORDER BY RANDOM() LIMIT ?",
+        [limit],
+    ).fetchall()
+    return [
+        {"chunk_id": r["id"], "doc_id": r["doc_id"], "page_number": r["page_number"], "text": r["text"]}
+        for r in rows
+    ]
+
+
 @router.get("")
 def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
     rows = db.execute(
diff --git a/tests/test_library_api.py b/tests/test_library_api.py
index b1a16a5..7171406 100644
--- a/tests/test_library_api.py
+++ b/tests/test_library_api.py
@@ -66,3 +66,56 @@ def test_reingest_updates_status_to_processing(client, test_db, tmp_path):
     # Document should be in processing state (or beyond if stub ingest ran instantly)
     status_resp = client.get(f"/api/library/{doc_id}/status")
     assert status_resp.json()["status"] in ("processing", "error", "ready")
+
+
+def _add_chunks(db_path: str, doc_id: str, count: int) -> None:
+    """Insert `count` single-page chunks for `doc_id` into the test database."""
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.executemany(
+            "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)",
+            [
+                (doc_id, page, f"Page {page} text content.", "text_layer", 4)
+                for page in range(1, count + 1)
+            ],
+        )
+        conn.commit()
+    finally:
+        # Close even when the insert fails so a failing test does not leak the handle.
+        conn.close()
+
+
+def test_sample_chunks_empty_returns_empty(client):
+    resp = client.get("/api/library/sample-chunks")
+    assert resp.status_code == 200
+    assert resp.json() == []
+
+
+def test_sample_chunks_returns_fields(client, test_db):
+    doc_id = _add_doc(test_db, "Monster Manual", "/books/mm.pdf")
+    _add_chunks(test_db, doc_id, 5)
+    resp = client.get("/api/library/sample-chunks?limit=3")
+    assert resp.status_code == 200
+    chunks = resp.json()
+    assert len(chunks) == 3
+    for c in chunks:
+        assert {"chunk_id", "doc_id", "page_number", "text"} == set(c.keys())
+        assert c["doc_id"] == doc_id
+
+
+def test_sample_chunks_limit_capped_at_200(client, test_db):
+    doc_id = _add_doc(test_db, "Big Book", "/books/big.pdf")
+    # Seed more rows than the cap; with fewer, the assertion below would pass
+    # even if the endpoint ignored the cap entirely.
+    _add_chunks(test_db, doc_id, 250)
+    resp = client.get("/api/library/sample-chunks?limit=9999")
+    assert resp.status_code == 200
+    assert len(resp.json()) == 200
+
+
+def test_sample_chunks_limit_below_one_is_clamped(client, test_db):
+    doc_id = _add_doc(test_db, "Tiny Book", "/books/tiny.pdf")
+    _add_chunks(test_db, doc_id, 3)
+    resp = client.get("/api/library/sample-chunks?limit=0")
+    assert resp.status_code == 200
+    assert len(resp.json()) == 1