feat: GET /api/library/sample-chunks for Avocet embed bench (closes #6)
Returns up to N randomly sampled page chunks (default 50, max 200) with chunk_id, doc_id, page_number, and text fields. No tier gate — internal tool endpoint for same-host corpus benchmarking. Returns [] on empty library.
This commit is contained in:
parent
1e066cf66c
commit
bcd321367e
2 changed files with 60 additions and 0 deletions
|
|
@ -90,6 +90,29 @@ def _run_ingest_background(
|
|||
_task_registry[task_id] = {"status": "error", "error": str(exc)}
|
||||
|
||||
|
||||
@router.get("/sample-chunks")
def sample_chunks(
    limit: int = 50,
    db: sqlite3.Connection = Depends(get_db),
) -> list[dict]:
    """Sample random page chunks for corpus benchmarking.

    Out-of-range ``limit`` values are clamped into 1..200 rather than
    rejected. The result is a list of dicts with the keys ``chunk_id``,
    ``doc_id``, ``page_number`` and ``text``; it is ``[]`` when the
    ``page_chunks`` table has no rows.

    No tier gate is applied here: the endpoint is intended as an internal,
    same-host tool (that restriction is not enforced by this function).
    """
    capped = max(1, min(limit, 200))

    query = "SELECT id, doc_id, page_number, text FROM page_chunks ORDER BY RANDOM() LIMIT ?"
    cursor = db.execute(query, [capped])

    # NOTE(review): name-based row access assumes get_db configures a row
    # factory that supports it (e.g. sqlite3.Row) -- confirm in get_db.
    sampled = []
    for row in cursor.fetchall():
        sampled.append(
            {
                "chunk_id": row["id"],
                "doc_id": row["doc_id"],
                "page_number": row["page_number"],
                "text": row["text"],
            }
        )
    return sampled
|
||||
|
||||
|
||||
@router.get("")
|
||||
def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
|
||||
rows = db.execute(
|
||||
|
|
|
|||
|
|
@ -66,3 +66,40 @@ def test_reingest_updates_status_to_processing(client, test_db, tmp_path):
|
|||
# Document should be in processing state (or beyond if stub ingest ran instantly)
|
||||
status_resp = client.get(f"/api/library/{doc_id}/status")
|
||||
assert status_resp.json()["status"] in ("processing", "error", "ready")
|
||||
|
||||
|
||||
def _add_chunks(db_path: str, doc_id: str, count: int) -> None:
    """Insert ``count`` placeholder page chunks for ``doc_id``.

    Pages are numbered 1..count and each row gets the text
    ``"Page <n> text content."``, source ``"text_layer"`` and a word count
    of 4. A ``count`` of 0 inserts nothing.

    Args:
        db_path: Filesystem path of the SQLite database to write to.
        doc_id: Value stored in the ``doc_id`` column of every inserted row.
        count: Number of rows to insert.
    """
    rows = [
        (doc_id, page, f"Page {page} text content.", "text_layer", 4)
        for page in range(1, count + 1)
    ]
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on error; it does
        # not close the connection, hence the surrounding try/finally.
        with conn:
            conn.executemany(
                "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)",
                rows,
            )
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_sample_chunks_empty_returns_empty(client):
    """With no chunks seeded, the endpoint answers 200 with an empty list."""
    response = client.get("/api/library/sample-chunks")

    assert response.status_code == 200
    payload = response.json()
    assert payload == []
|
||||
|
||||
|
||||
def test_sample_chunks_returns_fields(client, test_db):
    """A limited sample has the requested size and exactly the four chunk keys."""
    expected_keys = {"chunk_id", "doc_id", "page_number", "text"}

    doc_id = _add_doc(test_db, "Monster Manual", "/books/mm.pdf")
    _add_chunks(test_db, doc_id, 5)

    response = client.get("/api/library/sample-chunks?limit=3")
    assert response.status_code == 200

    sampled = response.json()
    assert len(sampled) == 3
    for chunk in sampled:
        # Exact key-set match: no extra fields may appear in the response.
        assert expected_keys == set(chunk.keys())
        assert chunk["doc_id"] == doc_id
|
||||
|
||||
|
||||
def test_sample_chunks_limit_capped_at_200(client, test_db):
    """An oversized ``limit`` is clamped to 200 rows.

    The library is seeded with more than 200 chunks so the cap is actually
    observable; with fewer rows than the cap the assertion would pass even
    if the endpoint applied no limit at all.
    """
    doc_id = _add_doc(test_db, "Big Book", "/books/big.pdf")
    _add_chunks(test_db, doc_id, 201)

    resp = client.get("/api/library/sample-chunks?limit=9999")

    assert resp.status_code == 200
    assert len(resp.json()) == 200
|
||||
|
|
|
|||
Loading…
Reference in a new issue