# --- app/ingest/doc_upload.py ---
def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
    """Process an uploaded file and write it to the context store.

    ``process_upload`` returns the document type, the extracted facts and
    the text chunks for the upload. The document row and the facts are
    written through the store helpers (``add_document`` / ``add_fact``);
    the chunk rows are inserted directly into ``context_chunks``.

    Args:
        db_path: Path of the SQLite database file.
        filename: Name of the uploaded file; passed to ``process_upload``
            and stored on the document row.
        content: Raw bytes of the upload.

    Returns:
        A summary dict with the keys ``document_id``, ``doc_type``,
        ``facts_written`` and ``chunks_written``.

    Raises:
        Any exception raised by ``process_upload`` or the store helpers
        propagates unchanged.
        sqlite3.Error: If inserting the chunks fails. The chunk batch is
            rolled back and the connection is closed before the error
            propagates.
    """
    doc_type, facts, chunks = process_upload(filename, content)

    doc = add_document(
        db_path,
        filename=filename,
        doc_type=doc_type,
        # errors="replace" substitutes undecodable bytes instead of raising.
        full_text=content.decode("utf-8", errors="replace"),
        file_size=len(content),
    )

    for fact in facts:
        add_fact(db_path, fact.category, fact.key, fact.value, source="upload")

    # NOTE(review): the document and facts above are written by the store
    # helpers before this connection is opened (presumably each in its own
    # transaction - confirm), so a chunk failure can still leave a document
    # row without chunks.
    conn = sqlite3.connect(str(db_path))
    try:
        # journal_mode cannot be changed inside a transaction, so set it
        # before the insert batch starts one.
        conn.execute("PRAGMA journal_mode=WAL")
        # `with conn` commits on success and rolls back on error; it does
        # not close the connection, hence the surrounding try/finally.
        with conn:
            conn.executemany(
                "INSERT INTO context_chunks(id, document_id, chunk_index, text) VALUES (?,?,?,?)",
                (
                    (str(uuid.uuid4()), doc.id, chunk_index, chunk_text)
                    for chunk_index, chunk_text in enumerate(chunks)
                ),
            )
    finally:
        conn.close()

    return {
        "document_id": doc.id,
        "doc_type": doc_type,
        "facts_written": len(facts),
        "chunks_written": len(chunks),
    }


# --- tests/context/test_doc_upload.py ---
@pytest.fixture
def db(tmp_path):
    """Create a temporary SQLite database with the context-store tables.

    Returns the path of the database file, which contains empty
    ``context_facts``, ``context_documents`` and ``context_chunks`` tables.
    """
    db_path = tmp_path / "t.db"
    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
        CREATE TABLE context_facts (
            id TEXT PRIMARY KEY, category TEXT NOT NULL, key TEXT NOT NULL,
            value TEXT NOT NULL, source TEXT, created_at TEXT NOT NULL
        );
        CREATE TABLE context_documents (
            id TEXT PRIMARY KEY, filename TEXT NOT NULL, doc_type TEXT NOT NULL,
            full_text TEXT NOT NULL, file_size INTEGER, uploaded_at TEXT NOT NULL
        );
        CREATE TABLE context_chunks (
            id TEXT PRIMARY KEY, document_id TEXT NOT NULL
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
    """)
    conn.commit()
    conn.close()
    return db_path
def test_ingest_yaml_creates_facts_and_doc(db):
    """A compose-style YAML upload yields one document, chunks and a plex fact."""
    compose_payload = b"""
services:
  plex:
    image: plexinc/pms-docker
    ports:
      - "32400:32400"
"""
    summary = ingest_upload(db, "docker-compose.yml", compose_payload)

    # Result summary reported by the adapter.
    assert summary["doc_type"] == "yaml"
    assert summary["facts_written"] >= 1
    assert summary["chunks_written"] >= 1

    # Exactly one document row, carrying the uploaded filename.
    stored_docs = list_documents(db)
    assert len(stored_docs) == 1
    assert stored_docs[0].filename == "docker-compose.yml"

    # The plex service is recorded as a fact in the "service" category.
    service_keys = [fact.key for fact in list_facts(db, category="service")]
    assert "plex" in service_keys


def test_ingest_markdown_no_facts(db):
    """A markdown upload is chunked but produces no facts."""
    runbook = b"# Runbook\n\nRestart plex with `systemctl restart plex`."
    summary = ingest_upload(db, "runbook.md", runbook)

    assert summary["doc_type"] == "markdown"
    assert summary["facts_written"] == 0
    assert summary["chunks_written"] >= 1


def test_ingest_raises_on_bad_type(db):
    """An upload named report.pdf raises UnsupportedDocType."""
    pdf_name, pdf_payload = "report.pdf", b"data"
    with pytest.raises(UnsupportedDocType):
        ingest_upload(db, pdf_name, pdf_payload)