turnstone/app/glean/doc_upload.py
pyr0ball e543ab70f7 feat: dual-backend SQLite/Postgres + multi-tenant source namespacing
- Add app/db/ abstraction layer: Backend enum, DbConn wrapper,
  dialect helper (q() for ? vs %s paramstyle), get_conn(), tenant_id()
- Auto-detect backend from DATABASE_URL; SQLite remains default when
  unset — no config change for local deployments
- Add tenant_id column to all three logical DBs (main, context, incidents);
  idempotent ALTER TABLE migration runs before schema scripts on existing DBs
- All INSERTs inject tenant_id; SELECTs use (tenant_id = ? OR tenant_id = '')
  for backward compat with pre-namespacing rows
- Add docker-compose.yml with named volume turnstone_pgdata (survives rebuilds)
  and optional external Postgres support via DATABASE_URL override
- Add scripts/migrate_sqlite_to_postgres.py — one-shot idempotent migration
  for existing SQLite data; ON CONFLICT DO NOTHING for safe re-runs
- Fix SSH glean path in pipeline.py to use ensure_schema + get_conn
  (was still using raw sqlite3.connect + old _SCHEMA without tenant_id)
- Fix FTS5 JOIN ambiguity: qualify repeat_count as f.repeat_count in search
- Update all tests to use ensure_*_schema fixtures; add row_factory where needed
- 394/394 tests passing

Closes: #42
Closes: #50
2026-06-08 08:37:54 -07:00

42 lines
1.3 KiB
Python

"""Upload adapter: processes file bytes and writes to context store — MIT licensed."""
from __future__ import annotations
import uuid
from pathlib import Path
from typing import Any
from app.context.chunker import process_upload
from app.context.store import add_document, add_fact
from app.db import get_conn, resolve_tenant_id
def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
    """Ingest one uploaded file into the context store and summarise the outcome.

    The raw bytes are passed to ``process_upload``, whose result is unpacked
    into a document type, a collection of facts and a sequence of text chunks.
    A document record is created through ``add_document``, every fact is
    stored through ``add_fact`` with ``source="upload"``, and each chunk is
    inserted into ``context_chunks`` together with the resolved tenant id and
    the id of the newly created document.

    Args:
        db_path: Database location forwarded to the store helpers and to
            ``get_conn``.
        filename: Name of the uploaded file.
        content: Raw file bytes. They are decoded as UTF-8 (undecodable bytes
            are replaced) to produce the document's stored full text.

    Returns:
        A dict with the keys ``document_id``, ``doc_type``, ``facts_written``
        and ``chunks_written``.
    """
    kind, extracted_facts, pieces = process_upload(filename, content)
    tenant = resolve_tenant_id()

    # Decoding with errors="replace" cannot raise, so it is safe to do up front.
    decoded_text = content.decode("utf-8", errors="replace")
    document = add_document(
        db_path,
        filename=filename,
        doc_type=kind,
        full_text=decoded_text,
        file_size=len(content),
    )

    for item in extracted_facts:
        add_fact(db_path, item.category, item.key, item.value, source="upload")

    # NOTE(review): the statement uses qmark ("?") placeholders; presumably the
    # connection returned by get_conn adapts them for non-SQLite backends —
    # confirm against app.db.
    insert_chunk_sql = (
        "INSERT INTO context_chunks(id, tenant_id, document_id, chunk_index, text) VALUES (?,?,?,?,?)"
    )
    with get_conn(db_path) as conn:
        for position, piece in enumerate(pieces):
            # Each chunk gets its own random UUID as primary identifier.
            row = (str(uuid.uuid4()), tenant, document.id, position, piece)
            conn.execute(insert_chunk_sql, row)
        # Commit once after all chunk rows have been issued.
        conn.commit()

    summary: dict[str, Any] = {
        "document_id": document.id,
        "doc_type": kind,
        "facts_written": len(extracted_facts),
        "chunks_written": len(pieces),
    }
    return summary