feat: context chunker — type detection, YAML extraction, text chunking

- Implement document type detection for yaml/json/markdown/text
- Extract service facts from docker-compose YAML (names, images, ports)
- Split text into overlapping word chunks (300-word default with 50-word overlap)
- Enforce 5 MB file size limit
- Comprehensive TDD test suite: 15 tests passing
2026-05-13 15:54:51 -07:00 · 2026-05-13 15:54:51 -07:00 · b23a60a602
commit b23a60a602
parent 54c756dfe8
2 changed files with 209 additions and 0 deletions
--- a/app/context/chunker.py
+++ b/app/context/chunker.py
@ -0,0 +1,97 @@
+"""Document type detection, fact extraction, and text chunking — MIT licensed."""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+ACCEPTED_SUFFIXES = {".md", ".txt", ".yaml", ".yml", ".json", ".conf", ".config", ".toml"}
+MAX_FILE_BYTES = 5 * 1024 * 1024  # 5 MB
+CHUNK_WORDS = 300
+CHUNK_OVERLAP = 50
+
+
+class UnsupportedDocType(Exception):
+    """Raised when a filename's suffix is not one of ``ACCEPTED_SUFFIXES``."""
+
+
+class FileTooLarge(Exception):
+    """Raised when uploaded content is larger than ``MAX_FILE_BYTES``."""
+
+
+@dataclass(frozen=True)
+class ExtractedFact:
+    """Immutable (category, key, value) record for one fact pulled from a document."""
+
+    category: str  # kind of fact; the YAML extractor in this module uses "service"
+    key: str  # identifier of the subject (the service name for "service" facts)
+    value: str  # free-form detail string describing the subject
+
+
+def detect_type(filename: str, content: bytes) -> str:  # noqa: ARG001
+    """Classify an upload as "yaml", "json", "markdown" or "text" by extension.
+
+    Only the lower-cased suffix of ``filename`` is examined; ``content`` is
+    accepted but never read.
+
+    Raises:
+        UnsupportedDocType: If the suffix is not in ``ACCEPTED_SUFFIXES``.
+    """
+    ext = Path(filename).suffix.lower()
+    if ext in ACCEPTED_SUFFIXES:
+        # Accepted suffixes without a dedicated label fall back to "text".
+        labels = {".yaml": "yaml", ".yml": "yaml", ".json": "json", ".md": "markdown"}
+        return labels.get(ext, "text")
+    raise UnsupportedDocType(
+        f"File type {ext!r} is not supported. "
+        f"Accepted: {', '.join(sorted(ACCEPTED_SUFFIXES))}"
+    )
+
+
+def extract_facts_from_yaml(text: str) -> list[ExtractedFact]:
+    """Extract per-service facts from docker-compose-style YAML.
+
+    One fact with category "service" is produced for every mapping entry
+    under the top-level ``services`` key. Its value is a space-separated
+    list of ``image:<image>`` (when set) and one ``port:<entry>`` per item
+    of the service's ``ports`` list, or "configured" when neither exists.
+
+    Extraction is best-effort: if PyYAML cannot be imported, the text does
+    not parse, or there is no ``services`` mapping, ``[]`` is returned.
+    """
+    try:
+        import yaml
+        data = yaml.safe_load(text)
+    except Exception:
+        # Deliberately broad: fact extraction is optional, so any import or
+        # parse failure simply means "no facts" rather than a failed upload.
+        return []
+    if not isinstance(data, dict):
+        return []
+    services = data.get("services")
+    if not isinstance(services, dict):
+        return []
+    facts: list[ExtractedFact] = []
+    for name, definition in services.items():
+        if not isinstance(definition, dict):
+            continue
+        parts: list[str] = []
+        image = definition.get("image")
+        if image:
+            parts.append(f"image:{image}")
+        # An empty ``ports:`` key parses to None (not the .get default) and a
+        # scalar would be iterated character by character, so only accept a
+        # real list here.
+        ports = definition.get("ports")
+        if isinstance(ports, list):
+            parts.extend(f"port:{port}" for port in ports)
+        facts.append(ExtractedFact(
+            category="service",
+            key=str(name),
+            value=" ".join(parts) if parts else "configured",
+        ))
+    return facts
+
+
+def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]:
+    """Split ``text`` into chunks of whitespace-separated words.
+
+    Each chunk holds up to ``chunk_size`` words re-joined with single spaces,
+    and each chunk starts ``chunk_size - overlap`` words after the previous
+    one, so neighbouring chunks share ``overlap`` words. Empty or
+    whitespace-only text yields ``[]``.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
+            smaller than ``chunk_size`` (the window would never advance).
+    """
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+    step = chunk_size - overlap
+    if step <= 0:
+        # A non-positive step used to loop forever, growing the result list.
+        raise ValueError(
+            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
+        )
+    words = text.split()
+    chunks: list[str] = []
+    for start in range(0, len(words), step):
+        chunks.append(" ".join(words[start: start + chunk_size]))
+        # Once a chunk reaches the last word, a further chunk would contain
+        # only overlap words that are already covered, so stop here.
+        if start + chunk_size >= len(words):
+            break
+    return chunks
+
+
+def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]:
+    """Validate an uploaded file and return (doc_type, extracted_facts, text_chunks).
+
+    The size limit is checked first, then the file type, and only then is
+    the content decoded. Facts are extracted for YAML documents only; every
+    accepted document is split into word chunks.
+
+    Raises:
+        FileTooLarge: If ``content`` is longer than ``MAX_FILE_BYTES``.
+        UnsupportedDocType: If the filename's suffix is not accepted.
+    """
+    if len(content) > MAX_FILE_BYTES:
+        raise FileTooLarge(f"File exceeds {MAX_FILE_BYTES // (1024 * 1024)} MB limit.")
+    # Reject unsupported types before paying for a decode of the whole payload.
+    doc_type = detect_type(filename, content)
+    # "utf-8-sig" drops a leading byte-order mark so it does not end up glued
+    # to the first word; errors="replace" keeps invalid bytes from raising.
+    text = content.decode("utf-8-sig", errors="replace")
+    facts: list[ExtractedFact] = extract_facts_from_yaml(text) if doc_type == "yaml" else []
+    return doc_type, facts, chunk_text(text)
--- a/tests/context/test_chunker.py
+++ b/tests/context/test_chunker.py
@ -0,0 +1,112 @@
+"""Tests for app/context/chunker.py."""
+import pytest
+from app.context.chunker import (
+    detect_type, extract_facts_from_yaml, chunk_text,
+    process_upload, UnsupportedDocType, FileTooLarge,
+)
+
+
+def test_detect_type_yaml():
+    """Both ``.yml`` and ``.yaml`` filenames are classified as "yaml"."""
+    for filename in ("compose.yml", "docker-compose.yaml"):
+        assert detect_type(filename, b"") == "yaml"
+
+
+def test_detect_type_json():
+    """A ``.json`` filename is classified as "json"."""
+    detected = detect_type("config.json", b"")
+    assert detected == "json"
+
+
+def test_detect_type_markdown():
+    """A ``.md`` filename is classified as "markdown"."""
+    detected = detect_type("runbook.md", b"")
+    assert detected == "markdown"
+
+
+def test_detect_type_text():
+    """A ``.txt`` filename is classified as "text"."""
+    detected = detect_type("notes.txt", b"")
+    assert detected == "text"
+
+
+def test_detect_type_unsupported():
+    """An unaccepted suffix raises, and the message names that suffix."""
+    # ``match`` is a regular expression searched in the message: escape the
+    # dot so it requires a literal ".pdf" instead of any character + "pdf".
+    with pytest.raises(UnsupportedDocType, match=r"\.pdf"):
+        detect_type("report.pdf", b"")
+
+
+def test_extract_facts_from_yaml_docker_compose():
+    """Each compose service yields one "service" fact listing image and ports."""
+    yaml_text = """
+services:
+  plex:
+    image: plexinc/pms-docker
+    ports:
+      - "32400:32400"
+  sonarr:
+    image: linuxserver/sonarr
+    ports:
+      - "8989:8989"
+"""
+    facts = extract_facts_from_yaml(yaml_text)
+    keys = [f.key for f in facts]
+    assert "plex" in keys
+    assert "sonarr" in keys
+    plex_fact = next(f for f in facts if f.key == "plex")
+    # Pin the exact value: the previous ``... or "port:" in value`` check
+    # passed for any port at all and never looked at the image.
+    assert plex_fact.value == "image:plexinc/pms-docker port:32400:32400"
+    assert plex_fact.category == "service"
+
+
+def test_extract_facts_from_yaml_non_compose():
+    """Valid YAML without a ``services`` mapping produces no facts."""
+    plain_mapping = "foo: bar\nbaz: 42\n"
+    assert extract_facts_from_yaml(plain_mapping) == []
+
+
+def test_extract_facts_from_yaml_invalid():
+    """Unparseable YAML is swallowed and reported as "no facts"."""
+    broken = "{{{{not yaml"
+    assert extract_facts_from_yaml(broken) == []
+
+
+def test_chunk_text_basic():
+    """Long text is split into size-bounded chunks that overlap by 50 words."""
+    # Distinct words make chunk boundaries observable; with identical words
+    # a wrong chunk size or a missing overlap could not be detected.
+    words = [f"w{i}" for i in range(600)]
+    text = " ".join(words)
+    chunks = chunk_text(text, chunk_size=300, overlap=50)
+    assert len(chunks) >= 2
+    split_chunks = [c.split() for c in chunks]
+    assert all(0 < len(c) <= 300 for c in split_chunks)
+    assert split_chunks[0] == words[:300]
+    # The second chunk starts with the last 50 words of the first one.
+    assert split_chunks[1][:50] == split_chunks[0][-50:]
+    # No words are lost at the end of the text.
+    assert split_chunks[-1][-1] == words[-1]
+
+
+def test_chunk_text_short():
+    """Text shorter than one chunk comes back as a single, unchanged chunk."""
+    result = chunk_text("short text", chunk_size=300, overlap=50)
+    assert result == ["short text"]
+
+
+def test_chunk_text_empty():
+    """Empty input produces no chunks."""
+    result = chunk_text("")
+    assert result == []
+
+
+def test_process_upload_yaml_extracts_facts():
+    """A compose upload is typed as YAML and yields service facts and chunks."""
+    yaml_bytes = b"""
+services:
+  nginx:
+    image: nginx:latest
+    ports:
+      - "80:80"
+"""
+    result = process_upload("docker-compose.yml", yaml_bytes)
+    doc_type, facts, chunks = result
+    assert doc_type == "yaml"
+    assert "nginx" in [fact.key for fact in facts]
+    assert chunks
+
+
+def test_process_upload_markdown_no_facts():
+    """Markdown uploads are chunked but never produce extracted facts."""
+    payload = b"# Plex Troubleshooting\n\nRestart the service with systemctl restart plex.\n"
+    result = process_upload("plex.md", payload)
+    doc_type, facts, chunks = result
+    assert doc_type == "markdown"
+    assert facts == []
+    assert chunks
+
+
+def test_process_upload_too_large():
+    """A 6 MB payload is over the size limit and is rejected."""
+    six_mb = 6 * 1024 * 1024
+    oversized = b"x" * six_mb
+    with pytest.raises(FileTooLarge):
+        process_upload("big.txt", oversized)
+
+
+def test_process_upload_unsupported_type():
+    """Uploads with an unaccepted suffix are rejected by ``process_upload``."""
+    filename, payload = "report.pdf", b"data"
+    with pytest.raises(UnsupportedDocType):
+        process_upload(filename, payload)