feat: context chunker — type detection, YAML extraction, text chunking
- Implement document type detection for yaml/json/markdown/text
- Extract service facts from docker-compose YAML (names, images, ports)
- Split text into overlapping word chunks (300-word default with 50-word overlap)
- Enforce 5 MB file size limit
- Comprehensive TDD test suite: 15 tests passing
This commit is contained in:
parent
54c756dfe8
commit
b23a60a602
2 changed files with 209 additions and 0 deletions
97
app/context/chunker.py
Normal file
97
app/context/chunker.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""Document type detection, fact extraction, and text chunking — MIT licensed."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
ACCEPTED_SUFFIXES = {".md", ".txt", ".yaml", ".yml", ".json", ".conf", ".config", ".toml"}
|
||||
MAX_FILE_BYTES = 5 * 1024 * 1024 # 5 MB
|
||||
CHUNK_WORDS = 300
|
||||
CHUNK_OVERLAP = 50
|
||||
|
||||
|
||||
class UnsupportedDocType(Exception):
    """Raised when a file's suffix is not one of the accepted document suffixes."""
|
||||
|
||||
|
||||
class FileTooLarge(Exception):
    """Raised when uploaded content is larger than the allowed byte limit."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedFact:
    """Immutable record of a single fact pulled out of an uploaded document.

    Instances are frozen, so they are hashable and compare by field values.
    """

    # Kind of fact; the YAML extractor in this module emits "service".
    category: str
    # Identifier of the thing the fact describes (e.g. a service name).
    key: str
    # Free-form description of the fact, stored as text.
    value: str
|
||||
|
||||
|
||||
def detect_type(filename: str, content: bytes) -> str:  # noqa: ARG001
    """Classify a document by its filename suffix.

    Args:
        filename: Name of the file; only its (case-insensitive) suffix is used.
        content: Raw file bytes. Currently unused by the classification.

    Returns:
        ``"yaml"`` for ``.yaml``/``.yml``, ``"json"`` for ``.json``,
        ``"markdown"`` for ``.md`` and ``"text"`` for every other accepted
        suffix.

    Raises:
        UnsupportedDocType: If the suffix is not in ``ACCEPTED_SUFFIXES``.
    """
    extension = Path(filename).suffix.lower()
    if extension in ACCEPTED_SUFFIXES:
        # Suffixes with a dedicated type; any other accepted suffix is plain text.
        named_types = {
            ".yaml": "yaml",
            ".yml": "yaml",
            ".json": "json",
            ".md": "markdown",
        }
        return named_types.get(extension, "text")

    accepted = ", ".join(sorted(ACCEPTED_SUFFIXES))
    raise UnsupportedDocType(
        f"File type {extension!r} is not supported. "
        f"Accepted: {accepted}"
    )
|
||||
|
||||
|
||||
def extract_facts_from_yaml(text: str) -> list[ExtractedFact]:
    """Extract one fact per service from docker-compose-style YAML.

    The document is expected to be a mapping with a ``services`` mapping
    inside it. Each service whose definition is itself a mapping yields one
    ``ExtractedFact`` with ``category="service"``, ``key`` set to the service
    name and ``value`` set to the space-joined ``image:<image>`` and
    ``port:<entry>`` tokens, or ``"configured"`` when neither is present.

    Args:
        text: YAML source text.

    Returns:
        The extracted facts. An empty list is returned when PyYAML cannot be
        imported, the text fails to parse, the top level is not a mapping, or
        there is no ``services`` mapping.
    """
    try:
        import yaml

        data = yaml.safe_load(text)
    except Exception:
        # Deliberately best-effort: a missing PyYAML install or any failure
        # while parsing an uploaded document simply produces no facts.
        return []

    if not isinstance(data, dict):
        return []
    services = data.get("services")
    if not isinstance(services, dict):
        return []

    facts: list[ExtractedFact] = []
    for name, definition in services.items():
        if not isinstance(definition, dict):
            continue

        parts: list[str] = []
        image = definition.get("image")
        if image:
            parts.append(f"image:{image}")

        # A bare ``ports:`` key parses to None and a scalar would be iterated
        # character by character (or not be iterable at all), so only real
        # sequences are walked; anything else contributes no port tokens.
        ports = definition.get("ports")
        if isinstance(ports, (list, tuple)):
            parts.extend(f"port:{port}" for port in ports)

        facts.append(ExtractedFact(
            category="service",
            key=str(name),
            value=" ".join(parts) if parts else "configured",
        ))
    return facts
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Words inside
    a chunk are re-joined with single spaces.

    Args:
        text: Text to split; any run of whitespace separates words.
        chunk_size: Maximum number of words per chunk (at least 1).
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, or an empty list when ``text`` contains
        no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would loop forever.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap} with chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks: list[str] = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further chunk would contain
        # only words already emitted in this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
|
||||
|
||||
|
||||
def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]:
    """Validate an upload and turn it into a type, facts and text chunks.

    Args:
        filename: Name of the uploaded file; its suffix selects the type.
        content: Raw bytes of the upload.

    Returns:
        A ``(doc_type, extracted_facts, text_chunks)`` tuple. Facts are only
        extracted for YAML documents; other types get an empty fact list.

    Raises:
        FileTooLarge: If ``content`` is longer than ``MAX_FILE_BYTES``.
        UnsupportedDocType: If the filename suffix is not accepted.
    """
    if len(content) > MAX_FILE_BYTES:
        limit_mb = MAX_FILE_BYTES // (1024 * 1024)
        raise FileTooLarge(f"File exceeds {limit_mb} MB limit.")

    # Undecodable bytes are replaced rather than rejected.
    decoded = content.decode("utf-8", errors="replace")
    kind = detect_type(filename, content)
    extracted: list[ExtractedFact] = (
        extract_facts_from_yaml(decoded) if kind == "yaml" else []
    )
    pieces = chunk_text(decoded)
    return kind, extracted, pieces
|
||||
112
tests/context/test_chunker.py
Normal file
112
tests/context/test_chunker.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
"""Tests for app/context/chunker.py."""
|
||||
import pytest
|
||||
from app.context.chunker import (
|
||||
detect_type, extract_facts_from_yaml, chunk_text,
|
||||
process_upload, UnsupportedDocType, FileTooLarge,
|
||||
)
|
||||
|
||||
|
||||
def test_detect_type_yaml():
    """Both ``.yml`` and ``.yaml`` suffixes are classified as YAML."""
    for filename in ("compose.yml", "docker-compose.yaml"):
        assert detect_type(filename, b"") == "yaml"
|
||||
|
||||
|
||||
def test_detect_type_json():
    """A ``.json`` suffix is classified as JSON."""
    detected = detect_type("config.json", b"")
    assert detected == "json"
|
||||
|
||||
|
||||
def test_detect_type_markdown():
    """A ``.md`` suffix is classified as Markdown."""
    detected = detect_type("runbook.md", b"")
    assert detected == "markdown"
|
||||
|
||||
|
||||
def test_detect_type_text():
    """A ``.txt`` suffix is classified as plain text."""
    detected = detect_type("notes.txt", b"")
    assert detected == "text"
|
||||
|
||||
|
||||
def test_detect_type_unsupported():
    """An unaccepted suffix raises and the message names the suffix."""
    # ``match`` is a regular expression, so the dot is escaped to require a
    # literal ".pdf" instead of any character followed by "pdf".
    with pytest.raises(UnsupportedDocType, match=r"\.pdf"):
        detect_type("report.pdf", b"")
|
||||
|
||||
|
||||
def test_extract_facts_from_yaml_docker_compose():
    """Each compose service yields a fact carrying its image and port tokens."""
    yaml_text = """
services:
  plex:
    image: plexinc/pms-docker
    ports:
      - "32400:32400"
  sonarr:
    image: linuxserver/sonarr
    ports:
      - "8989:8989"
"""
    facts = extract_facts_from_yaml(yaml_text)
    by_key = {f.key: f for f in facts}
    assert "plex" in by_key
    assert "sonarr" in by_key
    plex_fact = by_key["plex"]
    assert plex_fact.category == "service"
    # Pin the exact tokens; a bare '"port:" in value' check would pass for
    # any port at all and hide a wrong mapping.
    assert plex_fact.value == "image:plexinc/pms-docker port:32400:32400"
    assert by_key["sonarr"].value == "image:linuxserver/sonarr port:8989:8989"
|
||||
|
||||
|
||||
def test_extract_facts_from_yaml_non_compose():
    """Valid YAML without a ``services`` mapping produces no facts."""
    assert extract_facts_from_yaml("foo: bar\nbaz: 42\n") == []
|
||||
|
||||
|
||||
def test_extract_facts_from_yaml_invalid():
    """Unparseable YAML is tolerated and produces no facts."""
    broken = "{{{{not yaml"
    assert extract_facts_from_yaml(broken) == []
|
||||
|
||||
|
||||
def test_chunk_text_basic():
    """Text longer than one chunk is split into several non-blank chunks."""
    text = " ".join(["word"] * 600)
    chunks = chunk_text(text, chunk_size=300, overlap=50)
    assert len(chunks) >= 2
    assert all(piece.strip() for piece in chunks)
|
||||
|
||||
|
||||
def test_chunk_text_short():
    """Text shorter than the chunk size comes back as one unchanged chunk."""
    result = chunk_text("short text", chunk_size=300, overlap=50)
    assert list(result) == ["short text"]
|
||||
|
||||
|
||||
def test_chunk_text_empty():
    """Empty input yields no chunks."""
    result = chunk_text("")
    assert result == []
|
||||
|
||||
|
||||
def test_process_upload_yaml_extracts_facts():
    """A compose upload is typed as YAML and yields service facts and chunks."""
    yaml_bytes = b"""
services:
  nginx:
    image: nginx:latest
    ports:
      - "80:80"
"""
    doc_type, facts, chunks = process_upload("docker-compose.yml", yaml_bytes)
    assert doc_type == "yaml"
    service_names = {f.key for f in facts}
    assert "nginx" in service_names
    assert len(chunks) >= 1
|
||||
|
||||
|
||||
def test_process_upload_markdown_no_facts():
    """A Markdown upload is chunked but produces no extracted facts."""
    md = b"# Plex Troubleshooting\n\nRestart the service with systemctl restart plex.\n"
    result = process_upload("plex.md", md)
    doc_type, facts, chunks = result
    assert doc_type == "markdown"
    assert facts == []
    assert len(chunks) >= 1
|
||||
|
||||
|
||||
def test_process_upload_too_large():
    """Content above the size limit is rejected with FileTooLarge."""
    six_mb = 6 * 1024 * 1024
    oversized = b"x" * six_mb
    with pytest.raises(FileTooLarge):
        process_upload("big.txt", oversized)
|
||||
|
||||
|
||||
def test_process_upload_unsupported_type():
    """An upload with an unaccepted suffix is rejected with UnsupportedDocType."""
    payload = b"data"
    with pytest.raises(UnsupportedDocType):
        process_upload("report.pdf", payload)
|
||||
Loading…
Reference in a new issue