From c17bbf6e262a11ad9a8d1831e3789f00761b4fdf Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Wed, 13 May 2026 15:54:51 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20context=20chunker=20=E2=80=94=20type=20?=
 =?UTF-8?q?detection,=20YAML=20extraction,=20text=20chunking?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Implement document type detection for yaml/json/markdown/text
- Extract service facts from docker-compose YAML (names, images, ports)
- Split text into overlapping word chunks (300-word default with 50-word overlap)
- Enforce 5 MB file size limit
- Reject non-advancing chunk overlap; tolerate null and long-syntax compose ports
- Comprehensive TDD test suite: 19 tests
---
 app/context/chunker.py        | 134 +++++++++++++++++++++++++++++++++++
 tests/context/test_chunker.py | 133 ++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100644 app/context/chunker.py
 create mode 100644 tests/context/test_chunker.py

diff --git a/app/context/chunker.py b/app/context/chunker.py
new file mode 100644
index 0000000..242ee04
--- /dev/null
+++ b/app/context/chunker.py
@@ -0,0 +1,134 @@
+"""Document type detection, fact extraction, and text chunking — MIT licensed."""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+ACCEPTED_SUFFIXES = {".md", ".txt", ".yaml", ".yml", ".json", ".conf", ".config", ".toml"}
+MAX_FILE_BYTES = 5 * 1024 * 1024  # 5 MB
+CHUNK_WORDS = 300
+CHUNK_OVERLAP = 50
+
+
+class UnsupportedDocType(Exception):
+    """Raised when a file's suffix is not in ACCEPTED_SUFFIXES."""
+
+
+class FileTooLarge(Exception):
+    """Raised when an upload is larger than MAX_FILE_BYTES."""
+
+
+@dataclass(frozen=True)
+class ExtractedFact:
+    """One structured fact extracted from a document."""
+
+    category: str
+    key: str
+    value: str
+
+
+def detect_type(filename: str, content: bytes) -> str:  # noqa: ARG001
+    """Map the suffix of *filename* to "yaml", "json", "markdown" or "text".
+
+    *content* is accepted but not inspected; detection is suffix-only.
+    Raises UnsupportedDocType when the suffix is not in ACCEPTED_SUFFIXES.
+    """
+    suffix = Path(filename).suffix.lower()
+    if suffix not in ACCEPTED_SUFFIXES:
+        raise UnsupportedDocType(
+            f"File type {suffix!r} is not supported. "
+            f"Accepted: {', '.join(sorted(ACCEPTED_SUFFIXES))}"
+        )
+    if suffix in {".yaml", ".yml"}:
+        return "yaml"
+    if suffix == ".json":
+        return "json"
+    if suffix == ".md":
+        return "markdown"
+    return "text"
+
+
+def _format_port(port: object) -> str:
+    """Render one compose ``ports`` entry as text."""
+    if isinstance(port, dict) and "target" in port:
+        # Long syntax ({target: 80, published: 8080}) -> "8080:80", or "80".
+        target, published = port["target"], port.get("published")
+        return str(target) if published is None else f"{published}:{target}"
+    return str(port)
+
+
+def extract_facts_from_yaml(text: str) -> list[ExtractedFact]:
+    """Extract service names, images and ports from docker-compose-style YAML.
+
+    Best-effort: returns [] when PyYAML is unavailable, the text does not
+    parse, or there is no top-level ``services`` mapping.
+    """
+    try:
+        import yaml
+        data = yaml.safe_load(text)
+    except Exception:  # deliberate best-effort: a bad upload must not crash
+        return []
+    if not isinstance(data, dict):
+        return []
+    services = data.get("services")
+    if not isinstance(services, dict):
+        return []
+    facts = []
+    for name, definition in services.items():
+        if not isinstance(definition, dict):
+            continue
+        parts: list[str] = []
+        image = definition.get("image")
+        if image:
+            parts.append(f"image:{image}")
+        # ``ports`` may be null or a scalar in a sloppy file; only a real list
+        # is iterated, so it can neither raise nor yield per-character ports.
+        ports = definition.get("ports")
+        if isinstance(ports, list):
+            parts.extend(f"port:{_format_port(port)}" for port in ports)
+        facts.append(ExtractedFact(
+            category="service",
+            key=str(name),
+            value=" ".join(parts) if parts else "configured",
+        ))
+    return facts
+
+
+def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]:
+    """Split *text* into chunks of up to *chunk_size* words sharing *overlap* words.
+
+    Returns [] for blank text. Raises ValueError unless 0 <= overlap < chunk_size:
+    a larger overlap would never advance and a negative one would skip words.
+    """
+    if not 0 <= overlap < chunk_size:
+        raise ValueError(
+            f"Need 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
+        )
+    words = text.split()
+    step = chunk_size - overlap
+    chunks: list[str] = []
+    for start in range(0, len(words), step):
+        chunks.append(" ".join(words[start: start + chunk_size]))
+        if start + chunk_size >= len(words):
+            # The text is exhausted; another window would only repeat words
+            # the overlap has already covered.
+            break
+    return chunks
+
+
+def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]:
+    """Return (doc_type, extracted_facts, text_chunks) for an uploaded file.
+
+    Raises FileTooLarge when *content* exceeds MAX_FILE_BYTES and
+    UnsupportedDocType when the suffix is not accepted.
+    """
+    if len(content) > MAX_FILE_BYTES:
+        raise FileTooLarge(f"File exceeds {MAX_FILE_BYTES // (1024 * 1024)} MB limit.")
+    # Check the type before decoding so a rejected upload costs no decode work.
+    doc_type = detect_type(filename, content)
+    # utf-8-sig drops a leading BOM that would otherwise stick to the first word.
+    text = content.decode("utf-8-sig", errors="replace")
+    facts = extract_facts_from_yaml(text) if doc_type == "yaml" else []
+    chunks = chunk_text(text)
+    return doc_type, facts, chunks
diff --git a/tests/context/test_chunker.py b/tests/context/test_chunker.py
new file mode 100644
index 0000000..b8a7e0e
--- /dev/null
+++ b/tests/context/test_chunker.py
@@ -0,0 +1,133 @@
+"""Tests for app/context/chunker.py."""
+import pytest
+from app.context.chunker import (
+    detect_type, extract_facts_from_yaml, chunk_text,
+    process_upload, UnsupportedDocType, FileTooLarge,
+)
+
+
+def test_detect_type_yaml():
+    assert detect_type("compose.yml", b"") == "yaml"
+    assert detect_type("docker-compose.yaml", b"") == "yaml"
+
+
+def test_detect_type_json():
+    assert detect_type("config.json", b"") == "json"
+
+
+def test_detect_type_markdown():
+    assert detect_type("runbook.md", b"") == "markdown"
+
+
+def test_detect_type_text():
+    assert detect_type("notes.txt", b"") == "text"
+
+
+def test_detect_type_unsupported():
+    with pytest.raises(UnsupportedDocType, match=r"\.pdf"):
+        detect_type("report.pdf", b"")
+
+
+def test_extract_facts_from_yaml_docker_compose():
+    yaml_text = """
+services:
+  plex:
+    image: plexinc/pms-docker
+    ports:
+      - "32400:32400"
+  sonarr:
+    image: linuxserver/sonarr
+    ports:
+      - "8989:8989"
+"""
+    facts = extract_facts_from_yaml(yaml_text)
+    keys = [f.key for f in facts]
+    assert "plex" in keys
+    assert "sonarr" in keys
+    plex_fact = next(f for f in facts if f.key == "plex")
+    assert "port:32400:32400" in plex_fact.value
+    assert plex_fact.category == "service"
+
+
+def test_extract_facts_from_yaml_non_compose():
+    yaml_text = "foo: bar\nbaz: 42\n"
+    facts = extract_facts_from_yaml(yaml_text)
+    assert facts == []
+
+
+def test_extract_facts_from_yaml_invalid():
+    facts = extract_facts_from_yaml("{{{{not yaml")
+    assert facts == []
+
+
+def test_extract_facts_from_yaml_null_ports():
+    facts = extract_facts_from_yaml("services:\n  web:\n    image: nginx\n    ports:\n")
+    assert [f.value for f in facts] == ["image:nginx"]
+
+
+def test_extract_facts_from_yaml_long_port_syntax():
+    yaml_text = "services:\n  web:\n    ports:\n      - target: 80\n        published: 8080\n"
+    facts = extract_facts_from_yaml(yaml_text)
+    assert facts[0].value == "port:8080:80"
+
+
+def test_chunk_text_basic():
+    words = ["word"] * 600
+    text = " ".join(words)
+    chunks = chunk_text(text, chunk_size=300, overlap=50)
+    assert len(chunks) >= 2
+    for c in chunks:
+        assert c.strip()
+
+
+def test_chunk_text_short():
+    chunks = chunk_text("short text", chunk_size=300, overlap=50)
+    assert len(chunks) == 1
+    assert chunks[0] == "short text"
+
+
+def test_chunk_text_empty():
+    assert chunk_text("") == []
+
+
+def test_chunk_text_no_redundant_tail():
+    text = " ".join(["word"] * 300)
+    assert len(chunk_text(text, chunk_size=300, overlap=50)) == 1
+
+
+def test_chunk_text_rejects_non_advancing_overlap():
+    with pytest.raises(ValueError):
+        chunk_text("a b c", chunk_size=2, overlap=2)
+
+
+def test_process_upload_yaml_extracts_facts():
+    yaml_bytes = b"""
+services:
+  nginx:
+    image: nginx:latest
+    ports:
+      - "80:80"
+"""
+    doc_type, facts, chunks = process_upload("docker-compose.yml", yaml_bytes)
+    assert doc_type == "yaml"
+    assert any(f.key == "nginx" for f in facts)
+    assert len(chunks) >= 1
+
+
+def test_process_upload_markdown_no_facts():
+    md = b"# Plex Troubleshooting\n\nRestart the service with systemctl restart plex.\n"
+    doc_type, facts, chunks = process_upload("plex.md", md)
+    assert doc_type == "markdown"
+    assert facts == []
+    assert len(chunks) >= 1
+
+
+def test_process_upload_too_large():
+    big = b"x" * (6 * 1024 * 1024)
+    with pytest.raises(FileTooLarge):
+        process_upload("big.txt", big)
+
+
+def test_process_upload_unsupported_type():
+    with pytest.raises(UnsupportedDocType):
+        process_upload("report.pdf", b"data")