# turnstone/app/context/chunker.py

"""Document type detection, fact extraction, and text chunking — MIT licensed."""
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

# File suffixes (lower-case, with leading dot) that detect_type() accepts.
ACCEPTED_SUFFIXES = {
    ".conf",
    ".config",
    ".json",
    ".md",
    ".toml",
    ".txt",
    ".yaml",
    ".yml",
}
# Upload size ceiling checked by process_upload(): 5 MB.
MAX_FILE_BYTES = 5 * 1024**2
# Defaults for chunk_text(): words per chunk, and words shared by neighbouring chunks.
CHUNK_WORDS = 300
CHUNK_OVERLAP = 50


class UnsupportedDocType(Exception):
    """Raised when a file's suffix is not one of ACCEPTED_SUFFIXES."""


class FileTooLarge(Exception):
    """Raised when an upload is larger than MAX_FILE_BYTES."""


@dataclass(frozen=True)
class ExtractedFact:
    """Immutable (category, key, value) record for one fact pulled from a document."""

    # Kind of fact; the YAML extractor in this module emits "service".
    category: str
    # Identifier of the thing described (the service name for "service" facts).
    key: str
    # Free-form description of the fact.
    value: str


def detect_type(filename: str, content: bytes) -> str:  # noqa: ARG001
    """Classify an upload by its filename suffix.

    The suffix is compared case-insensitively. ``content`` is accepted but not
    inspected.

    Returns:
        "yaml", "json", "markdown", or "text" (for every other accepted suffix).

    Raises:
        UnsupportedDocType: If the suffix is not in ACCEPTED_SUFFIXES.
    """
    ext = Path(filename).suffix.lower()
    if ext in ACCEPTED_SUFFIXES:
        # Accepted suffixes that are not listed here are treated as plain text.
        kinds = {".yaml": "yaml", ".yml": "yaml", ".json": "json", ".md": "markdown"}
        return kinds.get(ext, "text")
    accepted = ", ".join(sorted(ACCEPTED_SUFFIXES))
    raise UnsupportedDocType(f"File type {ext!r} is not supported. Accepted: {accepted}")


def extract_facts_from_yaml(text: str) -> list[ExtractedFact]:
    """Extract service names, images, and ports from docker-compose-style YAML.

    Extraction is best-effort: if PyYAML cannot be imported, the text fails to
    parse, or the document has no top-level ``services`` mapping, an empty list
    is returned instead of raising.

    Args:
        text: YAML document text.

    Returns:
        One ``ExtractedFact`` with category "service" per service whose
        definition is a mapping. The value joins ``image:<image>`` and
        ``port:<port>`` tokens with spaces, or is "configured" when the service
        declares neither.
    """
    try:
        import yaml
        data = yaml.safe_load(text)
    except Exception:  # deliberate best-effort: any import/parse failure means "no facts"
        return []
    if not isinstance(data, dict):
        return []
    services = data.get("services")
    if not isinstance(services, dict):
        return []
    facts: list[ExtractedFact] = []
    for name, definition in services.items():
        if not isinstance(definition, dict):
            continue
        parts: list[str] = []
        image = definition.get("image")
        if image:
            parts.append(f"image:{image}")
        ports = definition.get("ports")
        if ports is None:
            # A bare `ports:` key parses to None (the .get default only covers
            # a missing key), so treat it as "no ports" rather than iterating it.
            ports = []
        elif not isinstance(ports, (list, tuple)):
            # A scalar such as `ports: 8080` or `ports: "8080:80"` is one port;
            # iterating it would fail (int) or yield single characters (str).
            ports = [ports]
        parts.extend(f"port:{port}" for port in ports)
        facts.append(ExtractedFact(
            category="service",
            key=str(name),
            value=" ".join(parts) if parts else "configured",
        ))
    return facts


def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split text into overlapping chunks of whitespace-delimited words.

    Words are obtained with ``str.split()`` and re-joined with single spaces, so
    the original whitespace is not preserved. Each chunk starts
    ``chunk_size - overlap`` words after the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated at the start of the next chunk; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order; an empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the loop run forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap} (chunk_size={chunk_size})"
        )
    words = text.split()
    step = chunk_size - overlap
    chunks: list[str] = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start: start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already reaches the last word; another one would only
            # repeat words from the overlap region and add nothing new.
            break
    return chunks


def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]:
    """Validate an uploaded file and turn it into facts and text chunks.

    Args:
        filename: Name of the uploaded file; passed to ``detect_type`` to pick
            the document type.
        content: Raw file bytes, decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(doc_type, extracted_facts, text_chunks)``. Facts are extracted only
        for YAML documents; every other type yields an empty fact list.

    Raises:
        FileTooLarge: If ``content`` is longer than ``MAX_FILE_BYTES``.
        UnsupportedDocType: If ``detect_type`` rejects the file.
    """
    if len(content) > MAX_FILE_BYTES:
        raise FileTooLarge(f"File exceeds {MAX_FILE_BYTES // (1024 * 1024)} MB limit.")
    # Reject unsupported files before spending time decoding them.
    doc_type = detect_type(filename, content)
    # "utf-8-sig" is UTF-8 that also drops a leading byte-order mark, which
    # would otherwise end up glued to the first word of the first chunk.
    text = content.decode("utf-8-sig", errors="replace")
    facts: list[ExtractedFact] = extract_facts_from_yaml(text) if doc_type == "yaml" else []
    return doc_type, facts, chunk_text(text)