"""Document type detection, fact extraction, and text chunking — MIT licensed.""" from __future__ import annotations import json from dataclasses import dataclass from pathlib import Path ACCEPTED_SUFFIXES = {".md", ".txt", ".yaml", ".yml", ".json", ".conf", ".config", ".toml"} MAX_FILE_BYTES = 5 * 1024 * 1024 # 5 MB CHUNK_WORDS = 300 CHUNK_OVERLAP = 50 class UnsupportedDocType(Exception): pass class FileTooLarge(Exception): pass @dataclass(frozen=True) class ExtractedFact: category: str key: str value: str def detect_type(filename: str, content: bytes) -> str: # noqa: ARG001 suffix = Path(filename).suffix.lower() if suffix not in ACCEPTED_SUFFIXES: raise UnsupportedDocType( f"File type {suffix!r} is not supported. " f"Accepted: {', '.join(sorted(ACCEPTED_SUFFIXES))}" ) if suffix in {".yaml", ".yml"}: return "yaml" if suffix == ".json": return "json" if suffix == ".md": return "markdown" return "text" def extract_facts_from_yaml(text: str) -> list[ExtractedFact]: """Extract service names and ports from docker-compose-style YAML.""" try: import yaml data = yaml.safe_load(text) except Exception: return [] if not isinstance(data, dict): return [] services = data.get("services") if not isinstance(services, dict): return [] facts = [] for name, definition in services.items(): if not isinstance(definition, dict): continue parts: list[str] = [] image = definition.get("image") if image: parts.append(f"image:{image}") for port in definition.get("ports", []): parts.append(f"port:{port}") facts.append(ExtractedFact( category="service", key=str(name), value=" ".join(parts) if parts else "configured", )) return facts def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]: words = text.split() if not words: return [] chunks: list[str] = [] i = 0 while i < len(words): chunks.append(" ".join(words[i: i + chunk_size])) i += chunk_size - overlap return chunks def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]: """Return (doc_type, extracted_facts, text_chunks). Raises on bad type or size.""" if len(content) > MAX_FILE_BYTES: raise FileTooLarge(f"File exceeds {MAX_FILE_BYTES // (1024 * 1024)} MB limit.") text = content.decode("utf-8", errors="replace") doc_type = detect_type(filename, content) facts: list[ExtractedFact] = [] if doc_type == "yaml": facts = extract_facts_from_yaml(text) chunks = chunk_text(text) return doc_type, facts, chunks