- Implement document type detection for YAML/JSON/Markdown/text
- Extract service facts from docker-compose YAML (names, images, ports)
- Split text into overlapping word chunks (300 words by default, with a 50-word overlap)
- Enforce a 5 MB file size limit
- Comprehensive TDD test suite: 15 tests passing
97 lines
2.8 KiB
Python
97 lines
2.8 KiB
Python
"""Document type detection, fact extraction, and text chunking — MIT licensed."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
# File extensions accepted for upload; compared against the lower-cased suffix.
ACCEPTED_SUFFIXES = {
    ".md",
    ".txt",
    ".yaml",
    ".yml",
    ".json",
    ".conf",
    ".config",
    ".toml",
}
# Upload size ceiling in bytes (5 MB).
MAX_FILE_BYTES = 5 * 1024**2
# Default chunk length, counted in whitespace-separated words.
CHUNK_WORDS = 300
# Default number of words shared by two consecutive chunks.
CHUNK_OVERLAP = 50
|
|
|
|
|
|
class UnsupportedDocType(Exception):
    """Raised when a file's extension is not one of the accepted suffixes."""
|
|
|
|
|
|
class FileTooLarge(Exception):
    """Raised when an uploaded file is larger than the permitted byte limit."""
|
|
|
|
|
|
@dataclass(frozen=True)
class ExtractedFact:
    """Immutable record describing one fact pulled out of a document."""

    # Kind of fact; the YAML extractor in this module emits "service".
    category: str
    # Identifier of the thing the fact is about (the service name for YAML).
    key: str
    # Details of the fact, e.g. "image:... port:..." parts or "configured".
    value: str
|
|
|
|
|
|
def detect_type(filename: str, content: bytes) -> str:  # noqa: ARG001
    """Classify an upload as "yaml", "json", "markdown" or "text" by extension.

    Only the file name is inspected; ``content`` is accepted but never read.

    Raises:
        UnsupportedDocType: if the lower-cased extension is not in
            ``ACCEPTED_SUFFIXES``.
    """
    suffix = Path(filename).suffix.lower()
    if suffix in ACCEPTED_SUFFIXES:
        named_types = {
            ".yaml": "yaml",
            ".yml": "yaml",
            ".json": "json",
            ".md": "markdown",
        }
        # Every other accepted extension is handled as plain text.
        return named_types.get(suffix, "text")
    raise UnsupportedDocType(
        f"File type {suffix!r} is not supported. "
        f"Accepted: {', '.join(sorted(ACCEPTED_SUFFIXES))}"
    )
|
|
|
|
|
|
def extract_facts_from_yaml(text: str) -> list[ExtractedFact]:
    """Extract service names, images and ports from docker-compose-style YAML.

    Extraction is best-effort: input that cannot be parsed, or that has no
    mapping under a top-level ``services`` key, yields no facts rather than
    an error.

    Args:
        text: YAML document source.

    Returns:
        One ``ExtractedFact`` with category ``"service"`` per service whose
        definition is a mapping. ``value`` is a space-separated list of
        ``image:<image>`` and ``port:<port>`` parts, or ``"configured"`` when
        the service declares neither.
    """
    try:
        import yaml

        data = yaml.safe_load(text)
    except Exception:
        # Deliberately broad: a missing PyYAML install or a malformed upload
        # means "no facts", it must not abort processing of the document.
        return []
    if not isinstance(data, dict):
        return []
    services = data.get("services")
    if not isinstance(services, dict):
        return []

    facts: list[ExtractedFact] = []
    for name, definition in services.items():
        if not isinstance(definition, dict):
            continue

        parts: list[str] = []
        image = definition.get("image")
        if image:
            parts.append(f"image:{image}")

        # ``ports`` may be absent, explicitly null (a bare "ports:" key), a
        # single scalar, or a sequence. Normalise to a list so that null does
        # not raise and a string such as "8080:80" is not split into characters.
        raw_ports = definition.get("ports")
        if isinstance(raw_ports, (list, tuple, set)):
            ports = raw_ports
        elif raw_ports in (None, ""):
            ports = []
        else:
            ports = [raw_ports]
        parts.extend(f"port:{port}" for port in ports)

        facts.append(
            ExtractedFact(
                category="service",
                key=str(name),
                value=" ".join(parts) if parts else "configured",
            )
        )
    return facts
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = CHUNK_WORDS, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split. Words are rejoined with single spaces, so the
            original whitespace is not preserved.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words each chunk shares with the previous one; must
            be smaller than ``chunk_size`` so that every step moves forward.
            A negative value leaves a gap of skipped words between chunks.

    Returns:
        The chunks in document order, or an empty list when ``text`` contains
        no words.

    Raises:
        ValueError: if ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step never advances, so the chunking loop would
        # never terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    # NOTE(review): the final chunk can consist solely of words already present
    # in the previous chunk (300 words with the defaults yield a second chunk
    # holding words 250-299). Left unchanged in case callers rely on the count.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
|
|
|
|
|
|
def process_upload(filename: str, content: bytes) -> tuple[str, list[ExtractedFact], list[str]]:
    """Validate an uploaded file and return ``(doc_type, extracted_facts, text_chunks)``.

    Args:
        filename: Original file name; its extension selects the document type.
        content: Raw file bytes.

    Returns:
        A tuple of the detected type, the facts extracted from the document
        (only YAML documents are inspected for facts), and the word chunks of
        the decoded text.

    Raises:
        FileTooLarge: if ``content`` is longer than ``MAX_FILE_BYTES``.
        UnsupportedDocType: if the file extension is not accepted.
    """
    if len(content) > MAX_FILE_BYTES:
        raise FileTooLarge(f"File exceeds {MAX_FILE_BYTES // (1024 * 1024)} MB limit.")

    # Reject unsupported types before spending time decoding the payload.
    doc_type = detect_type(filename, content)

    # "utf-8-sig" drops a leading byte-order mark, which would otherwise stay
    # glued to the first word of the first chunk. Undecodable bytes are
    # replaced rather than raising, so decoding itself cannot fail.
    text = content.decode("utf-8-sig", errors="replace")

    facts: list[ExtractedFact] = extract_facts_from_yaml(text) if doc_type == "yaml" else []
    return doc_type, facts, chunk_text(text)
|