- Implement document type detection for yaml/json/markdown/text - Extract service facts from docker-compose YAML (names, images, ports) - Split text into overlapping word chunks (300-word default with 50-word overlap) - Enforce 5 MB file size limit - Comprehensive TDD test suite: 15 tests passing
112 lines
2.8 KiB
Python
112 lines
2.8 KiB
Python
"""Tests for app/context/chunker.py."""
|
|
import pytest
|
|
from app.context.chunker import (
|
|
detect_type, extract_facts_from_yaml, chunk_text,
|
|
process_upload, UnsupportedDocType, FileTooLarge,
|
|
)
|
|
|
|
|
|
def test_detect_type_yaml():
    """Expect both ``.yml`` and ``.yaml`` filenames to be detected as ``"yaml"``."""
    # Content is empty bytes in both cases; only the filename varies.
    for filename in ("compose.yml", "docker-compose.yaml"):
        assert detect_type(filename, b"") == "yaml"
|
|
|
|
|
|
def test_detect_type_json():
    """Expect a ``.json`` filename to be detected as ``"json"``."""
    detected = detect_type("config.json", b"")
    assert detected == "json"
|
|
|
|
|
|
def test_detect_type_markdown():
    """Expect a ``.md`` filename to be detected as ``"markdown"``."""
    detected = detect_type("runbook.md", b"")
    assert detected == "markdown"
|
|
|
|
|
|
def test_detect_type_text():
    """Expect a ``.txt`` filename to be detected as ``"text"``."""
    detected = detect_type("notes.txt", b"")
    assert detected == "text"
|
|
|
|
|
|
def test_detect_type_unsupported():
    """Expect ``UnsupportedDocType`` for a ``.pdf`` filename."""
    filename = "report.pdf"
    # NOTE(review): ``match`` is applied as a regular-expression search, so the
    # unescaped dot matches any character before "pdf". If a literal ".pdf" is
    # intended, confirm the error message format and switch to r"\.pdf".
    with pytest.raises(UnsupportedDocType, match=".pdf"):
        detect_type(filename, b"")
|
|
|
|
|
|
def test_extract_facts_from_yaml_docker_compose():
    """Expect one ``service`` fact per compose service, including port info.

    The fixture is a docker-compose document with two services (``plex`` and
    ``sonarr``), each declaring an image and a single port mapping.
    """
    # The nesting matters: ``plex`` and ``sonarr`` must be children of
    # ``services`` for the document to describe compose services at all.
    yaml_text = """
services:
  plex:
    image: plexinc/pms-docker
    ports:
      - "32400:32400"
  sonarr:
    image: linuxserver/sonarr
    ports:
      - "8989:8989"
"""
    facts = extract_facts_from_yaml(yaml_text)

    keys = [fact.key for fact in facts]
    assert "plex" in keys
    assert "sonarr" in keys

    plex_fact = next(fact for fact in facts if fact.key == "plex")
    # Only the "port:" marker is pinned. A check for the full
    # "port:32400:32400" string OR-ed with this one adds nothing, because that
    # longer string already contains "port:".
    assert "port:" in plex_fact.value
    assert plex_fact.category == "service"
|
|
|
|
|
|
def test_extract_facts_from_yaml_non_compose():
    """Expect no facts from YAML that has no ``services`` section."""
    assert extract_facts_from_yaml("foo: bar\nbaz: 42\n") == []
|
|
|
|
|
|
def test_extract_facts_from_yaml_invalid():
    """Expect an empty fact list (not an exception) for malformed input."""
    malformed = "{{{{not yaml"
    assert extract_facts_from_yaml(malformed) == []
|
|
|
|
|
|
def test_chunk_text_basic():
    """Expect 600 words to be split into at least two non-blank chunks."""
    text = " ".join("word" for _ in range(600))
    chunks = chunk_text(text, chunk_size=300, overlap=50)

    assert len(chunks) >= 2
    # Every chunk must contain something other than whitespace.
    assert all(chunk.strip() for chunk in chunks)
|
|
|
|
|
|
def test_chunk_text_short():
    """Expect text shorter than one chunk to come back as a single chunk."""
    source = "short text"
    result = chunk_text(source, chunk_size=300, overlap=50)

    assert len(result) == 1
    assert result[0] == "short text"
|
|
|
|
|
|
def test_chunk_text_empty():
    """Expect an empty string to produce an empty chunk list."""
    chunks = chunk_text("")
    assert chunks == []
|
|
|
|
|
|
def test_process_upload_yaml_extracts_facts():
    """Expect a compose upload to yield type ``"yaml"``, an ``nginx`` fact, and chunks.

    ``process_upload`` is called with a ``.yml`` filename and raw bytes, and is
    expected to return a ``(doc_type, facts, chunks)`` triple.
    """
    # The nesting matters: ``nginx`` must be a child of ``services`` for the
    # document to describe a compose service.
    yaml_bytes = b"""
services:
  nginx:
    image: nginx:latest
    ports:
      - "80:80"
"""
    doc_type, facts, chunks = process_upload("docker-compose.yml", yaml_bytes)

    assert doc_type == "yaml"
    assert any(fact.key == "nginx" for fact in facts)
    assert len(chunks) >= 1
|
|
|
|
|
|
def test_process_upload_markdown_no_facts():
    """Expect a markdown upload to yield chunks but no extracted facts."""
    payload = b"# Plex Troubleshooting\n\nRestart the service with systemctl restart plex.\n"
    result = process_upload("plex.md", payload)
    doc_type, facts, chunks = result

    assert doc_type == "markdown"
    assert facts == []
    assert len(chunks) >= 1
|
|
|
|
|
|
def test_process_upload_too_large():
    """Expect ``FileTooLarge`` when a 6 MiB payload is uploaded."""
    six_mib = 6 * 1024 * 1024
    oversized = b"x" * six_mib
    with pytest.raises(FileTooLarge):
        process_upload("big.txt", oversized)
|
|
|
|
|
|
def test_process_upload_unsupported_type():
    """Expect ``UnsupportedDocType`` when a ``.pdf`` file is uploaded."""
    filename, payload = "report.pdf", b"data"
    with pytest.raises(UnsupportedDocType):
        process_upload(filename, payload)
|