fix(corpus): glean_dir now recurses subdirs; fix docker SOURCE prefix

- Changed glob → rglob in glean_dir so corpus directories with format
  subfolders (journald/, docker/, etc.) are fully ingested
- Fixed gen_corpus.py docker SOURCE to emit "docker:<service>" prefix
  so the pipeline correctly detects format as 'docker' not 'plaintext'
- 17/17 gen_corpus tests passing

Closes: #46
This commit is contained in:
pyr0ball 2026-06-11 16:30:28 -07:00
parent 99b44ddb81
commit c797f68d4b
2 changed files with 2 additions and 2 deletions

View file

@@ -528,7 +528,7 @@ def glean_dir(
Pass ``force=True`` to bypass fingerprint checks and re-glean all files
regardless of whether they have changed since the last run.
"""
-files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log"))
+files = sorted(corpus_dir.rglob("*.jsonl")) + sorted(corpus_dir.rglob("*.log"))
return _glean_files(files, db_path, pattern_file, batch_size, force=force)

View file

@@ -284,7 +284,7 @@ def gen_docker(path: Path, start: datetime, end: datetime, rng: random.Random, e
service = rng.choice(_DOCKER_SERVICES)
msg = _pick_msg(_DOCKER_MESSAGES, severity, rng)
entry = {
-"SOURCE": service,
+"SOURCE": f"docker:{service}",
"MESSAGE": msg,
}
fh.write(json.dumps(entry) + "\n")