From 74c9de9ccfa2ed5deeaeb81da20064ee0d843b12 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Thu, 11 Jun 2026 16:30:28 -0700
Subject: [PATCH] fix(corpus): glean_dir now recurses subdirs; fix docker SOURCE prefix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Changed glob → rglob in glean_dir so corpus directories with format subfolders (journald/, docker/, etc.) are fully ingested
- Fixed gen_corpus.py docker SOURCE to emit "docker:" prefix so the pipeline correctly detects format as 'docker' not 'plaintext'
- 17/17 gen_corpus tests passing

Closes: https://git.opensourcesolarpunk.com/Circuit-Forge/turnstone/issues/46
---
 app/glean/pipeline.py | 2 +-
 scripts/gen_corpus.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/glean/pipeline.py b/app/glean/pipeline.py
index 2cb3184..d6a99a6 100644
--- a/app/glean/pipeline.py
+++ b/app/glean/pipeline.py
@@ -528,7 +528,7 @@ def glean_dir(
     Pass ``force=True`` to bypass fingerprint checks and re-glean all files
     regardless of whether they have changed since the last run.
     """
-    files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log"))
+    files = sorted(corpus_dir.rglob("*.jsonl")) + sorted(corpus_dir.rglob("*.log"))
     return _glean_files(files, db_path, pattern_file, batch_size, force=force)
 
 
diff --git a/scripts/gen_corpus.py b/scripts/gen_corpus.py
index fef20e9..bdc4056 100644
--- a/scripts/gen_corpus.py
+++ b/scripts/gen_corpus.py
@@ -284,7 +284,7 @@ def gen_docker(path: Path, start: datetime, end: datetime, rng: random.Random, e
             service = rng.choice(_DOCKER_SERVICES)
             msg = _pick_msg(_DOCKER_MESSAGES, severity, rng)
             entry = {
-                "SOURCE": service,
+                "SOURCE": f"docker:{service}",
                 "MESSAGE": msg,
             }
             fh.write(json.dumps(entry) + "\n")