diff --git a/app/glean/pipeline.py b/app/glean/pipeline.py index 2cb3184..d6a99a6 100644 --- a/app/glean/pipeline.py +++ b/app/glean/pipeline.py @@ -528,7 +528,7 @@ def glean_dir( Pass ``force=True`` to bypass fingerprint checks and re-glean all files regardless of whether they have changed since the last run. """ - files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log")) + files = sorted(corpus_dir.rglob("*.jsonl")) + sorted(corpus_dir.rglob("*.log")) return _glean_files(files, db_path, pattern_file, batch_size, force=force) diff --git a/scripts/gen_corpus.py b/scripts/gen_corpus.py index fef20e9..bdc4056 100644 --- a/scripts/gen_corpus.py +++ b/scripts/gen_corpus.py @@ -284,7 +284,7 @@ def gen_docker(path: Path, start: datetime, end: datetime, rng: random.Random, e service = rng.choice(_DOCKER_SERVICES) msg = _pick_msg(_DOCKER_MESSAGES, severity, rng) entry = { - "SOURCE": service, + "SOURCE": f"docker:{service}", "MESSAGE": msg, } fh.write(json.dumps(entry) + "\n")