fix(corpus): glean_dir now recurses subdirs; fix docker SOURCE prefix
- Changed glob → rglob in glean_dir so corpus directories with format subfolders (journald/, docker/, etc.) are fully ingested.
- Fixed gen_corpus.py docker SOURCE to emit the "docker:<service>" prefix so the pipeline correctly detects the format as 'docker', not 'plaintext'.
- 17/17 gen_corpus tests passing.

Closes: #46
This commit is contained in:
parent
7ab92a5cf4
commit
f8ad0fd453
2 changed files with 2 additions and 2 deletions
|
|
@@ -528,7 +528,7 @@ def glean_dir(
|
|||
Pass ``force=True`` to bypass fingerprint checks and re-glean all files
|
||||
regardless of whether they have changed since the last run.
|
||||
"""
|
||||
files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log"))
|
||||
files = sorted(corpus_dir.rglob("*.jsonl")) + sorted(corpus_dir.rglob("*.log"))
|
||||
return _glean_files(files, db_path, pattern_file, batch_size, force=force)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -284,7 +284,7 @@ def gen_docker(path: Path, start: datetime, end: datetime, rng: random.Random, e
|
|||
service = rng.choice(_DOCKER_SERVICES)
|
||||
msg = _pick_msg(_DOCKER_MESSAGES, severity, rng)
|
||||
entry = {
|
||||
"SOURCE": service,
|
||||
"SOURCE": f"docker:{service}",
|
||||
"MESSAGE": msg,
|
||||
}
|
||||
fh.write(json.dumps(entry) + "\n")
|
||||
|
|
|
|||
Loading…
Reference in a new issue