feat(corpus): pipeline log ingest from shared dir (closes #67)
Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone logreading model training.

- app/data/log_corpus.py: add the ingested_pipeline_files tracking table, the _pipeline_ingest_dir() config helper, the _ingest_one_file() parser, and the POST /api/corpus/pipeline-ingest endpoint
- Entries get source_host = "pipeline_scrape" and source_id from the record's logger field; the extra dict is stored as matched_patterns; batches get batch_type = "pipeline_log"
- Idempotent by filename: skips files already recorded in ingested_pipeline_files
- config/label_tool.yaml.example: add a corpus section with pipeline_ingest_dir and a commented-out push sources block
- tests/test_log_corpus.py: 8 new tests covering ingest, idempotency, non-JSONL filtering, malformed-line resilience, and incremental runs
This commit is contained in:
parent
13ca082a43
commit
9bb88b168f
3 changed files with 308 additions and 0 deletions
|
|
@ -34,6 +34,8 @@ router = APIRouter()
|
|||
|
||||
_DB_PATH: Path = _ROOT / "data" / "corpus.db"
|
||||
|
||||
# source_host value written on every corpus_batches and corpus_entries row
# created by the pipeline-log ingest path.
_PIPELINE_SOURCE_HOST = "pipeline_scrape"
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS corpus_sources (
|
||||
token TEXT PRIMARY KEY,
|
||||
|
|
@ -77,6 +79,12 @@ CREATE TABLE IF NOT EXISTS corpus_entries (
|
|||
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
|
||||
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
|
||||
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ingested_pipeline_files (
|
||||
filename TEXT PRIMARY KEY,
|
||||
ingested_at TEXT NOT NULL,
|
||||
entry_count INTEGER NOT NULL
|
||||
);
|
||||
"""
|
||||
|
||||
|
||||
|
|
@ -122,6 +130,19 @@ def _init_db() -> None:
|
|||
_seed_sources(conn)
|
||||
|
||||
|
||||
def _pipeline_ingest_dir() -> Path | None:
    """Return the configured pipeline log ingest directory, or None if unset.

    Reads ``corpus.pipeline_ingest_dir`` from the file returned by
    ``_config_file()``.

    Returns:
        The configured directory as a ``Path``, or ``None`` when the config
        file is missing, is not valid YAML, does not have the expected
        mapping shape, or leaves the setting empty / non-string.
    """
    f = _config_file()
    if not f.exists():
        return None
    try:
        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError:
        return None

    # A top-level list/scalar, or a bare ``corpus:`` key (loaded as None),
    # must read as "not configured" instead of raising AttributeError.
    corpus = raw.get("corpus") if isinstance(raw, dict) else None
    if not isinstance(corpus, dict):
        return None

    val = corpus.get("pipeline_ingest_dir")
    # Path() rejects non-string values such as numbers or lists.
    if not isinstance(val, str) or not val:
        return None
    return Path(val)
|
||||
|
||||
|
||||
def _load_corpus_config() -> list[dict]:
|
||||
f = _config_file()
|
||||
if not f.exists():
|
||||
|
|
@ -350,3 +371,92 @@ def export_labeled() -> StreamingResponse:
|
|||
media_type="application/x-ndjson",
|
||||
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
|
||||
)
|
||||
|
||||
|
||||
# ── POST /api/corpus/pipeline-ingest ─────────────────────────────────────────
|
||||
|
||||
def _ingest_one_file(conn: sqlite3.Connection, path: Path) -> int:
    """Parse a pipeline JSONL file and insert its entries into the corpus.

    Writes one ``corpus_batches`` row for the file, one ``corpus_entries``
    row per usable record, and finally one ``ingested_pipeline_files`` row
    keyed by the file name. This function does not commit.

    A record is usable when its line parses to a JSON object whose ``msg``
    is a non-blank string. Anything else is skipped without failing the file.

    Args:
        conn: Open SQLite connection to the corpus database.
        path: JSONL file to read, one record per line.

    Returns:
        Number of rows actually inserted into ``corpus_entries``.

    Raises:
        OSError: If the file cannot be opened or read. The whole file is
            read and parsed before the first row is written.
    """
    batch_id = str(uuid.uuid4())

    records: list[dict] = []
    # Iterating the file handle splits on real newlines only. str.splitlines()
    # also breaks on U+2028/U+2029/U+0085, which may appear unescaped inside a
    # JSON string and would cut a valid record in half. Undecodable bytes are
    # replaced so one damaged line cannot abort the whole file.
    with path.open(encoding="utf-8", errors="replace") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                logger.debug("Skipping malformed line in %s", path.name)
                continue
            if not isinstance(parsed, dict):
                # Valid JSON that is not an object (list, number, null, ...).
                logger.debug("Skipping non-object record in %s", path.name)
                continue
            records.append(parsed)

    conn.execute(
        "INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (batch_id, _PIPELINE_SOURCE_HOST, "pipeline_log", _now_iso(),
         len(records), json.dumps({"file": path.name})),
    )

    stored = 0
    for entry in records:
        msg = entry.get("msg")
        if not isinstance(msg, str):
            # Missing or non-text message: nothing to store as entry text.
            continue
        text = msg.strip()
        if not text:
            continue
        extra = entry.get("extra")
        cursor = conn.execute(
            "INSERT OR IGNORE INTO corpus_entries "
            "(id, batch_id, source_host, timestamp_iso, severity, source_id, text, matched_patterns) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (str(uuid.uuid4()), batch_id, _PIPELINE_SOURCE_HOST,
             entry.get("ts"),
             entry.get("level"),
             entry.get("logger"),
             text,
             json.dumps([extra] if extra else [])),
        )
        # OR IGNORE reports rowcount 0 when a constraint drops the row, so
        # only rows that were really written are counted.
        if cursor.rowcount > 0:
            stored += 1

    conn.execute(
        "INSERT INTO ingested_pipeline_files (filename, ingested_at, entry_count) VALUES (?, ?, ?)",
        (path.name, _now_iso(), stored),
    )
    return stored
|
||||
|
||||
|
||||
@router.post("/pipeline-ingest")
def pipeline_ingest() -> dict:
    """Walk the configured pipeline log directory and ingest new JSONL files.

    Skips files already recorded in ingested_pipeline_files. Safe to call
    repeatedly — idempotent by filename. Files are processed in sorted
    name order.

    Returns:
        A dict with ``ingested_files``, ``skipped_files``, ``failed_files``
        and ``entries_stored`` counters, plus ``files``: one
        ``{"file", "entries_stored"}`` item per newly ingested file.

    Raises:
        HTTPException: 404 when no pipeline ingest directory is configured.
    """
    ingest_dir = _pipeline_ingest_dir()
    if ingest_dir is None:
        raise HTTPException(404, "pipeline_ingest_dir not configured in label_tool.yaml")

    ingested = 0
    skipped = 0
    failed = 0
    total_stored = 0
    files_detail: list[dict] = []

    with _db() as conn:
        already_done: set[str] = {
            row[0]
            for row in conn.execute("SELECT filename FROM ingested_pipeline_files").fetchall()
        }

        for path in sorted(ingest_dir.glob("*.jsonl")):
            if not path.is_file():
                # The glob also matches directories named *.jsonl.
                continue
            if path.name in already_done:
                skipped += 1
                continue
            try:
                stored = _ingest_one_file(conn, path)
            except (OSError, UnicodeDecodeError) as exc:
                # The file is read before any row is written for it, so there
                # is nothing to undo. It stays unrecorded and is retried on
                # the next call.
                logger.warning("Pipeline ingest: could not read %s: %s", path.name, exc)
                failed += 1
                continue
            ingested += 1
            total_stored += stored
            files_detail.append({"file": path.name, "entries_stored": stored})

    logger.info("Pipeline ingest: %d files ingested, %d skipped, %d entries stored",
                ingested, skipped, total_stored)
    return {
        "ingested_files": ingested,
        "skipped_files": skipped,
        "failed_files": failed,
        "entries_stored": total_stored,
        "files": files_detail,
    }
|
||||
|
|
|
|||
|
|
@ -122,6 +122,22 @@ imitate:
|
|||
text_fields: [title]
|
||||
prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
|
||||
|
||||
# ── Log corpus (Turnstone training data) ──────────────────────────────────────
|
||||
corpus:
|
||||
# Directory containing pipeline JSONL log files to ingest (pull-side).
|
||||
# Files named <script>_<ts>.jsonl; one structured record per line.
|
||||
# POST /api/corpus/pipeline-ingest walks this dir and imports new files.
|
||||
# NFS-mounted on both Heimdall and Sif at /Library/Assets/
|
||||
pipeline_ingest_dir: /Library/Assets/logs/pipeline/
|
||||
|
||||
# Turnstone push sources (consent-gated, token-authenticated).
|
||||
# sources:
|
||||
# - token: "your-bearer-token"
|
||||
# source_host: "node.local"
|
||||
# owner: YourName
|
||||
# consent_date: "2026-05-17"
|
||||
# consent_method: signal_chat
|
||||
|
||||
# ── Embedding model comparison harness ────────────────────────────────────────
|
||||
embed_bench:
|
||||
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url
|
||||
|
|
|
|||
|
|
@ -270,3 +270,185 @@ def test_export_excludes_pii_flagged(client):
|
|||
|
||||
resp = client.get("/api/corpus/export")
|
||||
assert resp.text.strip() == ""
|
||||
|
||||
|
||||
# ── Pipeline ingest endpoint ───────────────────────────────────────────────────
|
||||
|
||||
def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
|
||||
"""Write a JSONL pipeline log file to directory."""
|
||||
p = directory / name
|
||||
p.write_text("\n".join(json.dumps(l) for l in lines), encoding="utf-8")
|
||||
return p
|
||||
|
||||
|
||||
_PIPELINE_LINE = {
|
||||
"ts": "2026-05-17T10:00:00Z",
|
||||
"level": "INFO",
|
||||
"logger": "scripts.pipeline.purple_carrot_scraper",
|
||||
"msg": "Fetched recipe page",
|
||||
"extra": {"url": "https://example.com/recipe/1", "status": 200},
|
||||
}
|
||||
|
||||
|
||||
def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path, monkeypatch):
    """No pipeline_ingest_dir in config — endpoint returns 404."""
    # Point the module at an empty config directory so the outcome cannot
    # depend on whatever label_tool.yaml is otherwise in effect.
    empty_config_dir = tmp_path / "empty_config"
    empty_config_dir.mkdir()
    monkeypatch.setattr(lc, "_CONFIG_DIR", empty_config_dir)

    resp = client.post("/api/corpus/pipeline-ingest")
    assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
    """An existing but empty ingest directory gives a 200 with all counters at zero."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")

    assert response.status_code == 200
    payload = response.json()
    assert payload["ingested_files"] == 0
    assert payload["skipped_files"] == 0
    assert payload["entries_stored"] == 0
|
||||
|
||||
|
||||
def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
    """A well-formed JSONL file is ingested and its records show up as corpus entries."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()
    records = [
        _PIPELINE_LINE,
        {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"},
    ]
    _make_pipeline_file(log_dir, "scraper_20260517.jsonl", records)

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    payload = response.json()
    assert payload["ingested_files"] == 1
    assert payload["entries_stored"] == 2

    listing = client.get("/api/corpus/entries", params={"limit": 10})
    entries = listing.json()["entries"]
    assert len(entries) == 2
    for item in entries:
        assert item["source_host"] == "pipeline_scrape"
|
||||
|
||||
|
||||
def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
    """source_id is populated from the 'logger' field of each log line."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_20260517.jsonl", [_PIPELINE_LINE])

    config_dir = tmp_path / "config"
    config_dir.mkdir(exist_ok=True)
    (config_dir / "label_tool.yaml").write_text(
        f"corpus:\n pipeline_ingest_dir: \"{ingest_dir}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)

    # Check the ingest itself first so a failure here is reported as such,
    # not as an IndexError on the empty entries list below.
    resp = client.post("/api/corpus/pipeline-ingest")
    assert resp.status_code == 200

    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
    assert len(entries) == 1
    assert entries[0]["source_id"] == "scripts.pipeline.purple_carrot_scraper"
|
||||
|
||||
|
||||
def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
    """A second call skips the file ingested by the first and stores nothing new."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()
    _make_pipeline_file(log_dir, "scraper_20260517.jsonl", [_PIPELINE_LINE])

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    endpoint = "/api/corpus/pipeline-ingest"
    client.post(endpoint)  # first run ingests the file
    second = client.post(endpoint)

    payload = second.json()
    assert payload["ingested_files"] == 0
    assert payload["skipped_files"] == 1
    assert payload["entries_stored"] == 0

    # The corpus still holds only the entry from the first run.
    listing = client.get("/api/corpus/entries", params={"limit": 10})
    entries = listing.json()["entries"]
    assert len(entries) == 1
|
||||
|
||||
|
||||
def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
    """Only the .jsonl file is ingested; other files in the directory are ignored."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()
    # Two files the endpoint must ignore, plus one it must ingest.
    (log_dir / "notes.txt").write_text("this is not a log file")
    (log_dir / "run.csv").write_text("a,b,c\n1,2,3")
    _make_pipeline_file(log_dir, "valid_20260517.jsonl", [_PIPELINE_LINE])

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    payload = response.json()
    assert payload["ingested_files"] == 1
|
||||
|
||||
|
||||
def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
    """A non-JSON line is dropped while the valid lines around it are still stored."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()

    first_valid = json.dumps(_PIPELINE_LINE)
    second_valid = json.dumps({**_PIPELINE_LINE, "msg": "another valid line"})
    mixed_file = log_dir / "mixed_20260517.jsonl"
    mixed_file.write_text(
        first_valid + "\n" + "this is not json\n" + second_valid,
        encoding="utf-8",
    )

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    # Two valid lines stored, one malformed line skipped.
    assert response.json()["entries_stored"] == 2
|
||||
|
||||
|
||||
def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
    """A file that appears after the first run is ingested by the next call."""
    log_dir = tmp_path / "pipeline_logs"
    log_dir.mkdir()
    _make_pipeline_file(log_dir, "run_a.jsonl", [_PIPELINE_LINE])

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    cfg_text = f'corpus:\n pipeline_ingest_dir: "{log_dir}"\n sources: []\n'
    (cfg_dir / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    endpoint = "/api/corpus/pipeline-ingest"
    client.post(endpoint)  # first run picks up run_a.jsonl

    later_record = {**_PIPELINE_LINE, "msg": "Second run line"}
    _make_pipeline_file(log_dir, "run_b.jsonl", [later_record])

    second = client.post(endpoint)
    payload = second.json()
    assert payload["ingested_files"] == 1
    assert payload["skipped_files"] == 1
    assert payload["entries_stored"] == 1
|
||||
|
|
|
|||
Loading…
Reference in a new issue