# turnstone/app/glean/plaintext.py

"""Generic plain-text log parser — fallback for unrecognized formats.

Attempts to extract a timestamp and severity from each line using common
patterns (syslog, ISO 8601, nginx/apache). Lines that don't match any
timestamp pattern are still ingested as plain text with no timestamp.
"""
from __future__ import annotations

import re
from datetime import datetime, timezone
from typing import Iterator

from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

# Timestamp recognisers, tried in order (most specific first).  Each entry is
# (regex anchored at the start of the line exposing a named group "ts",
# strptime format for the date/time part).  Fractional seconds and the UTC
# designator/offset are deliberately NOT part of the format: _parse_ts()
# splits them off the matched text and handles them the same way for every
# entry, so the formats here use a space separator and carry no "%z".
_TS_PATTERNS: list[tuple[re.Pattern, str]] = [
    # ISO 8601: 2026-05-07T14:23:01.123Z or 2026-05-07 14:23:01
    (re.compile(r"^(?P<ts>\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)"), "%Y-%m-%d %H:%M:%S"),
    # Syslog: May  7 14:23:01 (no year, no zone)
    (re.compile(r"^(?P<ts>\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})"), "%b %d %H:%M:%S"),
    # Common log: 07/May/2026:14:23:01 +0000
    # NOTE(review): anchored at line start, so a line whose timestamp is
    # preceded by other fields (e.g. inside "[...]") will not match -- confirm
    # this is intended.
    (re.compile(r"^(?P<ts>\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}\s+[+-]\d{4})"), "%d/%b/%Y:%H:%M:%S"),
]

# Tail of a matched timestamp: optional fractional seconds, optional
# whitespace, then an optional "Z" or numeric UTC offset.
_TS_TAIL = re.compile(r"(?:\.\d+)?\s*(?P<zone>[Zz]|[+-]\d{2}:?\d{2})?$")


def _parse_ts(ts_raw: str, fmt: str) -> datetime:
    """Parse the text matched by a ``_TS_PATTERNS`` regex into a datetime.

    Fractional seconds are dropped (whole-second resolution).  An explicit
    "Z" or numeric offset is honoured and yields an aware datetime; without
    one the result is naive.  Formats that carry no year are parsed against
    the current local year.

    Raises:
        ValueError: if the text is not a valid date/time for ``fmt``.
    """
    tail = _TS_TAIL.search(ts_raw)  # always matches: every part is optional
    stamp = ts_raw[: tail.start()].replace("T", " ")
    zone = tail.group("zone")

    if "%Y" not in fmt:
        # Supply the year up front instead of patching strptime's 1900
        # default afterwards: "Feb 29" then parses in leap years and a
        # genuine year-1900 timestamp is left alone.
        # NOTE: entries written late in December and ingested in January
        # will be attributed to the new year.
        stamp = f"{datetime.now().year} {stamp}"
        fmt = f"%Y {fmt}"
    if zone:
        # %z accepts "Z", "+HHMM" and "+HH:MM"; upper-case so "z" works too.
        stamp += zone.upper()
        fmt += "%z"
    return datetime.strptime(stamp, fmt)


def _extract_ts(line: str) -> tuple[str, str]:
    """Find a leading timestamp in *line* and normalise it to UTC.

    Patterns from ``_TS_PATTERNS`` are tried in order; the first one whose
    match also parses as a valid date/time wins.

    Returns:
        ``(raw, iso)`` where *raw* is the timestamp text exactly as it
        appears in the line and *iso* is the same instant as an ISO 8601
        string in UTC.  Timestamps without an explicit zone are interpreted
        in the machine's local zone.  ``("", "")`` if nothing usable is found.
    """
    for pattern, fmt in _TS_PATTERNS:
        m = pattern.match(line)
        if m is None:
            continue
        ts_raw = m.group("ts")
        try:
            # astimezone() treats a naive value as local time and can raise
            # OverflowError/OSError for instants the platform cannot convert.
            dt = _parse_ts(ts_raw, fmt).astimezone(timezone.utc)
        except (ValueError, OverflowError, OSError):
            # Looked like a timestamp but is not a real one; try the next.
            continue
        return ts_raw, dt.isoformat()
    return "", ""


def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Turn raw text lines into ``RetrievedEntry`` records, one per non-blank line.

    Args:
        lines: Raw lines; surrounding whitespace is stripped and lines that
            are empty afterwards are skipped.
        source_id: Identifier stored on every entry and fed to
            ``make_entry_id``.
        compiled_patterns: Passed through unchanged to ``apply_patterns``.
        ingest_time: Value stored as each entry's ``ingest_time``; when
            falsy, ``now_iso()`` is called once and reused for all entries.

    Yields:
        One ``RetrievedEntry`` per non-blank line, in input order.
    """
    if not ingest_time:
        ingest_time = now_iso()
    tracker = SourceState()

    for stripped in (raw.strip() for raw in lines):
        if stripped == "":
            continue

        raw_stamp, iso_stamp = _extract_ts(stripped)
        # An empty ISO string means "no timestamp"; downstream gets None.
        iso_or_none = iso_stamp or None
        level = detect_severity(stripped)
        # observe() must run before tracker.sequence is read for this line.
        repeats, is_out_of_order = tracker.observe(stripped, iso_or_none)
        hits = apply_patterns(stripped, compiled_patterns)

        entry = RetrievedEntry(
            entry_id=make_entry_id(source_id, tracker.sequence, stripped),
            source_id=source_id,
            sequence=tracker.sequence,
            timestamp_raw=raw_stamp,
            timestamp_iso=iso_or_none,
            ingest_time=ingest_time,
            severity=level,
            repeat_count=repeats,
            out_of_order=is_out_of_order,
            matched_patterns=hits,
            text=stripped,
        )
        yield entry