feat: plex EAE watchdog and plex_eae_failure pattern

Add plex_eae_failure pattern to default.yaml targeting the EasyAudioEncoder
crash signature (EAE timeout + I/O error pair, 5s cadence). Pattern fires
when EAE's WAV handoff files stop appearing in the pms temp directory.

Add watch_plex.py: tail-based watchdog that counts EAE timeout events and
auto-restarts plexmediaserver after N consecutive hits (default 3, ~15s of
failure). Includes cooldown, dry-run mode, and a systemd unit template.
This commit is contained in:
pyr0ball 2026-05-08 13:41:34 -07:00
parent bbe4b1e360
commit 3431be5bfa
2 changed files with 142 additions and 0 deletions

View file

@ -82,6 +82,12 @@ patterns:
description: IP address change or DHCP event description: IP address change or DHCP event
# Add device/service-specific patterns below this line: # Add device/service-specific patterns below this line:
- name: plex_eae_failure
pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
severity: ERROR
description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
# - name: ext_device_device_error # - name: ext_device_device_error
# pattern: "ERR-\d{4}" # pattern: "ERR-\d{4}"
# severity: ERROR # severity: ERROR

136
scripts/watch_plex.py Normal file
View file

@ -0,0 +1,136 @@
"""Watchdog: detect Plex EAE (EasyAudioEncoder) failures and auto-restart.

The EAE daemon handles Dolby EAC3/DDP audio transcoding. When it crashes,
Plex logs 'EAE timeout!' at 5-second intervals until the service is restarted.

Run as root or a user with `systemctl restart plexmediaserver` permission.

Usage:
    python scripts/watch_plex.py [--log PATH] [--threshold N] [--cooldown SECS] [--dry-run]

Systemd unit (deploy to /etc/systemd/system/turnstone-plex-watchdog.service):

    [Unit]
    Description=Turnstone Plex EAE watchdog
    After=plexmediaserver.service

    [Service]
    ExecStart=/usr/bin/python3 /opt/turnstone/scripts/watch_plex.py
    Restart=always
    RestartSec=10

    [Install]
    WantedBy=multi-user.target
"""
from __future__ import annotations
import argparse
import logging
import subprocess
import sys
import time
from pathlib import Path
# Root logging setup: timestamped records at INFO level and above.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Named logger shared by every function in this script.
logger = logging.getLogger("plex-watchdog")

# Log file that is tailed when --log is not supplied on the command line.
DEFAULT_LOG = (
    Path("/var/lib/plexmediaserver")
    / "Library"
    / "Application Support"
    / "Plex Media Server"
    / "Logs"
    / "Plex Media Server.log"
)

# Substring that marks a log line as one EAE timeout event.
EAE_TRIGGER = "EAE timeout!"

# systemd unit name passed to `systemctl restart`.
SERVICE = "plexmediaserver"
def _restart(dry_run: bool, timeout: float = 120.0) -> bool:
    """Restart the monitored service via ``systemctl restart``.

    Args:
        dry_run: When true, only log the command that would be executed and
            report success without running anything.
        timeout: Maximum number of seconds to wait for ``systemctl`` to
            return before giving up on this attempt.

    Returns:
        True when the command exited with status 0 (or in dry-run mode).
        False when it exited non-zero, timed out, or could not be launched.
        Failures are logged rather than raised, so the caller's watch loop
        keeps running and can retry on a later event.
    """
    cmd = ["systemctl", "restart", SERVICE]
    if dry_run:
        logger.info("[dry-run] would run: %s", " ".join(cmd))
        return True

    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        # A hung systemctl must not freeze the watchdog indefinitely.
        logger.error("Restart timed out after %.0fs", timeout)
        return False
    except OSError as exc:
        # systemctl missing or not executable: report it as a failed restart
        # instead of letting the exception kill the watchdog.
        logger.error("Restart could not be launched: %s", exc)
        return False

    if result.returncode == 0:
        logger.info("Restart successful")
        return True

    logger.error(
        "Restart failed (exit %d): %s", result.returncode, result.stderr.strip()
    )
    return False
def _tail_f(path: Path):
"""Yield new lines from a growing log file, blocking between reads."""
with open(path, errors="replace") as f:
f.seek(0, 2) # start at end — don't replay history on startup
while True:
line = f.readline()
if line:
yield line
else:
time.sleep(0.25)
def watch(
    log_path: Path,
    threshold: int,
    cooldown: int,
    dry_run: bool,
    max_gap: float = 30.0,
) -> None:
    """Tail the log and restart the service after a run of EAE timeouts.

    Each line containing ``EAE_TRIGGER`` counts as one timeout. Once
    ``threshold`` timeouts have been seen in a row, ``_restart`` is called
    unless a previous successful restart happened less than ``cooldown``
    seconds ago. The module docstring describes timeouts arriving at 5-second
    intervals during a real failure, so a silence longer than ``max_gap``
    ends the current run and the count starts again from one; without this,
    unrelated timeouts hours apart would eventually add up to a restart.

    Args:
        log_path: Log file to follow.
        threshold: Number of timeouts in one run that triggers a restart.
        cooldown: Minimum seconds between two successful restarts.
        dry_run: Passed through to ``_restart``; log instead of restarting.
        max_gap: Longest pause, in seconds, between two timeouts that still
            counts as the same run.

    This function loops for as long as the tail generator keeps yielding.
    """
    logger.info(
        "Watching %s | threshold=%d EAE timeouts | cooldown=%ds%s",
        log_path, threshold, cooldown, " | DRY RUN" if dry_run else "",
    )

    eae_count = 0
    # Monotonic timestamps, immune to wall-clock adjustments. None means
    # "has not happened yet", so the first restart is never held back by the
    # cooldown even when the monotonic clock itself is still small.
    last_hit = None
    last_restart = None

    for raw in _tail_f(log_path):
        line = raw.strip()

        if "Started plexmediaserver" in line or "Starting plexmediaserver" in line:
            if eae_count > 0:
                logger.info("Service (re)started — resetting EAE counter")
            eae_count = 0
            last_hit = None
            continue

        if EAE_TRIGGER not in line:
            continue

        now = time.monotonic()
        if last_hit is not None and now - last_hit > max_gap:
            if eae_count > 0:
                logger.info(
                    "No EAE timeout for %.0fs — resetting EAE counter",
                    now - last_hit,
                )
            eae_count = 0
        last_hit = now

        eae_count += 1
        logger.warning("EAE timeout #%d detected", eae_count)
        if eae_count < threshold:
            continue

        if last_restart is not None:
            remaining = cooldown - (now - last_restart)
            if remaining > 0:
                logger.warning(
                    "Threshold reached but cooldown active (%.0fs remaining) — skipping restart",
                    remaining,
                )
                continue

        logger.error(
            "EAE failure confirmed (%d timeouts) — restarting %s", eae_count, SERVICE
        )
        # On failure the counter is left at or above the threshold so that
        # the next timeout triggers another attempt.
        if _restart(dry_run):
            last_restart = now
            eae_count = 0
def main() -> None:
    """Command-line entry point: parse options, then run the watch loop.

    Exits with status 1 when the chosen log file does not exist. Ctrl-C
    during watching is logged and ends the function normally.
    """
    cli = argparse.ArgumentParser(description="Plex EAE watchdog")
    option_specs = (
        ("--log", {"type": Path, "default": DEFAULT_LOG, "help": "Plex server log path"}),
        ("--threshold", {"type": int, "default": 3, "help": "EAE timeouts before restart (default 3)"}),
        ("--cooldown", {"type": int, "default": 300, "help": "Seconds between restarts (default 300)"}),
        ("--dry-run", {"action": "store_true", "help": "Log what would happen without restarting"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    log_file = opts.log
    if log_file.exists():
        try:
            watch(log_file, opts.threshold, opts.cooldown, opts.dry_run)
        except KeyboardInterrupt:
            logger.info("Watchdog stopped")
        return

    logger.error("Log file not found: %s", log_file)
    sys.exit(1)
# Run the watchdog only when executed as a script, not when imported.
if __name__ == "__main__":
    main()