feat: plex EAE watchdog and plex_eae_failure pattern
Add plex_eae_failure pattern to default.yaml targeting the EasyAudioEncoder crash signature (EAE timeout + I/O error pair, 5s cadence). Pattern fires when EAE's WAV handoff files stop appearing in the pms temp directory. Add watch_plex.py: tail-based watchdog that counts EAE timeout events and auto-restarts plexmediaserver after N consecutive hits (default 3, ~15s of failure). Includes cooldown, dry-run mode, and a systemd unit template.
This commit is contained in:
parent
64c3996aa1
commit
8db8810667
2 changed files with 142 additions and 0 deletions
|
|
@ -82,6 +82,12 @@ patterns:
|
|||
description: IP address change or DHCP event
|
||||
|
||||
# Add device/service-specific patterns below this line:
|
||||
|
||||
- name: plex_eae_failure
|
||||
pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
|
||||
severity: ERROR
|
||||
description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
|
||||
|
||||
# - name: avcx_device_error
|
||||
# pattern: 'ERR-\d{4}'
|
||||
# severity: ERROR
|
||||
|
|
|
|||
136
scripts/watch_plex.py
Normal file
136
scripts/watch_plex.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
"""Watchdog: detect Plex EAE (EasyAudioEncoder) failures and auto-restart.
|
||||
|
||||
The EAE daemon handles Dolby EAC3/DDP audio transcoding. When it crashes,
|
||||
Plex logs 'EAE timeout!' at 5-second intervals until the service is restarted.
|
||||
|
||||
Run as root or a user with `systemctl restart plexmediaserver` permission.
|
||||
|
||||
Usage:
|
||||
python scripts/watch_plex.py [--log PATH] [--threshold N] [--cooldown SECS] [--dry-run]
|
||||
|
||||
Systemd unit (deploy to /etc/systemd/system/turnstone-plex-watchdog.service):
|
||||
[Unit]
|
||||
Description=Turnstone Plex EAE watchdog
|
||||
After=plexmediaserver.service
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/bin/python3 /opt/turnstone/scripts/watch_plex.py
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Root logging configuration: INFO and above, one timestamped line per record.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger("plex-watchdog")

# Plex server log that is tailed when no --log option is given.
DEFAULT_LOG = (
    Path("/var/lib/plexmediaserver/Library/Application Support")
    / "Plex Media Server"
    / "Logs"
    / "Plex Media Server.log"
)

# Substring that marks a log line as an EAE timeout event.
EAE_TRIGGER = "EAE timeout!"

# Name of the systemd unit that gets restarted.
SERVICE = "plexmediaserver"
|
||||
|
||||
|
||||
def _restart(dry_run: bool) -> bool:
    """Restart the Plex service through ``systemctl``.

    Args:
        dry_run: When true, only log the command that would be executed.

    Returns:
        True if the restart command exited with status 0 (or in dry-run
        mode); False if it failed, timed out, or could not be launched.
    """
    cmd = ["systemctl", "restart", SERVICE]
    if dry_run:
        logger.info("[dry-run] would run: %s", " ".join(cmd))
        return True

    # Bound the wait so a hung systemctl cannot block the watchdog forever.
    timeout_s = 180
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout_s
        )
    except subprocess.TimeoutExpired:
        logger.error("Restart timed out after %ds", timeout_s)
        return False
    except OSError as exc:
        # systemctl missing or not executable: report failure to the caller
        # instead of crashing the watchdog process.
        logger.error("Restart failed: could not run %s: %s", cmd[0], exc)
        return False

    if result.returncode == 0:
        logger.info("Restart successful")
        return True

    logger.error(
        "Restart failed (exit %d): %s", result.returncode, result.stderr.strip()
    )
    return False
|
||||
|
||||
|
||||
def _tail_f(path: Path):
    """Yield new lines from a growing log file, blocking between reads.

    Reading starts at the current end of the file, so history is not
    replayed on startup. The generator never finishes on its own.

    The file is followed by name: if the file at ``path`` is replaced
    (rename-style rotation) the new file is opened and read from its
    beginning; if it is truncated in place, reading resumes from offset 0.

    Args:
        path: Log file to follow. It must exist when iteration starts.

    Yields:
        Each line appended to the file, including its trailing newline
        when one is present.
    """
    import os  # local import keeps this block self-contained

    handle = open(path, encoding="utf-8", errors="replace")
    handle.seek(0, 2)  # start at end — don't replay history on startup
    try:
        while True:
            line = handle.readline()
            if line:
                yield line
                continue

            time.sleep(0.25)

            try:
                on_disk = os.stat(path)
            except OSError:
                # Path is briefly missing (e.g. mid-rotation); keep polling.
                continue

            opened = os.fstat(handle.fileno())
            replaced = (on_disk.st_dev, on_disk.st_ino) != (
                opened.st_dev,
                opened.st_ino,
            )
            if replaced:
                # Drain anything written to the old file before switching.
                leftover = handle.readline()
                if leftover:
                    yield leftover
                    continue
                try:
                    fresh = open(path, encoding="utf-8", errors="replace")
                except OSError:
                    continue
                handle.close()
                handle = fresh  # new file: everything in it is new content
            elif opened.st_size < handle.buffer.tell():
                # We are at EOF here, so the byte offset is exact; a smaller
                # file size means the file was truncated in place.
                handle.seek(0)
    finally:
        handle.close()
|
||||
|
||||
|
||||
def watch(
    log_path: Path,
    threshold: int,
    cooldown: int,
    dry_run: bool,
    max_gap: float = 60.0,
) -> None:
    """Follow the Plex log and restart the service on repeated EAE timeouts.

    Each line containing ``EAE_TRIGGER`` increments a counter. When the
    counter reaches ``threshold`` and no restart happened within the last
    ``cooldown`` seconds, the service is restarted and the counter is reset.
    This function blocks for as long as the underlying tail keeps yielding.

    Args:
        log_path: Plex server log file to follow.
        threshold: Number of consecutive EAE timeouts required for a restart.
        cooldown: Minimum number of seconds between two restarts.
        dry_run: Log the restart command instead of executing it.
        max_gap: Maximum number of seconds between two timeouts for them to
            count as consecutive. A longer quiet period resets the counter,
            so isolated timeouts spread over a long time never add up to a
            restart.
    """
    logger.info(
        "Watching %s | threshold=%d EAE timeouts | cooldown=%ds%s",
        log_path, threshold, cooldown, " | DRY RUN" if dry_run else "",
    )

    eae_count = 0
    # Monotonic timestamps; None means "has not happened yet". A monotonic
    # clock keeps the cooldown correct across wall-clock adjustments.
    last_hit = None
    last_restart = None

    for line in _tail_f(log_path):
        line = line.strip()

        if "Started plexmediaserver" in line or "Starting plexmediaserver" in line:
            if eae_count > 0:
                logger.info("Service (re)started — resetting EAE counter")
            eae_count = 0
            continue

        if EAE_TRIGGER not in line:
            continue

        now = time.monotonic()
        if last_hit is not None and now - last_hit > max_gap:
            # The previous streak ended without reaching a restart.
            if eae_count > 0:
                logger.info(
                    "No EAE timeout for %.0fs — resetting EAE counter",
                    now - last_hit,
                )
            eae_count = 0
        last_hit = now

        eae_count += 1
        logger.warning("EAE timeout #%d detected", eae_count)

        if eae_count < threshold:
            continue

        if last_restart is not None:
            remaining = cooldown - (now - last_restart)
            if remaining > 0:
                logger.warning(
                    "Threshold reached but cooldown active (%.0fs remaining) — skipping restart",
                    remaining,
                )
                continue

        logger.error(
            "EAE failure confirmed (%d timeouts) — restarting %s", eae_count, SERVICE
        )
        if _restart(dry_run):
            # Only a successful restart starts the cooldown and clears the
            # counter; a failed one is retried on the next timeout line.
            last_restart = now
            eae_count = 0
|
||||
|
||||
|
||||
def main() -> None:
    """Parse command-line options and run the watchdog until interrupted.

    Exits with status 1 when the chosen log file does not exist. A keyboard
    interrupt stops the watchdog with a log message instead of a traceback.
    """
    cli = argparse.ArgumentParser(description="Plex EAE watchdog")
    option_specs = (
        ("--log", {"type": Path, "default": DEFAULT_LOG, "help": "Plex server log path"}),
        ("--threshold", {"type": int, "default": 3, "help": "EAE timeouts before restart (default 3)"}),
        ("--cooldown", {"type": int, "default": 300, "help": "Seconds between restarts (default 300)"}),
        ("--dry-run", {"action": "store_true", "help": "Log what would happen without restarting"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    log_file = opts.log
    if not log_file.exists():
        logger.error("Log file not found: %s", log_file)
        sys.exit(1)

    try:
        watch(log_file, opts.threshold, opts.cooldown, opts.dry_run)
    except KeyboardInterrupt:
        logger.info("Watchdog stopped")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue