feat: plex EAE watchdog and plex_eae_failure pattern
Add plex_eae_failure pattern to default.yaml targeting the EasyAudioEncoder crash signature (EAE timeout + I/O error pair, 5s cadence). Pattern fires when EAE's WAV handoff files stop appearing in the pms temp directory. Add watch_plex.py: tail-based watchdog that counts EAE timeout events and auto-restarts plexmediaserver after N consecutive hits (default 3, ~15s of failure). Includes cooldown, dry-run mode, and a systemd unit template.
This commit is contained in:
parent
3e6eabb7ce
commit
e4a62c3505
2 changed files with 142 additions and 0 deletions
|
|
@ -82,6 +82,12 @@ patterns:
|
||||||
description: IP address change or DHCP event
|
description: IP address change or DHCP event
|
||||||
|
|
||||||
# Add device/service-specific patterns below this line:
|
# Add device/service-specific patterns below this line:
|
||||||
|
|
||||||
|
- name: plex_eae_failure
|
||||||
|
pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
|
||||||
|
severity: ERROR
|
||||||
|
description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
|
||||||
|
|
||||||
# - name: avcx_device_error
|
# - name: avcx_device_error
|
||||||
# pattern: "ERR-\d{4}"
|
# pattern: "ERR-\d{4}"
|
||||||
# severity: ERROR
|
# severity: ERROR
|
||||||
|
|
|
||||||
136
scripts/watch_plex.py
Normal file
136
scripts/watch_plex.py
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
"""Watchdog: detect Plex EAE (EasyAudioEncoder) failures and auto-restart.

The EAE daemon handles Dolby EAC3/DDP audio transcoding. When it crashes,
Plex logs 'EAE timeout!' at 5-second intervals until the service is restarted.

Run as root or a user with `systemctl restart plexmediaserver` permission.

Usage:
    python scripts/watch_plex.py [--log PATH] [--threshold N] [--cooldown SECS] [--dry-run]

Systemd unit (deploy to /etc/systemd/system/turnstone-plex-watchdog.service):
    [Unit]
    Description=Turnstone Plex EAE watchdog
    After=plexmediaserver.service

    [Service]
    ExecStart=/usr/bin/python3 /opt/turnstone/scripts/watch_plex.py
    Restart=always
    RestartSec=10

    [Install]
    WantedBy=multi-user.target
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Name of the systemd unit the watchdog restarts.
SERVICE = "plexmediaserver"

# Substring of a Plex log line that counts as one EAE failure event.
EAE_TRIGGER = "EAE timeout!"

# Log file tailed when --log is not given on the command line.
DEFAULT_LOG = (
    Path("/var/lib/plexmediaserver")
    / "Library"
    / "Application Support"
    / "Plex Media Server"
    / "Logs"
    / "Plex Media Server.log"
)

# Timestamped, level-tagged records for everything the watchdog reports.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("plex-watchdog")
|
||||||
|
|
||||||
|
|
||||||
|
def _restart(dry_run: bool) -> bool:
    """Restart the Plex service through ``systemctl``.

    Args:
        dry_run: When true, only log the command that would be executed.

    Returns:
        True if the restart succeeded (or was skipped because of a dry run);
        False if ``systemctl`` could not be launched, did not finish in time,
        or exited with a non-zero status. This function never raises for
        those failures, so the caller's watch loop keeps running.
    """
    cmd = ["systemctl", "restart", SERVICE]
    if dry_run:
        logger.info("[dry-run] would run: %s", " ".join(cmd))
        return True

    # Bound the call: a wedged systemctl must not stall the watchdog forever.
    timeout_secs = 180
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,  # never wait on an interactive prompt
            timeout=timeout_secs,
            check=False,
        )
    except subprocess.TimeoutExpired:
        logger.error("Restart timed out after %ds", timeout_secs)
        return False
    except OSError as exc:
        # systemctl missing from PATH, not executable, etc.
        logger.error("Restart failed: could not run %s: %s", cmd[0], exc)
        return False

    if result.returncode == 0:
        logger.info("Restart successful")
        return True

    logger.error("Restart failed (exit %d): %s", result.returncode, result.stderr.strip())
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _tail_f(path: Path, poll_interval: float = 0.25):
    """Yield lines appended to a log file, following it across rotation.

    Reading starts at the current end of the file, so existing history is not
    replayed. When no new data is available the generator sleeps for
    ``poll_interval`` seconds and then checks whether ``path`` now refers to a
    different file (rotated) or has shrunk (truncated in place); in either
    case reading resumes from the start of the current file. Without this
    check the tail would keep reading the old, no-longer-written file.

    Args:
        path: Log file to follow.
        poll_interval: Seconds to sleep between polls when there is no new data.

    Yields:
        Each line as ``str`` (decoded as UTF-8, undecodable bytes replaced),
        including its trailing newline. A line is only yielded once its
        newline has been written, so half-written lines are never split.

    Raises:
        OSError: If ``path`` cannot be opened when iteration starts.
    """
    import os  # local import: not part of the module's top-level imports

    def _open_current():
        handle = open(path, "rb")
        info = os.fstat(handle.fileno())
        return handle, (info.st_dev, info.st_ino)

    def _decode(raw: bytes) -> str:
        return raw.decode("utf-8", errors="replace")

    handle, file_id = _open_current()
    try:
        handle.seek(0, os.SEEK_END)  # start at end — don't replay history on startup
        pending = b""
        while True:
            chunk = handle.readline()
            if chunk:
                pending += chunk
                if pending.endswith(b"\n"):
                    yield _decode(pending)
                    pending = b""
                continue

            # At EOF: find out whether the path still refers to the open file.
            try:
                current = os.stat(path)
            except OSError:
                current = None  # path briefly missing, e.g. mid-rotation

            if current is not None and (current.st_dev, current.st_ino) != file_id:
                try:
                    new_handle, new_id = _open_current()
                except OSError:
                    time.sleep(poll_interval)
                    continue
                # Drain whatever is left in the old file before switching.
                leftover = pending + handle.read()
                pending = b""
                handle.close()
                handle, file_id = new_handle, new_id
                for raw in leftover.splitlines(keepends=True):
                    yield _decode(raw)
                continue

            if current is not None and current.st_size < handle.tell():
                # Truncated in place: everything now in the file is new.
                pending = b""
                handle.seek(0)
                continue

            time.sleep(poll_interval)
    finally:
        handle.close()
|
||||||
|
|
||||||
|
|
||||||
|
def watch(
    log_path: Path,
    threshold: int,
    cooldown: int,
    dry_run: bool,
    max_gap: float = 30.0,
) -> None:
    """Tail the Plex log and restart the service after repeated EAE timeouts.

    Each line containing ``EAE_TRIGGER`` increments a counter. Once the counter
    reaches ``threshold`` and the cooldown since the last successful restart
    has elapsed, the service is restarted and the counter is cleared. If the
    restart fails the counter is kept, so the next timeout retries.

    Timeouts only count as consecutive when they arrive within ``max_gap``
    seconds of the previous one; otherwise the counter starts over. This keeps
    isolated timeouts spread over hours or days from adding up to a restart.

    Args:
        log_path: Plex server log file to follow.
        threshold: Number of consecutive EAE timeouts that triggers a restart.
        cooldown: Minimum seconds between two restarts.
        dry_run: Log the restart command instead of running it.
        max_gap: Largest gap in seconds between two timeouts that still counts
            as the same failure streak. Pass ``float("inf")`` to count
            timeouts cumulatively.

    This function loops for as long as the log tail keeps yielding lines.
    """
    logger.info(
        "Watching %s | threshold=%d EAE timeouts | cooldown=%ds%s",
        log_path, threshold, cooldown, " | DRY RUN" if dry_run else "",
    )

    eae_count = 0
    # Monotonic timestamps are immune to wall-clock steps (NTP, manual changes).
    # -inf means "never", so the first restart is not blocked by the cooldown.
    last_hit = float("-inf")
    last_restart = float("-inf")

    for raw_line in _tail_f(log_path):
        line = raw_line.strip()

        # NOTE(review): these look like systemd journal messages — confirm they
        # actually appear in the Plex log file being tailed.
        if "Started plexmediaserver" in line or "Starting plexmediaserver" in line:
            if eae_count > 0:
                logger.info("Service (re)started — resetting EAE counter")
            eae_count = 0
            continue

        if EAE_TRIGGER not in line:
            continue

        now = time.monotonic()
        if eae_count > 0 and now - last_hit > max_gap:
            logger.info(
                "Previous EAE timeouts are older than %.0fs — resetting EAE counter",
                max_gap,
            )
            eae_count = 0
        last_hit = now

        eae_count += 1
        logger.warning("EAE timeout #%d detected", eae_count)

        if eae_count < threshold:
            continue

        remaining = cooldown - (now - last_restart)
        if remaining > 0:
            logger.warning(
                "Threshold reached but cooldown active (%.0fs remaining) — skipping restart",
                remaining,
            )
            continue

        logger.error(
            "EAE failure confirmed (%d timeouts) — restarting %s", eae_count, SERVICE
        )
        if _restart(dry_run):
            last_restart = now
            eae_count = 0
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(description="Plex EAE watchdog")
|
||||||
|
parser.add_argument("--log", type=Path, default=DEFAULT_LOG, help="Plex server log path")
|
||||||
|
parser.add_argument("--threshold", type=int, default=3, help="EAE timeouts before restart (default 3)")
|
||||||
|
parser.add_argument("--cooldown", type=int, default=300, help="Seconds between restarts (default 300)")
|
||||||
|
parser.add_argument("--dry-run", action="store_true", help="Log what would happen without restarting")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if not args.log.exists():
|
||||||
|
logger.error("Log file not found: %s", args.log)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
watch(args.log, args.threshold, args.cooldown, args.dry_run)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logger.info("Watchdog stopped")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point. main() returns None, so a normal return exits with status 0.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
Loading…
Reference in a new issue