network-watchdog/network-watchdog.sh
2025-05-03 11:40:49 +01:00

41 lines
1.2 KiB
Bash

#!/usr/bin/env bash
#
# network-watchdog: reboot this machine when a remote host has been
# unreachable for longer than a grace period.
#
# Usage: network-watchdog.sh <check target> [ <grace time seconds> ]
#
#   <check target>        host name or address handed to ping (required)
#   <grace time seconds>  how long the target may stay unreachable before
#                         reboot is invoked (default: 1800)
#
# Environment:
#   STATE_FILE  marker file created on the first failed check; its
#               modification time marks the start of the outage
#               (default: /var/run/network-watchdog.state)
#
# Each invocation performs a single check; state is carried between
# invocations through STATE_FILE. Every outcome is reported via logger
# under the tag "network-watchdog".
#
# 'set -e' is deliberately not enabled: a failing logger call must not stop
# the watchdog from evaluating the outage.

USAGE="Usage: $0 <check target> [ <grace time seconds> ]"
TAG=network-watchdog
REMOTE_CHECK=${1:?$USAGE}
GRACE_TIME=${2:-1800}
STATE_FILE=${STATE_FILE:-/var/run/$TAG.state}

# A non-numeric grace time would make the threshold comparison below fail
# with "integer expression expected" and the watchdog would silently never
# reboot, so reject it up front.
case $GRACE_TIME in
  '' | *[!0-9]*)
    printf '%s\n' "Grace time must be a whole number of seconds: '$GRACE_TIME'" "$USAGE" >&2
    logger -p local0.err -t "$TAG" "Invalid grace time '$GRACE_TIME' -- aborting"
    exit 2
    ;;
esac

if ! ping -c 4 -w 5 "$REMOTE_CHECK" >/dev/null; then
  # Failure
  logger -p local0.alert -t "$TAG" "Remote host $REMOTE_CHECK is unavailable"

  if [ ! -r "$STATE_FILE" ]; then
    # First failure of this outage: create the marker whose mtime starts the
    # grace period. Only claim success if the write actually succeeded.
    if printf '%s\n' "$REMOTE_CHECK unavailable since $(date -Iseconds)" >"$STATE_FILE"; then
      logger -p local0.info -t "$TAG" "Recorded failure in $STATE_FILE"
    else
      logger -p local0.err -t "$TAG" "Could not record failure in $STATE_FILE"
    fi
  elif ! STATE_FILE_TS=$(date -r "$STATE_FILE" '+%s') || [[ ! $STATE_FILE_TS =~ ^[0-9]+$ ]]; then
    # An empty timestamp would be treated as 0 by shell arithmetic, making
    # the outage look decades long and forcing an immediate reboot.
    logger -p local0.err -t "$TAG" "Could not read modification time of $STATE_FILE -- not rebooting"
  else
    NOW=$(date '+%s')
    ELAPSED=$(( NOW - STATE_FILE_TS ))
    logger -p local0.debug -t "$TAG" "Checking $STATE_FILE_TS & $NOW = $ELAPSED against gracetime=$GRACE_TIME"
    # '[' (not '[[' or '((') so operands are always read as decimal, even
    # when the grace time was given with leading zeros.
    if [ "$ELAPSED" -gt "$GRACE_TIME" ]; then
      logger -p local0.crit -t "$TAG" "Remote host $REMOTE_CHECK has been unavailable for too long -- rebooting"
      reboot
    else
      logger -p local0.notice -t "$TAG" "Not reached threshold yet"
    fi
  fi
else
  # Success
  if [ -r "$STATE_FILE" ]; then
    # Target recovered after an outage: clear the marker so the next outage
    # starts a fresh grace period.
    rm -f -- "$STATE_FILE" ||
      logger -p local0.err -t "$TAG" "Could not remove $STATE_FILE"
    logger -p local0.notice -t "$TAG" "Remote host $REMOTE_CHECK back to normal"
  else
    logger -p local0.info -t "$TAG" "Remote host $REMOTE_CHECK all ok"
  fi
fi