From 0fd9c340589362a6a32bd595d8216a3b13248ada Mon Sep 17 00:00:00 2001 From: Per Cederqvist <ceder@lysator.liu.se> Date: Tue, 9 Jan 2007 14:19:02 +0000 Subject: [PATCH] Added check_nfs_server. --- Makefile | 3 +- check_nfs_server | 134 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) create mode 100755 check_nfs_server diff --git a/Makefile b/Makefile index 2463a59..99179fd 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ LIBEXECDIR = /usr/local/nagios/libexec/ SCRIPTS = check_cups check_glsa check_saned check_lpd check_hddtemp \ check_link_status check_true check_lysrdiff check_syslog \ check_ping check_enodia_monitored check_hostextinfo \ - check_hydra check_datorhandbok check_no_server check_iostatE + check_hydra check_datorhandbok check_no_server check_iostatE \ + check_nfs_server all:; diff --git a/check_nfs_server b/check_nfs_server new file mode 100755 index 0000000..bb45a3a --- /dev/null +++ b/check_nfs_server @@ -0,0 +1,134 @@ +#!/usr/bin/env python + +# Read the last few messages from all syslog files and look for lines +# matching "NFS server (.*) not responding" that are not followed by +# "NFS server \1 OK". Report any problems found in an aggregated way. +# +# This script assumes that syslog log files are created using the +# following hierarchy: +# +# /misc/syslogs/2006-12/2006-12-18/sellafield-130.236.254.103 +# +# where 2006 is a year, 12 a month, 18 a day, and sellafield a +# hostname with its IP address appended. 
import time
import os
import sys
import re

try:
    set
except NameError:
    # Python < 2.4 has no builtin set type; fall back to the sets module.
    from sets import Set as set

# How many bytes at the end of each log file are inspected.
TAIL_BYTES = 10 * 1024

# Messages older than this many seconds are not considered recent.
MAX_AGE = 1800

bad_re = re.compile(r"NFS server ([^ ]*) not responding")
good_re = re.compile(r"NFS server ([^ ]*) OK")
time_re = re.compile(r"((?P<year>[1-9][0-9]{3})-"
                     r"(?P<mon>[0-9]{2})-"
                     r"(?P<day>[0-9]{2}) "
                     r"(?P<hour>[0-9]{2}):"
                     r"(?P<min>[0-9]{2}):"
                     r"(?P<sec>[0-9]{2}))")


def critical(msg):
    """Print a Nagios CRITICAL status line and exit with status 2."""
    print("CRITICAL - %s" % msg)
    sys.exit(2)


def warning(msg):
    """Print a Nagios WARNING status line and exit with status 1."""
    print("WARNING - %s" % msg)
    sys.exit(1)


def ok(msg):
    """Print a Nagios OK status line and exit with status 0."""
    print("OK - %s" % msg)
    sys.exit(0)


def unknown(msg):
    """Print a Nagios UNKNOWN status line and exit with status 3."""
    print("UNKNOWN - %s" % msg)
    sys.exit(3)


def dirname(y, m, d):
    """Return the syslog directory for year y, month m and day d."""
    return "/misc/syslogs/%04d-%02d/%04d-%02d-%02d" % (
        y, m, y, m, d)


def checkfile(fn, now):
    """Scan the tail of log file fn for unresolved NFS complaints.

    Only the last TAIL_BYTES bytes of the file are read, and only lines
    that start with a "YYYY-MM-DD HH:MM:SS" timestamp no older than
    MAX_AGE seconds relative to now (seconds since the epoch) are
    considered.

    Returns a dictionary mapping NFS server name to the number of
    "not responding" messages seen for it.  A later "OK" message for a
    server removes that server from the result.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    res = {}

    # Binary mode: end-relative seeks are not supported on text files
    # in Python 3, and the tail may start in the middle of a character.
    fp = open(fn, "rb")
    try:
        filesize = os.fstat(fp.fileno()).st_size
        fp.seek(-min(filesize, TAIL_BYTES), 2)

        # Discard the first (possibly incomplete) line.
        if fp.tell() != 0:
            fp.readline()

        for line in fp:
            if not isinstance(line, str):
                # Python 3 yields bytes here.  latin-1 can decode any
                # byte sequence, so odd bytes never abort the check.
                line = line.decode("latin-1")

            m = time_re.match(line)
            if m is None:
                continue

            try:
                t = time.mktime(time.strptime(m.group(1),
                                              "%Y-%m-%d %H:%M:%S"))
            except ValueError:
                # Looks like a timestamp but is not a valid date.
                continue

            # Skip messages that are too old to count as recent.
            if now - t > MAX_AGE:
                continue

            bad = bad_re.search(line)
            if bad is not None:
                res[bad.group(1)] = res.get(bad.group(1), 0) + 1
                continue

            good = good_re.search(line)
            if good is not None:
                # The server recovered.  It may never have been
                # reported as bad within the part of the file we read.
                res.pop(good.group(1), None)
    finally:
        fp.close()

    return res


def derive_hostname(fn):
    """Strip the "-<ip address>" suffix from log file name fn.

    Returns fn unchanged if it does not end in a dash followed by
    digits and dots.
    """
    m = re.match(r"(.*)-[0-9.]+$", fn)
    if m is None:
        return fn
    return m.group(1)


def format_report(complaints, complaining_hosts, all_hosts):
    """Build the human-readable summary of the NFS problems.

    complaints maps NFS server name to total complaint count,
    complaining_hosts maps NFS server name to the set of clients that
    complained about it, and all_hosts is the set of every complaining
    client.  Client names are appended only while the message stays
    reasonably short.
    """
    servers = list(complaints.keys())
    servers.sort()  # Stable output between runs.

    parts = []
    for server in servers:
        parts.append("%s (%d hosts complains %d times)" % (
            server, len(complaining_hosts[server]), complaints[server]))
    msg = ', '.join(parts) + "."

    if len(msg) < 180:
        clients = list(all_hosts)
        clients.sort()

        msg += " Complaining hosts: "
        need_comma = False
        for client in clients:
            if len(msg) > 200:
                msg += " and others."
                break
            if need_comma:
                msg += ", "
            msg += client
            need_comma = True
        else:
            # Every client was listed; terminate the sentence.
            msg += "."

    return msg


def checkall():
    """Check today's syslog files and exit with a Nagios status.

    Exits OK when no file reports an unresolved NFS problem, CRITICAL
    with an aggregated summary otherwise, and UNKNOWN when the log
    directory or one of its files cannot be read.
    """
    now = time.time()
    tm = time.localtime(now)
    dn = dirname(tm.tm_year, tm.tm_mon, tm.tm_mday)

    complaints = {}
    complaining_hosts = {}
    all_hosts = set()

    try:
        for fn in os.listdir(dn):
            client = derive_hostname(fn)
            found = checkfile(os.path.join(dn, fn), now)
            for server, count in found.items():
                complaints[server] = complaints.get(server, 0) + count
                complaining_hosts.setdefault(server, set()).add(client)
                all_hosts.add(client)
    except (IOError, OSError):
        # sys.exc_info() keeps this working on both Python 2 and 3.
        unknown("Cannot read syslog files in %s: %s" % (
            dn, sys.exc_info()[1]))

    if not complaints:
        ok("No recent NFS issues found")

    critical("Bad NFS servers: "
             + format_report(complaints, complaining_hosts, all_hosts))


if __name__ == '__main__':
    checkall()