Skip to content
Snippets Groups Projects
Commit 0fd9c340 authored by Per Cederqvist's avatar Per Cederqvist
Browse files

Added check_nfs_server.

parent 76120d5c
No related branches found
No related tags found
No related merge requests found
......@@ -2,7 +2,8 @@ LIBEXECDIR = /usr/local/nagios/libexec/
SCRIPTS = check_cups check_glsa check_saned check_lpd check_hddtemp \
check_link_status check_true check_lysrdiff check_syslog \
check_ping check_enodia_monitored check_hostextinfo \
check_hydra check_datorhandbok check_no_server check_iostatE
check_hydra check_datorhandbok check_no_server check_iostatE \
check_nfs_server
all:;
......
#!/usr/bin/env python
# Read the last few messages from all syslog files and look for lines
# matching "NFS server (.*) not responding" that are not followed by
# "NFS server \1 OK". Report any problems found in an aggregated way.
#
# This script assumes that syslog log files are created using the
# following hierarchy:
#
# /misc/syslogs/2006-12/2006-12-18/sellafield-130.236.254.103
#
# where 2006 is a year, 12 a month, 18 a day, and sellafield a
# hostname with its IP address appended.
import sets
import time
import os
import sys
import re
# Kernel complaint that an NFS server has stopped answering.  Group 1 is
# the server name (a run of non-space characters).
bad_re = re.compile("NFS server ([^ ]*) not responding")
# Matching recovery message for the same server.  Group 1 is the server
# name, in the same form as captured by bad_re.
good_re = re.compile("NFS server ([^ ]*) OK")
# Timestamp of the form "YYYY-MM-DD HH:MM:SS".  Group 1 is the complete
# timestamp; the individual fields are also available as the named groups
# year, mon, day, hour, min and sec.
time_re = re.compile("((?P<year>[1-9][0-9]{3})-"
"(?P<mon>[0-9]{2})-"
"(?P<day>[0-9]{2}) "
"(?P<hour>[0-9]{2}):"
"(?P<min>[0-9]{2}):"
"(?P<sec>[0-9]{2}))")
def critical(msg):
    """Report a CRITICAL result to Nagios and terminate the plugin.

    Prints "CRITICAL - <msg>" on standard output and exits with status 2.
    This function never returns; it raises SystemExit.
    """
    # The call form with a single argument prints the same text on both
    # Python 2 and Python 3; the 1-tuple keeps a tuple-valued msg from
    # being unpacked by the % operator.
    print("CRITICAL - %s" % (msg,))
    sys.exit(2)
def warning(msg):
    """Report a WARNING result to Nagios and terminate the plugin.

    Prints "WARNING - <msg>" on standard output and exits with status 1.
    This function never returns; it raises SystemExit.
    """
    # The call form with a single argument prints the same text on both
    # Python 2 and Python 3; the 1-tuple keeps a tuple-valued msg from
    # being unpacked by the % operator.
    print("WARNING - %s" % (msg,))
    sys.exit(1)
def ok(msg):
    """Report an OK result to Nagios and terminate the plugin.

    Prints "OK - <msg>" on standard output and exits with status 0.
    This function never returns; it raises SystemExit.
    """
    # The call form with a single argument prints the same text on both
    # Python 2 and Python 3; the 1-tuple keeps a tuple-valued msg from
    # being unpacked by the % operator.
    print("OK - %s" % (msg,))
    sys.exit(0)
def dirname(y, m, d):
    """Return the directory holding the syslog files for one day.

    y, m and d are the numeric year, month and day.  The result has the
    form /misc/syslogs/YYYY-MM/YYYY-MM-DD.
    """
    # The year and month appear twice in the path: once for the monthly
    # directory and once as the prefix of the daily directory.
    fields = (y, m) * 2 + (d,)
    return "/misc/syslogs/%04d-%02d/%04d-%02d-%02d" % fields
def checkfile(fn, now, max_age=1800, tail_bytes=10 * 1024):
    """Count unresolved "NFS server ... not responding" complaints in a log.

    Only the last tail_bytes bytes of the file are examined, and within
    that tail only lines that start with a "YYYY-MM-DD HH:MM:SS" timestamp
    no more than max_age seconds older than now.  A complaint about a
    server is forgotten again when a later "NFS server <name> OK" line for
    the same server is seen.

    Args:
        fn: path of the syslog file to scan.
        now: reference time in seconds since the epoch (as time.time()).
        max_age: lines older than this many seconds are ignored.
        tail_bytes: how much of the end of the file to read.

    Returns:
        A dict mapping each NFS server name that is still considered
        unresponsive to the number of complaints counted for it.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    res = {}
    # Binary mode: Python 3 refuses end-relative seeks on text-mode files.
    with open(fn, "rb") as fp:
        filesize = os.fstat(fp.fileno()).st_size
        fp.seek(-min(filesize, tail_bytes), 2)
        # Discard the first (possibly incomplete) line.
        if fp.tell() != 0:
            fp.readline()
        for line in fp:
            if not isinstance(line, str):
                # Python 3 yields bytes here, but the module-level regexes
                # are text patterns.  On Python 2 the line already is str.
                line = line.decode("utf-8", "replace")
            m = time_re.match(line)
            if m is None:
                continue
            try:
                t = time.mktime(
                    time.strptime(m.group(1), "%Y-%m-%d %H:%M:%S"))
            except (ValueError, OverflowError):
                # Looks like a timestamp but is not a valid date
                # (e.g. month 13); treat it like an unparsable line.
                continue
            # Only recent lines count; anything older than the window is
            # history and must not trigger an alarm.
            if now - t > max_age:
                continue
            bad = bad_re.search(line)
            if bad is not None:
                res[bad.group(1)] = res.get(bad.group(1), 0) + 1
                continue
            good = good_re.search(line)
            if good is not None:
                # The server recovered.  It may never have been recorded
                # as bad inside the window, hence pop() with a default.
                res.pop(good.group(1), None)
    return res
def derive_hostname(fn):
    """Return the hostname part of a log file name.

    Log files are named "<hostname>-<ip address>".  Everything before the
    last "-" that is directly followed by a digit or a dot is returned.
    If the name contains no such suffix it is returned unchanged.
    """
    matched = re.match("(.*)-[0-9.]", fn)
    if matched is not None:
        return matched.group(1)
    return fn
def _format_complaints(complaints, complaining_hosts, all_hosts):
    """Build the summary text that follows "Bad NFS servers: "."""
    parts = []
    # Sorted so that the same situation always yields the same text.
    for host in sorted(complaints):
        parts.append("%s (%d hosts complains %d times)" % (
            host, len(complaining_hosts[host]), complaints[host]))
    msg = ', '.join(parts) + "."
    if len(msg) >= 180:
        # Already long, and already terminated by a period: do not list
        # the clients and do not append a second period.
        return msg
    msg += " Complaining hosts: "
    need_comma = False
    for client in sorted(all_hosts):
        if len(msg) > 200:
            # Keep the status line short; Nagios output is a single line.
            msg += " and others."
            break
        if need_comma:
            msg += ", "
        msg += client
        need_comma = True
    return msg


def checkall():
    """Scan today's syslog directory and report NFS problems to Nagios.

    Every file in the directory for the current local date is checked with
    checkfile().  The per-file results are aggregated per NFS server: the
    total number of complaints and the set of client hosts (derived from
    the log file names) that complained.

    Exits through ok() when no complaints were found and through
    critical() otherwise, so this function never returns normally.

    NOTE: only the directory for today's date is read; files in
    yesterday's directory are not considered, even just after midnight.

    Raises:
        OSError: if today's directory cannot be listed.
    """
    now = time.time()
    tm = time.localtime(now)
    dn = dirname(tm.tm_year, tm.tm_mon, tm.tm_mday)

    complaints = {}         # NFS server -> total number of complaints
    complaining_hosts = {}  # NFS server -> set of complaining clients
    all_hosts = set()       # every client that complained about anything
    for fn in os.listdir(dn):
        client = derive_hostname(fn)
        found = checkfile(os.path.join(dn, fn), now)
        for host, count in found.items():
            complaints[host] = complaints.get(host, 0) + count
            complaining_hosts.setdefault(host, set()).add(client)
            all_hosts.add(client)

    if not complaints:
        ok("No recent NFS issues found")
    critical("Bad NFS servers: " + _format_complaints(
        complaints, complaining_hosts, all_hosts))
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    checkall()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment