#!/bin/bash
#(C)2007 DatuX. All rights reserved.
#Automatic harddisk smart monitoring via Syn-3 monitoring framework.
#
#Usage: <this script> [--list]
#  --list   print "<disk-id> <facility> <info>" for every detected disk
#           instead of reporting graphs/state.
#
#Per-disk switches are read from /etc/smart/config as shell variables, keyed
#by the disk id that --list prints:
#  MONITORING_DISABLED_<disk-id>=1   delete/skip the state of this disk
#  SELFTEST_DISABLED_<disk-id>=1     do not start the midnight long self-test

mkdir -p /etc/smart
touch /etc/smart/config
if [[ -r /etc/smart/config ]]; then
  source /etc/smart/config
fi

#######################################
# Look up a per-disk on/off switch from the sourced config.
# Arguments: $1 - variable prefix, $2 - disk id
# Returns:   0 if the variable <prefix>_<disk id> equals "1", 1 otherwise
#######################################
_smart_flag_is_set() {
  local name="${1}_${2}"
  # Only a plain identifier may be dereferenced. This replaces the former
  # eval, which would have executed anything smuggled in through the id.
  [[ "$name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1
  [[ "${!name:-}" == "1" ]]
}

#return 0 if selftest of a device is disabled.
#Arguments: $1 - disk id
is_selftest_disabled() {
  _smart_flag_is_set SELFTEST_DISABLED "$1"
}

#return 0 if monitoring of a device is disabled.
#Arguments: $1 - disk id
is_monitoring_disabled() {
  _smart_flag_is_set MONITORING_DISABLED "$1"
}

#######################################
# runs a smartctl commandline and syn3-state logs result
# Globals:
#   SMARTCMD  (read)    array: smartctl plus device and -d arguments
#   FACILITY  (read)    name used for syn3-state / syn3-graph
#   SMART_TMP (read)    private temp file holding the "smartctl -i" output
#   LIST      (read)    "1" when only a device listing is wanted
#   IDS       (written) space separated ids of the disks handled so far
# Returns:
#   the exit status of "smartctl -i" when the drive cannot be queried through
#   SMARTCMD, 0 otherwise.
#######################################
run_smartctl() {
  local info disk_key output status temperature

  #is this smart drive accesible this way?
  "${SMARTCMD[@]}" -i > "$SMART_TMP" 2>/dev/null || return $?

  #yes, so get info
  info=$(grep -E '(Device Model:|Device:|Model Family:|Serial Number:)' "$SMART_TMP" \
    | sed -r 's/ +/ /g' \
    | tr '\n' ' ')
  #the disk id is the md5 of the identifying lines; it is also the suffix of
  #the per-disk config variables.
  disk_key=$(grep -E '(^Device:|Serial )' "$SMART_TMP" | md5sum | cut -d ' ' -f1)

  #we already saw this disk?
  #(sometimes disks are accesible in more then one way, for example with
  #smart array controlers)
  case " $IDS " in
    *" $disk_key "*) return 0 ;;
  esac
  IDS="$IDS $disk_key"

  #re-enable smart if this is not yet the case:
  "${SMARTCMD[@]}" -s on &>/dev/null

  #get current status; the exit status is a bit mask that is decoded below
  output=$("${SMARTCMD[@]}" -H -i -A -l error -l selftest -q errorsonly 2>&1)
  status=$?
  if [[ -n "$output" ]]; then
    output="Extra output: [ $output ]"
  fi

  #List devices here
  if [[ "${LIST:-}" == 1 ]]; then
    printf '%s %s %s\n' "$disk_key" "$FACILITY" "$info"
    return 0
  fi

  #graph of temperature, if any
  #NOTE(review): relies on the raw value starting at column 88 of the
  #attribute table - verify against the smartctl version in use.
  temperature=$("${SMARTCMD[@]}" -A \
    | grep Temperature \
    | head -1 \
    | cut -c88- \
    | grep -E -o '^[0-9]+')
  if [[ -n "$temperature" ]]; then
    #TODO: add temperature_max_alert and caution?
    syn3-graphcreate "$FACILITY" --step=60 --use=MAX temperature=GAUGE:120:0:U
    syn3-graph "$FACILITY" temperature="$temperature"
  fi

  #if monitoring is disabled in config file, skip current device.
  if is_monitoring_disabled "$disk_key"; then
    syn3-state "$FACILITY" DELETE
    return 0
  fi

  #analyse status code we've got (most severe condition first):
  if (( status & 8 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is FAILING! $output"
  elif (( status & 16 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is almost failing. $output"
  elif (( status & 32 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info has almost failed in the past. $output"
  elif (( status & 128 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info selftest failed. $output"
  elif (( status & 64 )); then
    syn3-state "$FACILITY" OK "Harddisk $info is healty, but has logged errors: $output"
  elif (( status & 2 )); then
    #note: this cant be called anymore
    syn3-state "$FACILITY" DELETE
  elif (( status == 0 )); then
    syn3-state "$FACILITY" OK "Hardisk $info is healty. $output"
  else
    syn3-state "$FACILITY" CAUTION "Hardisk $info has unknown SMART-status. $output"
  fi

  #if selftest is disabled in config file, skip current device.
  if is_selftest_disabled "$disk_key"; then
    return 0
  fi

  #Start a long self-test at mid-night
  if [[ "$(date +%H%M)" == "0000" ]]; then
    "${SMARTCMD[@]}" -t long &>/dev/null
  fi

  #the drive was reachable, so report success even if starting the self-test
  #failed; otherwise the caller would keep probing other access modes.
  return 0
}

#######################################
# Probe every block device through the known smartctl access modes.
# Arguments: $1 - optional "--list"
#######################################
main() {
  local sys_path sys_name range_file dev_name drive mode found

  if [[ "${1:-}" == "--list" ]]; then
    LIST=1
  fi

  #private, unpredictable temp file that is removed on every exit path
  SMART_TMP=$(mktemp "${TMPDIR:-/tmp}/smart.XXXXXX") || {
    printf 'cannot create temporary file\n' >&2
    exit 1
  }
  trap 'rm -f -- "$SMART_TMP"' EXIT

  #traverse all blockdevices
  for sys_path in /sys/block/*; do
    #unmatched glob (no /sys/block or no devices): nothing to do
    [[ -e "$sys_path" ]] || continue

    sys_name=${sys_path##*/}
    #sysfs uses '!' where the device node has a '/' (e.g. cciss!c0d0)
    DEV=/dev/${sys_name//!//}
    dev_name=${DEV##*/}

    #range=1 (things like dm, loop, fd etc)?
    range_file="$sys_path/range"
    if [[ -r "$range_file" && "$(<"$range_file")" == "1" ]]; then
      continue
    fi

    #try default first (this works for ata and scsi):
    SMARTCMD=(smartctl "$DEV")
    FACILITY="SMART-$dev_name"
    run_smartctl && continue

    #SAT
    SMARTCMD=(smartctl "$DEV" -d sat)
    FACILITY="SMART-$dev_name"
    run_smartctl && continue

    #try cciss Smart Array controller mode, then 3ware controller mode;
    #each exposes up to 16 drives behind one block device.
    for mode in cciss 3ware; do
      found=
      for (( drive = 0; drive <= 15; drive++ )); do
        SMARTCMD=(smartctl "$DEV" -d "$mode,$drive")
        FACILITY="SMART-$dev_name-$drive"
        run_smartctl && found=1
      done
      [[ -n "$found" ]] && break
    done
  done
}

main "$@"