[c5c522c] | 1 | #!/bin/bash |
---|
| 2 | #(C)2007 DatuX. All rights reserved. |
---|
| 3 | #Automatic harddisk smart monitoring via Syn-3 monitoring framework. |
---|
| 4 | |
---|
| 5 | mkdir -p /etc/smart |
---|
| 6 | touch /etc/smart/config |
---|
| 7 | source /etc/smart/config |
---|
| 8 | |
---|
# Tell whether the self-test is disabled for a device.
#
# The switch is a shell variable named SELFTEST_DISABLED_<id>; the value "1"
# means "disabled", anything else (or no variable at all) means "enabled".
#
# Globals:
#   SELFTEST_DISABLED_<id> (read)  - per-device switch
#   SELFTEST_DISABLED      (write) - receives the looked-up value
# Arguments:
#   $1 - device id, used as the suffix of the variable name
# Returns:
#   0 if the self-test of the device is disabled, 1 otherwise.
is_selftest_disabled()
{
  local var_name="SELFTEST_DISABLED_${1}"

  # Indirect expansion instead of eval: the id is only ever used as a
  # variable name and can never be executed as shell code. An id that does
  # not form a plain variable name cannot have a switch, so it counts as
  # "not disabled".
  SELFTEST_DISABLED=
  if [[ "$var_name" =~ ^[A-Za-z0-9_]+$ ]]; then
    SELFTEST_DISABLED=${!var_name}
  fi

  [[ "$SELFTEST_DISABLED" == "1" ]]
}
---|
| 18 | |
---|
# Tell whether monitoring is disabled for a device.
#
# The switch is a shell variable named MONITORING_DISABLED_<id>; the value "1"
# means "disabled", anything else (or no variable at all) means "enabled".
#
# Globals:
#   MONITORING_DISABLED_<id> (read)  - per-device switch
#   MONITORING_DISABLED      (write) - receives the looked-up value
# Arguments:
#   $1 - device id, used as the suffix of the variable name
# Returns:
#   0 if monitoring of the device is disabled, 1 otherwise.
is_monitoring_disabled()
{
  local var_name="MONITORING_DISABLED_${1}"

  # Indirect expansion instead of eval: the id is only ever used as a
  # variable name and can never be executed as shell code. An id that does
  # not form a plain variable name cannot have a switch, so it counts as
  # "not disabled".
  MONITORING_DISABLED=
  if [[ "$var_name" =~ ^[A-Za-z0-9_]+$ ]]; then
    MONITORING_DISABLED=${!var_name}
  fi

  [[ "$MONITORING_DISABLED" == "1" ]]
}
---|
| 28 | |
---|
# Run one smartctl command line against a disk and log the result through
# syn3-state (plus a temperature graph through syn3-graph when available).
#
# Globals:
#   SMARTCMD (read)       - smartctl command line for the device; it is split
#                           on whitespace and every call appends its options
#   FACILITY (read)       - name handed to syn3-state / syn3-graph
#   LIST     (read)       - when "1", only print "<id> <facility> <info>"
#   IDS      (read/write) - checksums of the disks already handled
# Returns:
#   The exit status of the identify call (smartctl -i) when it fails,
#   0 in every other case.
run_smartctl()
{
  # NOTE(review): predictable name in /tmp. Kept as-is because the main
  # script removes exactly this path when it finishes.
  local info_file="/tmp/smart.$$"
  local -a smart_cmd
  local info id disk_id output status temperature

  # Split the command line once; unlike an unquoted expansion this never
  # glob-expands the device path.
  read -r -a smart_cmd <<< "$SMARTCMD"

  # Is this SMART drive accessible this way?
  "${smart_cmd[@]}" -i > "$info_file" 2>/dev/null || return $?

  # Yes, so collect a one-line description and an id for the disk.
  info=$(grep -E '(Device Model:|Device:|Model Family:|Serial Number:)' "$info_file" \
    | sed -r 's/ +/ /g' \
    | tr "\n" " ")
  id=$(grep -E '(^Device:|Serial )' "$info_file" | md5sum)
  disk_id=${id%% *}   # md5sum prints "<hash>  -"; keep the hash only

  # Did we already see this disk?
  # (Sometimes disks are accessible in more than one way, for example with
  # Smart Array controllers.)
  if grep -qF -- "$id" <<< "$IDS"; then
    return 0
  fi
  IDS="$IDS $id"

  # Re-enable SMART if this is not yet the case.
  "${smart_cmd[@]}" -s on &>/dev/null

  # Get the current status; the exit status is a bit mask (see smartctl(8)).
  output=$("${smart_cmd[@]}" -H -i -A -l error -l selftest -q errorsonly 2>&1)
  status=$?

  if [[ -n "$output" ]]; then
    output="Extra output: [ $output ]"
  fi

  # List devices here.
  if [[ "$LIST" == 1 ]]; then
    printf '%s\n' "$disk_id $FACILITY $info"
    return 0
  fi

  # Graph of the temperature, if any. The value is read from column 88
  # onwards of the first attribute line mentioning "Temperature".
  temperature=$("${smart_cmd[@]}" -A \
    | grep Temperature \
    | head -1 \
    | cut -c88- \
    | grep -E -o '^[0-9]+')
  if [[ -n "$temperature" ]]; then
    # TODO: add temperature_max_alert and caution?
    syn3-graphcreate "$FACILITY" --step=60 --use=MAX temperature=GAUGE:120:0:U
    syn3-graph "$FACILITY" temperature="$temperature"
  fi

  # If monitoring is disabled in the config file, skip the current device.
  if is_monitoring_disabled "$disk_id"; then
    syn3-state "$FACILITY" DELETE
    return 0
  fi

  # Analyse the status code we have got, most severe condition first.
  if (( status & 8 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is FAILING! $output"
  elif (( status & 16 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is almost failing. $output"
  elif (( status & 32 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info has almost failed in the past. $output"
  elif (( status & 128 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info selftest failed. $output"
  elif (( status & 64 )); then
    syn3-state "$FACILITY" OK "Harddisk $info is healty, but has logged errors: $output"
  elif (( status & 2 )); then
    # Not expected to be reached any more: the identify call above already
    # succeeded for this command line.
    syn3-state "$FACILITY" DELETE
  elif (( status == 0 )); then
    syn3-state "$FACILITY" OK "Hardisk $info is healty. $output"
  else
    syn3-state "$FACILITY" CAUTION "Hardisk $info has unknown SMART-status. $output"
  fi

  # If the self-test is disabled in the config file, skip the current device.
  if is_selftest_disabled "$disk_id"; then
    return 0
  fi

  # Start a long self-test at midnight.
  if [[ "$(date +%H%M)" == "0000" ]]; then
    "${smart_cmd[@]}" -t long &>/dev/null
  fi

  # The disk was reachable and has been reported. A self-test that could not
  # be started must not look like "device not accessible" to the caller,
  # which would otherwise probe the same device again in other modes.
  return 0
}
---|
| 108 | |
---|
# "--list" makes run_smartctl print the detected disks instead of reporting
# their state.
if [[ "$1" == "--list" ]]; then
  LIST=1
fi

# Traverse all block devices. Stop when /sys/block is unavailable: the glob
# below would otherwise run over whatever directory we happen to be in.
cd /sys/block || exit 1

for SYS in *; do
  # An unmatched glob stays a literal "*"; never probe that.
  [[ -e "$SYS" ]] || continue

  # The entry name has "!" where the path below /dev has "/".
  DEV="/dev/${SYS//!//}"
  BASE_FACILITY="SMART-${DEV##*/}"

  # range=1 (things like dm, loop, fd etc)?
  RANGE=
  [[ -r "$SYS/range" ]] && read -r RANGE < "$SYS/range"
  [[ "$RANGE" == "1" ]] && continue

  # Try the default first (this works for ATA and SCSI).
  SMARTCMD="smartctl $DEV"
  FACILITY="$BASE_FACILITY"
  run_smartctl && continue

  # SAT
  SMARTCMD="smartctl $DEV -d sat"
  FACILITY="$BASE_FACILITY"
  run_smartctl && continue

  # Try the controller modes, cciss Smart Array first and 3ware second. Each
  # mode is probed for drives 0..15; the first mode that finds at least one
  # drive ends the search for this device.
  for CONTROLLER in cciss 3ware; do
    FOUND=
    for DRIVE in {0..15}; do
      SMARTCMD="smartctl $DEV -d $CONTROLLER,$DRIVE"
      FACILITY="$BASE_FACILITY-$DRIVE"
      run_smartctl && FOUND=1
    done
    [[ -n "$FOUND" ]] && continue 2
  done
done

# The scratch file only exists when at least one device was probed.
rm -f "/tmp/smart.$$"
---|
| 152 | |
---|