1 | #!/bin/bash |
---|
2 | #(C)2007 DatuX. All rights reserved. |
---|
3 | #Automatic hard disk SMART monitoring via the Syn-3 monitoring framework. |
---|
4 | |
---|
# Make sure the configuration file exists, then load it.
# The file is created only when it is missing, so its modification time is not
# bumped on every run, and it is sourced only when it is actually readable
# (a failed mkdir/touch no longer leads to a "No such file" error from source).
mkdir -p /etc/smart
[[ -e /etc/smart/config ]] || touch /etc/smart/config
if [[ -r /etc/smart/config ]]; then
  source /etc/smart/config
fi
---|
8 | |
---|
# Tell whether the self-test is disabled for a device.
# Globals:   reads SELFTEST_DISABLED_<id>; writes SELFTEST_DISABLED with the
#            value that was found (empty when unset or when the id is invalid).
# Arguments: $1 - device id, used as the suffix of the variable name.
# Returns:   0 if SELFTEST_DISABLED_<id> is "1", 1 otherwise.
is_selftest_disabled()
{
  local flag_name="SELFTEST_DISABLED_${1}"

  # Only look up well-formed variable names. The former eval-based lookup
  # would execute any shell syntax embedded in the id.
  if [[ ! "$flag_name" =~ ^[A-Za-z0-9_]+$ ]]; then
    SELFTEST_DISABLED=
    return 1
  fi

  # Indirect expansion: read the variable whose name is stored in flag_name.
  SELFTEST_DISABLED=${!flag_name}
  [[ "$SELFTEST_DISABLED" == "1" ]]
}
---|
18 | |
---|
# Tell whether monitoring is disabled for a device.
# Globals:   reads MONITORING_DISABLED_<id>; writes MONITORING_DISABLED with
#            the value that was found (empty when unset or when the id is
#            invalid).
# Arguments: $1 - device id, used as the suffix of the variable name.
# Returns:   0 if MONITORING_DISABLED_<id> is "1", 1 otherwise.
is_monitoring_disabled()
{
  local flag_name="MONITORING_DISABLED_${1}"

  # Only look up well-formed variable names. The former eval-based lookup
  # would execute any shell syntax embedded in the id.
  if [[ ! "$flag_name" =~ ^[A-Za-z0-9_]+$ ]]; then
    MONITORING_DISABLED=
    return 1
  fi

  # Indirect expansion: read the variable whose name is stored in flag_name.
  MONITORING_DISABLED=${!flag_name}
  [[ "$MONITORING_DISABLED" == "1" ]]
}
---|
28 | |
---|
# Run the smartctl command line in SMARTCMD and log the result via syn3-state.
# Globals:
#   SMARTCMD (read) - smartctl command line including device and -d option.
#                     It is expanded unquoted on purpose so that it splits
#                     into separate words.
#   FACILITY (read) - facility name handed to syn3-state / syn3-graph.
#   LIST     (read) - when "1", print "<id> <facility> <info>" and stop.
#   IDS      (read/written) - space separated ids of drives already handled.
# Returns: the exit status of "smartctl -i" when the drive cannot be queried
#          this way; 0 once the drive has been handled (or was seen before).
run_smartctl()
{
  local info id output status temperature

  # Is this smart drive accessible this way?
  # NOTE(review): /tmp/smart.$$ is a predictable name; it is kept because the
  # end of the script removes exactly this path.
  $SMARTCMD -i > "/tmp/smart.$$" 2>/dev/null || return $?

  # Yes, so collect a one-line description and a stable id for the drive.
  info=$(grep -E '(Device Model:|Device:|Model Family:|Serial Number:)' "/tmp/smart.$$" \
    | sed -r 's/ +/ /g' | tr "\n" " ")
  id=$(grep -E '(^Device:|Serial )' "/tmp/smart.$$" | md5sum)
  id=${id%% *} # md5sum prints "<hash>  -"; keep only the hash

  # Did we already see this disk?
  # (Sometimes disks are accessible in more than one way, for example with
  # Smart Array controllers.)
  case " $IDS " in
    *" $id "*) return 0 ;;
  esac
  IDS="$IDS $id"

  # Re-enable SMART if this is not yet the case.
  $SMARTCMD -s on &>/dev/null

  # Get the current status; smartctl's exit status is a bit mask that is
  # analysed further down.
  output=$($SMARTCMD -H -i -A -l error -l selftest -q errorsonly 2>&1)
  status=$?

  if [[ -n "$output" ]]; then
    output="Extra output: [ $output ]"
  fi

  # List devices here.
  if [[ "$LIST" == 1 ]]; then
    printf '%s\n' "$id $FACILITY $info"
    return 0
  fi

  # Graph of temperature, if any. The value is taken from column 88 onwards
  # of the first line of "smartctl -A" that mentions Temperature.
  temperature=$($SMARTCMD -A | grep Temperature | head -1 | cut -c88- | grep -E -o '^[0-9]+')
  if [[ -n "$temperature" ]]; then
    #TODO: add temperature_max_alert and caution?
    syn3-graphcreate "$FACILITY" --step=60 --use=MAX temperature=GAUGE:120:0:U
    syn3-graph "$FACILITY" temperature="$temperature"
  fi

  # If monitoring is disabled in the config file, skip the current device.
  if is_monitoring_disabled "$id"; then
    syn3-state "$FACILITY" DELETE
    return 0
  fi

  # Analyse the status bit mask, most severe condition first.
  if (( status & 8 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is FAILING! $output"
  elif (( status & 16 )); then
    syn3-state "$FACILITY" ALERT "Harddisk $info is almost failing. $output"
  elif (( status & 32 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info has almost failed in the past. $output"
  elif (( status & 128 )); then
    syn3-state "$FACILITY" CAUTION "Harddisk $info selftest failed. $output"
  elif (( status & 64 )); then
    syn3-state "$FACILITY" OK "Harddisk $info is healty, but has logged errors: $output"
  elif (( status & 2 )); then
    syn3-state "$FACILITY" DELETE #note: this cant be called anymore
  elif [[ "$status" == 0 ]]; then
    syn3-state "$FACILITY" OK "Hardisk $info is healty. $output"
  else
    syn3-state "$FACILITY" CAUTION "Hardisk $info has unknown SMART-status. $output"
  fi

  # If the self-test is disabled in the config file, skip the current device.
  if is_selftest_disabled "$id"; then
    return 0
  fi

  # Start a long self-test at midnight.
  if [[ "$(date +%H%M)" == "0000" ]]; then
    $SMARTCMD -t long &>/dev/null
  fi

  # The drive was handled; a self-test that could not be started must not make
  # the caller believe the drive is inaccessible through this command line.
  return 0
}
---|
108 | |
---|
# "--list" as first argument switches run_smartctl to listing mode: detected
# drives are printed instead of being reported to the monitoring framework.
if [[ "$1" == "--list" ]]; then
  LIST=1
fi

# Traverse all block devices.
# Without this guard a failed cd would make the loop walk whatever directory
# the script happened to be started in.
cd /sys/block || exit 1
for SYS in *; do
  # A "!" in the sysfs entry name stands for a "/" in the /dev path.
  DEV="/dev/${SYS//\!//}"
  dev_name=${DEV##*/} # last path component, used in the facility name

  #range=1 (things like dm, loop, fd etc)?
  [[ "$(cat "$SYS/range")" == "1" ]] && continue

  # Try the default first (this works for ata and scsi).
  SMARTCMD="smartctl $DEV"
  FACILITY="SMART-$dev_name"
  run_smartctl && continue

  # SAT (same facility name as the default attempt).
  SMARTCMD="smartctl $DEV -d sat"
  run_smartctl && continue

  # Try cciss Smart Array controller mode first, then 3ware controller mode.
  # Every drive slot 0-15 is probed; the second controller type is only tried
  # when the first one did not answer for any slot.
  for controller in cciss 3ware; do
    FOUND=
    for DRIVE in {0..15}; do
      SMARTCMD="smartctl $DEV -d $controller,$DRIVE"
      FACILITY="SMART-$dev_name-$DRIVE"
      run_smartctl && FOUND=1
    done
    [[ -n "$FOUND" ]] && break
  done
done

# Remove the scratch file used by run_smartctl; -f because it does not exist
# when no device was probed at all.
rm -f "/tmp/smart.$$"
---|
152 | |
---|