1 | #!/usr/bin/perl |
---|
2 | # |
---|
3 | # $Id: dspam_train.in,v 1.01 2010/08/12 02:03:35 sbajic Exp $ |
---|
4 | |
---|
5 | use strict; |
---|
6 | use vars qw { $USER %CONFIG $SPAM_CORPUS $NONSPAM_CORPUS }; |
---|
7 | |
---|
8 | $CONFIG{'SHOW_SUBJECTS'} = 0; |
---|
9 | $CONFIG{'DSPAM_BINARY'} = '@bindir@/@dspam_transformed@'; |
---|
10 | $CONFIG{'BINDIR'} = '@bindir@'; |
---|
11 | |
---|
12 | ### DO NOT CONFIGURE ANYTHING BELOW THIS LINE |
---|
13 | |
---|
14 | $USER = shift; |
---|
15 | $SPAM_CORPUS = shift; |
---|
16 | if ($SPAM_CORPUS eq "--client" || $SPAM_CORPUS eq "-c") { |
---|
17 | $SPAM_CORPUS = shift; |
---|
18 | $CONFIG{'DSPAM_BINARY'} .= " --client"; |
---|
19 | } |
---|
20 | $NONSPAM_CORPUS = shift; |
---|
21 | |
---|
22 | if ($NONSPAM_CORPUS eq "") { |
---|
23 | # we were wrong about the first argument, it was the spam corpus and not |
---|
24 | # the user name in fact |
---|
25 | ($USER, $NONSPAM_CORPUS, $SPAM_CORPUS) = ((getpwuid($<))[0], $SPAM_CORPUS, $USER) |
---|
26 | } |
---|
27 | |
---|
28 | sub usage { |
---|
29 | print STDERR "Usage: $0 [username] [--client] [[-i index]|[spam_corpus] [nonspam_corpus]]\n"; |
---|
30 | exit(-1); |
---|
31 | } |
---|
32 | |
---|
33 | print "Taking Snapshot...\n"; |
---|
34 | system("$CONFIG{'BINDIR'}/dspam_stats -r $USER"); |
---|
35 | if (($? >> 8) != 0) { |
---|
36 | print STDERR "Running dspam_stats for $USER failed.\n"; |
---|
37 | exit(-1); |
---|
38 | } |
---|
39 | |
---|
40 | if ($SPAM_CORPUS eq "-i") { |
---|
41 | TrainOnIndex($NONSPAM_CORPUS); |
---|
42 | } |
---|
43 | else { |
---|
44 | &Train("$NONSPAM_CORPUS", "$SPAM_CORPUS"); |
---|
45 | } |
---|
46 | |
---|
47 | FinishTraining(); |
---|
48 | |
---|
49 | |
---|
50 | sub TrainOnIndex { |
---|
51 | my($index) = @_; |
---|
52 | open(INDEX, "<$index") or die "Failed to open index file \"$index\": $!\n"; |
---|
53 | |
---|
54 | print "Training on $index index...\n"; |
---|
55 | while(<INDEX>) { |
---|
56 | chomp; |
---|
57 | my($class, $filename) = split(/\s+/); |
---|
58 | if ($class eq "ham" || $class eq "nonspam") { |
---|
59 | TestNonspam($filename); |
---|
60 | } elsif ($class eq "spam") { |
---|
61 | TestSpam($filename); |
---|
62 | } else { |
---|
63 | die "ERROR: Unknown class '$class'. Test Broken."; |
---|
64 | } |
---|
65 | } |
---|
66 | } |
---|
67 | |
---|
68 | sub Train { |
---|
69 | my($nonspam, $spam) = @_; |
---|
70 | my(@nonspam_corpus, @spam_corpus); |
---|
71 | my($msg); |
---|
72 | |
---|
73 | print "Training $nonspam / $spam corpora...\n"; |
---|
74 | |
---|
75 | @nonspam_corpus = GetFilesOrMessages($nonspam); |
---|
76 | @spam_corpus = GetFilesOrMessages($spam); |
---|
77 | |
---|
78 | while($#nonspam_corpus > -1 || $#spam_corpus > -1) { |
---|
79 | if ($#nonspam_corpus > -1) { |
---|
80 | my($count) = 0; |
---|
81 | |
---|
82 | # Process nonspam until balanced |
---|
83 | $msg = shift(@nonspam_corpus); |
---|
84 | TestNonspam($msg); |
---|
85 | if ($#spam_corpus > -1) { |
---|
86 | $count = ($#nonspam_corpus+1) / ($#spam_corpus+1); |
---|
87 | } |
---|
88 | for(1..$count-1) |
---|
89 | { |
---|
90 | $msg = shift(@nonspam_corpus); |
---|
91 | TestNonspam($msg); |
---|
92 | } |
---|
93 | } |
---|
94 | |
---|
95 | if ($#spam_corpus > -1) { |
---|
96 | my($count) = 0; |
---|
97 | # Process spam until balanced |
---|
98 | $msg = shift(@spam_corpus); |
---|
99 | TestSpam($msg); |
---|
100 | if ($#nonspam_corpus > -1) { |
---|
101 | $count = ($#spam_corpus+1) / ($#nonspam_corpus+1); |
---|
102 | } |
---|
103 | for(1..$count-1) |
---|
104 | { |
---|
105 | $msg = shift(@spam_corpus); |
---|
106 | TestSpam($msg); |
---|
107 | } |
---|
108 | } |
---|
109 | } |
---|
110 | } |
---|
111 | |
---|
112 | sub FinishTraining() { |
---|
113 | print "TRAINING COMPLETE\n"; |
---|
114 | print "\nTraining Snapshot:\n"; |
---|
115 | system("$CONFIG{'BINDIR'}/dspam_stats -S -s $USER"); |
---|
116 | if (($? >> 8) != 0) { |
---|
117 | print STDERR "Running dspam_stats for $USER failed.\n"; |
---|
118 | exit(-1); |
---|
119 | } |
---|
120 | print "\nOverall Statistics:\n"; |
---|
121 | system("$CONFIG{'BINDIR'}/dspam_stats -S $USER"); |
---|
122 | if (($? >> 8) != 0) { |
---|
123 | print STDERR "Running dspam_stats for $USER failed.\n"; |
---|
124 | exit(-1); |
---|
125 | } |
---|
126 | } |
---|
127 | |
---|
128 | sub GetFilesOrMessages { |
---|
129 | my ($corpus) = @_; |
---|
130 | if (-d $corpus) { |
---|
131 | return GetFiles($corpus); |
---|
132 | } |
---|
133 | elsif (-f $corpus) { |
---|
134 | return GetMessages($corpus); |
---|
135 | } |
---|
136 | else { |
---|
137 | die "Corpus \"$corpus\" must be either a MBOX file or a maildir directory.\n" |
---|
138 | } |
---|
139 | } |
---|
140 | |
---|
141 | sub GetFiles { |
---|
142 | my($corpus) = @_; |
---|
143 | my(@files); |
---|
144 | |
---|
145 | opendir(DIR, "$corpus") || die "$corpus: $!"; |
---|
146 | @files = grep(!/^\.\.?$/, readdir(DIR)); |
---|
147 | closedir(DIR); |
---|
148 | return map { $_ = "$corpus/" . $_ } @files; |
---|
149 | } |
---|
150 | |
---|
151 | sub GetMessages { |
---|
152 | my ($mbox) = @_; |
---|
153 | |
---|
154 | die "Please install Mail::MboxParser module if you want to be able to use " . |
---|
155 | "MBOX files for training.\n" unless eval { require Mail::MboxParser; }; |
---|
156 | |
---|
157 | # filter out special pseudo messages used by Pine/UW-IMAPd |
---|
158 | return grep { $_->header->{subject} ne "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA" } |
---|
159 | Mail::MboxParser->new($mbox)->get_messages(); |
---|
160 | } |
---|
161 | |
---|
162 | sub TestNonspam { |
---|
163 | my($msg) = @_; |
---|
164 | TestAny($msg, "nonspam", "Innocent", "Whitelisted", "innocent", "fp") |
---|
165 | } |
---|
166 | |
---|
167 | sub TestSpam { |
---|
168 | my($msg) = @_; |
---|
169 | TestAny($msg, "spam ", "Spam", "Blacklisted", "spam", "fn") |
---|
170 | } |
---|
171 | |
---|
172 | sub TestAny { |
---|
173 | my($msg, $testname, $ok1, $ok2, $dspam_class, $short_class) = @_; |
---|
174 | my $response; |
---|
175 | print "[test: $testname] " . substr($msg . " " x 32, 0, 32) . " result: "; |
---|
176 | my $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout"; |
---|
177 | if ( -f $msg ) { |
---|
178 | $response = `$cmd < '$msg'`; |
---|
179 | } |
---|
180 | else { |
---|
181 | use FileHandle; |
---|
182 | use IPC::Open2; |
---|
183 | my ($dspam_in, $dspam_out); |
---|
184 | my $pid = open2($dspam_out, $dspam_in, $cmd); |
---|
185 | print $dspam_in $msg->as_string(); |
---|
186 | close $dspam_in; |
---|
187 | $response = join('', <$dspam_out>); |
---|
188 | waitpid $pid, 0 |
---|
189 | } |
---|
190 | |
---|
191 | my $code = "UNKNOWN"; |
---|
192 | if ($response =~ /class="(\S+)"/i) { |
---|
193 | $code = $1; |
---|
194 | } |
---|
195 | if ($code eq "UNKNOWN") { |
---|
196 | # print "\n===== WOAH THERE =====\n"; |
---|
197 | # print "I was unable to parse the result. Test Broken.\n"; |
---|
198 | # print "======================\n"; |
---|
199 | # exit(0); |
---|
200 | print "BROKEN result!!\n"; |
---|
201 | return; |
---|
202 | } |
---|
203 | |
---|
204 | if ($code eq $ok1 || $code eq $ok2) { |
---|
205 | print "PASS"; |
---|
206 | } else { |
---|
207 | my($class) = "UNKNOWN"; |
---|
208 | my($signature) = "UNKNOWN"; |
---|
209 | if ($response =~ /class="(\S+)"/i) { |
---|
210 | $class = $1; |
---|
211 | } else { |
---|
212 | print "BROKEN class!!\n"; |
---|
213 | return; |
---|
214 | } |
---|
215 | |
---|
216 | if ($response =~ /signature=(\S+)/i) { |
---|
217 | $signature = $1; |
---|
218 | } else { |
---|
219 | # print "\n===== WOAH THERE =====\n"; |
---|
220 | # print "I was unable to find the DSPAM signature. Test Broken.\n"; |
---|
221 | # print "======================\n"; |
---|
222 | # print "\n$response\n"; |
---|
223 | # exit(0); |
---|
224 | print "BROKEN signature!!\n"; |
---|
225 | return; |
---|
226 | } |
---|
227 | |
---|
228 | print "FAIL ($class)"; |
---|
229 | if ($CONFIG{'SHOW_SUBJECTS'} == 1) { |
---|
230 | print "\n\t[$short_class] "; |
---|
231 | if ( -f $msg ) { |
---|
232 | open(FILE, "<$msg"); |
---|
233 | while(<FILE>) { |
---|
234 | if (/^Subject:/i) { |
---|
235 | chomp; |
---|
236 | print $_; |
---|
237 | close(FILE); |
---|
238 | } |
---|
239 | } |
---|
240 | close(FILE); |
---|
241 | } |
---|
242 | else { |
---|
243 | print $msg->header->{subject} |
---|
244 | } |
---|
245 | } |
---|
246 | open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=$dspam_class " . |
---|
247 | "--source=error --signature=$signature"); |
---|
248 | close(TRAIN); |
---|
249 | } |
---|
250 | print "\n"; |
---|
251 | } |
---|