source: npl/mailserver/dspam/dspam-3.10.2/src/tools/dspam_train.in

Last change on this file was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100755
File size: 6.7 KB
Line 
1#!/usr/bin/perl
2#
3# $Id: dspam_train.in,v 1.01 2010/08/12 02:03:35 sbajic Exp $
4
5use strict;
6use vars qw { $USER %CONFIG $SPAM_CORPUS $NONSPAM_CORPUS };
7
8$CONFIG{'SHOW_SUBJECTS'} = 0;
9$CONFIG{'DSPAM_BINARY'}  = '@bindir@/@dspam_transformed@';
10$CONFIG{'BINDIR'}        = '@bindir@';
11
12### DO NOT CONFIGURE ANYTHING BELOW THIS LINE
13
14$USER           = shift;
15$SPAM_CORPUS    = shift;
16if ($SPAM_CORPUS eq "--client" || $SPAM_CORPUS eq "-c") {
17  $SPAM_CORPUS = shift;
18  $CONFIG{'DSPAM_BINARY'} .= " --client";
19}
20$NONSPAM_CORPUS = shift;
21
22if ($NONSPAM_CORPUS eq "") {
23    # we were wrong about the first argument, it was the spam corpus and not
24    # the user name in fact
25    ($USER, $NONSPAM_CORPUS, $SPAM_CORPUS) = ((getpwuid($<))[0], $SPAM_CORPUS, $USER)
26}
27
28sub usage {
29    print STDERR "Usage: $0 [username] [--client] [[-i index]|[spam_corpus] [nonspam_corpus]]\n";
30    exit(-1);
31}
32
33print "Taking Snapshot...\n";
34system("$CONFIG{'BINDIR'}/dspam_stats -r $USER");
35if (($? >> 8) != 0) {
36    print STDERR "Running dspam_stats for $USER failed.\n";
37    exit(-1);
38}
39
40if ($SPAM_CORPUS eq "-i") {
41    TrainOnIndex($NONSPAM_CORPUS);
42}
43else {
44    &Train("$NONSPAM_CORPUS", "$SPAM_CORPUS");
45}
46
47FinishTraining();
48
49
50sub TrainOnIndex {
51  my($index) = @_;
52  open(INDEX, "<$index") or die "Failed to open index file \"$index\": $!\n";
53
54  print "Training on $index index...\n";
55  while(<INDEX>) {
56    chomp;
57    my($class, $filename) = split(/\s+/);
58    if ($class eq "ham" || $class eq "nonspam") {
59        TestNonspam($filename);
60    } elsif ($class eq "spam") {
61        TestSpam($filename);
62    } else {
63        die "ERROR: Unknown class '$class'. Test Broken.";
64    }
65  }
66}
67
68sub Train {
69    my($nonspam, $spam) = @_;
70    my(@nonspam_corpus, @spam_corpus);
71    my($msg);
72
73    print "Training $nonspam / $spam corpora...\n";
74
75    @nonspam_corpus = GetFilesOrMessages($nonspam);
76    @spam_corpus = GetFilesOrMessages($spam);
77
78    while($#nonspam_corpus > -1 || $#spam_corpus > -1) {
79        if ($#nonspam_corpus > -1) {
80            my($count) = 0;
81
82            # Process nonspam until balanced
83            $msg = shift(@nonspam_corpus);
84            TestNonspam($msg);
85            if ($#spam_corpus > -1) {
86                $count = ($#nonspam_corpus+1) / ($#spam_corpus+1);
87            }
88            for(1..$count-1)
89            {
90                $msg = shift(@nonspam_corpus);
91                TestNonspam($msg);
92            }
93        }
94
95        if ($#spam_corpus > -1) {
96            my($count) = 0;
97            # Process spam until balanced
98            $msg = shift(@spam_corpus);
99            TestSpam($msg);
100            if ($#nonspam_corpus > -1) {
101                $count = ($#spam_corpus+1) / ($#nonspam_corpus+1);
102            }
103            for(1..$count-1)
104            {
105                $msg = shift(@spam_corpus);
106                TestSpam($msg);
107            }
108        }
109    }
110}
111
112sub FinishTraining() {
113    print "TRAINING COMPLETE\n";
114    print "\nTraining Snapshot:\n";
115    system("$CONFIG{'BINDIR'}/dspam_stats -S -s $USER");
116    if (($? >> 8) != 0) {
117        print STDERR "Running dspam_stats for $USER failed.\n";
118        exit(-1);
119    }
120    print "\nOverall Statistics:\n";
121    system("$CONFIG{'BINDIR'}/dspam_stats -S $USER");
122    if (($? >> 8) != 0) {
123        print STDERR "Running dspam_stats for $USER failed.\n";
124        exit(-1);
125    }
126}
127
128sub GetFilesOrMessages {
129    my ($corpus) = @_;
130    if (-d $corpus) {
131        return GetFiles($corpus);
132    }
133    elsif (-f $corpus) {
134        return GetMessages($corpus);
135    }
136    else {
137        die "Corpus \"$corpus\" must be either a MBOX file or a maildir directory.\n"
138    }
139}
140
141sub GetFiles {
142  my($corpus) = @_;
143  my(@files);
144
145  opendir(DIR, "$corpus") || die "$corpus: $!";
146  @files = grep(!/^\.\.?$/, readdir(DIR));
147  closedir(DIR);
148  return map { $_ = "$corpus/" . $_ } @files;
149}
150
151sub GetMessages {
152    my ($mbox) = @_;
153
154    die "Please install Mail::MboxParser module if you want to be able to use " .
155        "MBOX files for training.\n" unless eval { require Mail::MboxParser; };
156
157    # filter out special pseudo messages used by Pine/UW-IMAPd
158    return grep { $_->header->{subject} ne "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA" }
159            Mail::MboxParser->new($mbox)->get_messages();
160}
161
162sub TestNonspam {
163    my($msg) = @_;
164    TestAny($msg, "nonspam", "Innocent", "Whitelisted", "innocent", "fp")
165}
166
167sub TestSpam {
168    my($msg) = @_;
169    TestAny($msg, "spam   ", "Spam", "Blacklisted", "spam", "fn")
170}
171
172sub TestAny {
173    my($msg, $testname, $ok1, $ok2, $dspam_class, $short_class) = @_;
174    my $response;
175    print "[test: $testname] " . substr($msg . " " x 32, 0, 32) .  " result: ";
176    my $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout";
177    if ( -f $msg ) {
178        $response = `$cmd < '$msg'`;
179    }
180    else {
181        use FileHandle;
182        use IPC::Open2;
183        my ($dspam_in, $dspam_out);
184        my $pid = open2($dspam_out, $dspam_in, $cmd);
185        print $dspam_in $msg->as_string();
186        close $dspam_in;
187        $response = join('', <$dspam_out>);
188        waitpid $pid, 0
189    }
190
191    my $code = "UNKNOWN";
192    if ($response =~ /class="(\S+)"/i) {
193        $code = $1;
194    }
195    if ($code eq "UNKNOWN") {
196        # print "\n===== WOAH THERE =====\n";
197        # print "I was unable to parse the result. Test Broken.\n";
198        # print "======================\n";
199        # exit(0);
200        print "BROKEN result!!\n";
201        return;
202    }
203
204    if ($code eq $ok1 || $code eq $ok2) {
205        print "PASS";
206    } else {
207        my($class) = "UNKNOWN";
208        my($signature) = "UNKNOWN";
209        if ($response =~ /class="(\S+)"/i) {
210            $class = $1;
211        } else {
212            print "BROKEN class!!\n";
213            return;
214        }
215
216        if ($response =~ /signature=(\S+)/i) {
217            $signature = $1;
218        } else {
219            # print "\n===== WOAH THERE =====\n";
220            # print "I was unable to find the DSPAM signature. Test Broken.\n";
221            # print "======================\n";
222            # print "\n$response\n";
223            # exit(0);
224            print "BROKEN signature!!\n";
225            return;
226        }
227
228        print "FAIL ($class)";
229        if ($CONFIG{'SHOW_SUBJECTS'} == 1) {
230            print "\n\t[$short_class] ";
231            if ( -f $msg ) {
232                open(FILE, "<$msg");
233                while(<FILE>) {
234                    if (/^Subject:/i) {
235                        chomp;
236                        print $_;
237                        close(FILE);
238                    }
239                }
240                close(FILE);
241            }
242            else {
243                print $msg->header->{subject}
244            }
245        }
246        open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=$dspam_class " .
247                    "--source=error --signature=$signature");
248        close(TRAIN);
249    }
250    print "\n";
251}
Note: See TracBrowser for help on using the repository browser.