#!/usr/bin/perl # # $Id: dspam_train.in,v 1.01 2010/08/12 02:03:35 sbajic Exp $ use strict; use vars qw { $USER %CONFIG $SPAM_CORPUS $NONSPAM_CORPUS }; $CONFIG{'SHOW_SUBJECTS'} = 0; $CONFIG{'DSPAM_BINARY'} = '@bindir@/@dspam_transformed@'; $CONFIG{'BINDIR'} = '@bindir@'; ### DO NOT CONFIGURE ANYTHING BELOW THIS LINE $USER = shift; $SPAM_CORPUS = shift; if ($SPAM_CORPUS eq "--client" || $SPAM_CORPUS eq "-c") { $SPAM_CORPUS = shift; $CONFIG{'DSPAM_BINARY'} .= " --client"; } $NONSPAM_CORPUS = shift; if ($NONSPAM_CORPUS eq "") { # we were wrong about the first argument, it was the spam corpus and not # the user name in fact ($USER, $NONSPAM_CORPUS, $SPAM_CORPUS) = ((getpwuid($<))[0], $SPAM_CORPUS, $USER) } sub usage { print STDERR "Usage: $0 [username] [--client] [[-i index]|[spam_corpus] [nonspam_corpus]]\n"; exit(-1); } print "Taking Snapshot...\n"; system("$CONFIG{'BINDIR'}/dspam_stats -r $USER"); if (($? >> 8) != 0) { print STDERR "Running dspam_stats for $USER failed.\n"; exit(-1); } if ($SPAM_CORPUS eq "-i") { TrainOnIndex($NONSPAM_CORPUS); } else { &Train("$NONSPAM_CORPUS", "$SPAM_CORPUS"); } FinishTraining(); sub TrainOnIndex { my($index) = @_; open(INDEX, "<$index") or die "Failed to open index file \"$index\": $!\n"; print "Training on $index index...\n"; while() { chomp; my($class, $filename) = split(/\s+/); if ($class eq "ham" || $class eq "nonspam") { TestNonspam($filename); } elsif ($class eq "spam") { TestSpam($filename); } else { die "ERROR: Unknown class '$class'. Test Broken."; } } } sub Train { my($nonspam, $spam) = @_; my(@nonspam_corpus, @spam_corpus); my($msg); print "Training $nonspam / $spam corpora...\n"; @nonspam_corpus = GetFilesOrMessages($nonspam); @spam_corpus = GetFilesOrMessages($spam); while($#nonspam_corpus > -1 || $#spam_corpus > -1) { if ($#nonspam_corpus > -1) { my($count) = 0; # Process nonspam until balanced $msg = shift(@nonspam_corpus); TestNonspam($msg); if ($#spam_corpus > -1) { $count = ($#nonspam_corpus+1) / ($#spam_corpus+1); } for(1..$count-1) { $msg = shift(@nonspam_corpus); TestNonspam($msg); } } if ($#spam_corpus > -1) { my($count) = 0; # Process spam until balanced $msg = shift(@spam_corpus); TestSpam($msg); if ($#nonspam_corpus > -1) { $count = ($#spam_corpus+1) / ($#nonspam_corpus+1); } for(1..$count-1) { $msg = shift(@spam_corpus); TestSpam($msg); } } } } sub FinishTraining() { print "TRAINING COMPLETE\n"; print "\nTraining Snapshot:\n"; system("$CONFIG{'BINDIR'}/dspam_stats -S -s $USER"); if (($? >> 8) != 0) { print STDERR "Running dspam_stats for $USER failed.\n"; exit(-1); } print "\nOverall Statistics:\n"; system("$CONFIG{'BINDIR'}/dspam_stats -S $USER"); if (($? >> 8) != 0) { print STDERR "Running dspam_stats for $USER failed.\n"; exit(-1); } } sub GetFilesOrMessages { my ($corpus) = @_; if (-d $corpus) { return GetFiles($corpus); } elsif (-f $corpus) { return GetMessages($corpus); } else { die "Corpus \"$corpus\" must be either a MBOX file or a maildir directory.\n" } } sub GetFiles { my($corpus) = @_; my(@files); opendir(DIR, "$corpus") || die "$corpus: $!"; @files = grep(!/^\.\.?$/, readdir(DIR)); closedir(DIR); return map { $_ = "$corpus/" . $_ } @files; } sub GetMessages { my ($mbox) = @_; die "Please install Mail::MboxParser module if you want to be able to use " . "MBOX files for training.\n" unless eval { require Mail::MboxParser; }; # filter out special pseudo messages used by Pine/UW-IMAPd return grep { $_->header->{subject} ne "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA" } Mail::MboxParser->new($mbox)->get_messages(); } sub TestNonspam { my($msg) = @_; TestAny($msg, "nonspam", "Innocent", "Whitelisted", "innocent", "fp") } sub TestSpam { my($msg) = @_; TestAny($msg, "spam ", "Spam", "Blacklisted", "spam", "fn") } sub TestAny { my($msg, $testname, $ok1, $ok2, $dspam_class, $short_class) = @_; my $response; print "[test: $testname] " . substr($msg . " " x 32, 0, 32) . " result: "; my $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout"; if ( -f $msg ) { $response = `$cmd < '$msg'`; } else { use FileHandle; use IPC::Open2; my ($dspam_in, $dspam_out); my $pid = open2($dspam_out, $dspam_in, $cmd); print $dspam_in $msg->as_string(); close $dspam_in; $response = join('', <$dspam_out>); waitpid $pid, 0 } my $code = "UNKNOWN"; if ($response =~ /class="(\S+)"/i) { $code = $1; } if ($code eq "UNKNOWN") { # print "\n===== WOAH THERE =====\n"; # print "I was unable to parse the result. Test Broken.\n"; # print "======================\n"; # exit(0); print "BROKEN result!!\n"; return; } if ($code eq $ok1 || $code eq $ok2) { print "PASS"; } else { my($class) = "UNKNOWN"; my($signature) = "UNKNOWN"; if ($response =~ /class="(\S+)"/i) { $class = $1; } else { print "BROKEN class!!\n"; return; } if ($response =~ /signature=(\S+)/i) { $signature = $1; } else { # print "\n===== WOAH THERE =====\n"; # print "I was unable to find the DSPAM signature. Test Broken.\n"; # print "======================\n"; # print "\n$response\n"; # exit(0); print "BROKEN signature!!\n"; return; } print "FAIL ($class)"; if ($CONFIG{'SHOW_SUBJECTS'} == 1) { print "\n\t[$short_class] "; if ( -f $msg ) { open(FILE, "<$msg"); while() { if (/^Subject:/i) { chomp; print $_; close(FILE); } } close(FILE); } else { print $msg->header->{subject} } } open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=$dspam_class " . "--source=error --signature=$signature"); close(TRAIN); } print "\n"; }