source: npl/mailserver/dspam/dspam-3.10.2/src/libdspam.c @ c5c522c

gcc484ntopperl-5.22
Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100644
File size: 72.9 KB
Line 
1/* $Id: libdspam.c,v 1.205 2011/07/13 00:51:46 sbajic Exp $ */
2
3/*
4 DSPAM
5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as
9 published by the Free Software Foundation, either version 3 of the
10 License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20*/
21
22/*
23 * libdspam.c - DSPAM core analytical engine
24 *
25 * DESCRIPTION
26 *   libdspam is at the core of the decision making process and is called
27 *   by the agent to perform all tasks related to message classification.
28 *   The libdspam API functions are documented in libdspam(1).
29 */
30
31#ifndef STATIC_DRIVER
32void *_drv_handle;
33#endif
34
35#ifdef HAVE_CONFIG_H
36#include <auto-config.h>
37#endif
38
39#include <stdio.h>
40#include <stdlib.h>
41#include <math.h>
42#include <ctype.h>
43#include <errno.h>
44#include <string.h>
45#ifdef HAVE_UNISTD_H
46#include <unistd.h>
47#endif
48#include <sys/types.h>
49#include <sys/stat.h>
50#include <fcntl.h>
51#include <dlfcn.h>
52
53#ifdef TIME_WITH_SYS_TIME
54#   include <sys/time.h>
55#   include <time.h>
56#else
57#   ifdef HAVE_SYS_TIME_H
58#       include <sys/time.h>
59#   else
60#       include <time.h>
61#   endif
62#endif
63
64#include "config.h"
65#include "libdspam_objects.h"
66#include "libdspam.h"
67#include "nodetree.h"
68#include "config.h"
69#include "base64.h"
70#include "bnr.h"
71#include "util.h"
72#include "storage_driver.h"
73#include "buffer.h"
74#include "heap.h"
75#include "error.h"
76#include "decode.h"
77#include "language.h"
78
79#define CHI_S   0.1     /* Chi-Sq Strength */
80#define CHI_X   0.5000  /* Chi-Sq Assumed Probability */
81
82#define C1      16      /* Markov C1 */
83#define C2      1       /* Markov C2 */
84
85#ifdef DEBUG
86int DO_DEBUG = 0;
87#endif
88
89/*
90 * dspam_init()
91 *
92 * DESCRIPTION
93 *   The  dspam_init() function creates and initializes a new classification
94 *   context and attaches the context to whatever backend  storage  facility
95 *   was  configured. The user and group arguments provided are used to read
96 *   and write information stored for the user and group specified. The home
97 *   argument is used to configure libdspam's storage around the base direc-
98 *   tory specified. The mode specifies the operating mode to initialize the
99 *   classification context with and may be one of:
100 *
101 *    DSM_PROCESS   Process the message and return a result
102 *    DSM_CLASSIFY  Classify message only, no learning
103 *    DSM_TOOLS     No processing, attach to storage only
104 *
105 *   The  flags  provided further tune the classification context for a spe-
106 *   cific function. Multiple flags may be OR'd together.
107 *
108 *    DSF_SIGNATURE A binary signature is requested/provided
109 *    DSF_NOISE     Apply Bayesian Noise Reduction logic
110 *    DSF_WHITELIST Use automatic whitelisting logic
111 *    DSF_MERGED    Merge group metadata with user's in memory
112 *
113 * RETURN VALUES
114 *   Upon successful completion, dspam_init() will return a pointer to a new
115 *   classification context structure containing a copy of the configuration
116 *   passed into dspam_init(), a connected storage driver handle, and a  set
117 *   of preliminary user control data read from storage.
118 */
119
120DSPAM_CTX * dspam_init (
121  const char *username,
122  const char *group,
123  const char *home,
124  int operating_mode,
125  u_int32_t flags)
126{
127  DSPAM_CTX *CTX = dspam_create(username, group, home, operating_mode, flags);
128
129  if (CTX == NULL)
130    return NULL;
131
132  if (!dspam_attach(CTX, NULL))
133    return CTX;
134
135  dspam_destroy(CTX);
136
137  return NULL;
138}
139
140/* dspam_create()
141 *
142 * DESCRIPTION
143 *   The  dspam_create() function performs in exactly the same manner as the
144 *   dspam_init() function, but does not attach  to  storage.  Instead,  the
145 *   caller  must  also  call dspam_attach() after setting any storage- spe-
146 *   cific attributes using dspam_addattribute(). This is useful  for  cases
147 *   where  the  implementor  would  prefer  to configure storage internally
148 *   rather than having libdspam read a configuration from a file.
149 *
150 * RETURN VALUES
151 *   Upon successful completion, dspam_create() will return a pointer to a new
152 *   classification context structure containing a copy of the configuration
153 *   passed into dspam_create(). At this point, dspam_attach() must be called
154 *   for further processing.
155 */
156
157DSPAM_CTX * dspam_create (
158  const char *username,
159  const char *group,
160  const char *home,
161  int operating_mode,
162  u_int32_t flags)
163{
164  DSPAM_CTX *CTX;
165
166  CTX = calloc (1, sizeof (DSPAM_CTX));
167  if (CTX == NULL) {
168    LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context");
169    return NULL;
170  }
171
172  CTX->config = calloc(1, sizeof(struct _ds_config));
173  if (CTX->config == NULL) {
174    LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context configuration");
175    LOG(LOG_CRIT, ERR_MEM_ALLOC);
176    goto bail;
177  }
178
179  CTX->config->size = 128;
180  CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size));
181  if (CTX->config->attributes == NULL) {
182    LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context attributes");
183    LOG(LOG_CRIT, ERR_MEM_ALLOC);
184    goto bail;
185  }
186
187  if (home != NULL && home[0] != 0)
188    CTX->home = strdup (home);
189  else {
190#ifdef DSPAM_HOME
191    CTX->home = strdup(DSPAM_HOME);
192#else
193    CTX->home = NULL;
194#endif
195  }
196
197  if (username != NULL && username[0] != 0)
198    CTX->username = strdup (username);
199  else
200    CTX->username = NULL;
201
202  if (group != NULL && group[0] != 0)
203    CTX->group = strdup (group);
204  else
205    CTX->group = NULL;
206
207  CTX->probability     = DSP_UNCALCULATED;
208  CTX->operating_mode  = operating_mode;
209  CTX->flags           = flags;
210  CTX->message         = NULL;
211  CTX->confidence      = 0;
212  CTX->training_mode   = DST_TEFT;
213  CTX->wh_threshold    = 10;
214  CTX->training_buffer = 0;
215  CTX->classification  = DSR_NONE;
216  CTX->source          = DSS_NONE;
217  CTX->_sig_provided   = 0;
218  CTX->factors         = NULL;
219  CTX->algorithms      = 0;
220  CTX->tokenizer       = DSZ_WORD;
221
222  return CTX;
223
224bail:
225  if (CTX != NULL) {
226    if (CTX->config != NULL) {
227      if (CTX->config->attributes != NULL)
228        _ds_destroy_config(CTX->config->attributes);
229      free(CTX->config);
230    }
231    if (CTX->username != NULL)
232      free(CTX->username);
233    if (CTX->group != NULL)
234      free(CTX->group);
235    if (CTX->home != NULL)
236      free(CTX->home);
237    free(CTX);
238  }
239  return NULL;
240}
241
242/*
243 * dspam_clearattributes()
244 *
245 * DESCRIPTION
246 *  The dspam_clearattributes() function is called to clear any attributes
247 *  previously set using dspam_addattribute()  within  the  classification
248 *  context.  It is necessary to call this function prior to replacing any
249 *  attributes already written.
250 *
251 * RETURN VALUES
252 *  returns 0 on success, standard errors on failure
253 *
254 */
255
256int dspam_clearattributes (DSPAM_CTX * CTX) {
257
258  if (CTX->config) {
259    _ds_destroy_config(CTX->config->attributes);
260    free(CTX->config);
261  } else {
262    return EFAILURE;
263  }
264
265  CTX->config = calloc(1, sizeof(struct _ds_config));
266  if (CTX->config == NULL)
267    goto bail;
268  CTX->config->size = 128;
269  CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size));
270  if (CTX->config->attributes == NULL)
271    goto bail;
272
273  return 0;
274
275bail:
276  if (CTX->config != NULL) {
277    free(CTX->config);
278    CTX->config = NULL;
279  }
280  LOG(LOG_CRIT, ERR_MEM_ALLOC);
281  return EUNKNOWN;
282}
283
284/*
285 * dspam_addattribute()
286 *
287 * DESCRIPTION
288 *   The dspam_addattribute() function is called to  set  attributes  within
289 *   the  classification  context.  Some  storage drivers support the use of
290 *   passing specific attributes such as  server  connect  information.  The
291 *   driver-independent attributes supported by DSPAM include:
292 *
293 *    IgnoreHeader   Specify a specific header to ignore
294 *    LocalMX        Specify a local mail exchanger to assist in
295 *                   correct results from dspam_getsource().
296 *
297 *   Only  driver-dependent  attributes  need  be  set  prior  to  a call to
298 *   dspam_attach(). Driver-independent attributes may be  set  both  before
299 *   and after storage has been attached.
300 *
301 * RETURN VALUES
302 *   returns 0 on success, standard errors on failure
303 */                                                                                 
304int dspam_addattribute (DSPAM_CTX * CTX, const char *key, const char *value) {
305  int i, j = 0;
306                                                                               
307  if (_ds_find_attribute(CTX->config->attributes, key))
308    return _ds_add_attribute(CTX->config->attributes, key, value);
309                                                                               
310  for(i=0;CTX->config->attributes[i];i++)
311    j++;
312                                                                               
313  if (j >= CTX->config->size) {
314    config_t ptr;
315    CTX->config->size *= 2;
316    ptr = realloc(CTX->config->attributes,
317                  1+(sizeof(attribute_t)*CTX->config->size));
318    if (ptr) {
319      CTX->config->attributes = ptr;
320    } else {
321      LOG(LOG_CRIT, ERR_MEM_ALLOC);
322      return EFAILURE;
323    }
324  }
325                                                                               
326  return _ds_add_attribute(CTX->config->attributes, key, value);
327}
328
/*
 * dspam_attach()
 *
 * DESCRIPTION
 *   The dspam_attach() function attaches the storage interface to the
 *   classification context and, if dbh is NULL, also establishes an initial
 *   connection with storage. Some storage drivers support only a NULL
 *   value for dbh, while others (such as mysql_drv, pgsql_drv, and
 *   sqlite_drv) allow an open database handle to be attached. This function
 *   should only be called after an initial call to dspam_create() and
 *   should never be called if using dspam_init(), as storage is
 *   automatically attached by a call to dspam_init().
 *
 * RETURN VALUES
 *   returns 0 on success, standard errors on failure
 */
345
346int dspam_attach (DSPAM_CTX *CTX, void *dbh) {
347  if (!_ds_init_storage (CTX, dbh))
348    return 0;
349                                                                               
350  return EFAILURE;
351}
352
353/*
354 * dspam_detach()
355 *
356 * DESCRIPTION
357 *     The dspam_detach() function can be called when a detachment from  stor-
358 *     age  is desired, but the context is still needed. The storage driver is
359 *     closed, leaving the classification context in place. Once  the  context
360 *     is no longer needed, another call to dspam_destroy() should be made. If
361 *     you are closing storage and destroying the context at the same time, it
362 *     is   not  necessary  to  call  this  function.  Instead  you  may  call
363 *     dspam_destroy() directly.
364 *
365 * RETURN VALUES
366 *   returns 0 on success, standard errors on failure
367 */
368
369int
370dspam_detach (DSPAM_CTX * CTX)
371{
372  if (CTX->storage != NULL) {
373                                                                               
374    /* Sanity check totals before our shutdown call writes them */
375
376    if (CTX->totals.spam_learned < 0)
377      CTX->totals.spam_learned = 0;
378    if (CTX->totals.innocent_learned < 0)
379      CTX->totals.innocent_learned = 0;
380    if (CTX->totals.spam_misclassified < 0)
381      CTX->totals.spam_misclassified = 0;
382    if (CTX->totals.innocent_misclassified < 0)
383      CTX->totals.innocent_misclassified = 0;
384    if (CTX->totals.spam_classified < 0)
385      CTX->totals.spam_classified = 0;
386    if (CTX->totals.innocent_classified < 0)
387      CTX->totals.innocent_classified = 0;
388
389    _ds_shutdown_storage (CTX);
390    free(CTX->storage);
391    CTX->storage = NULL;
392  }
393
394  return 0;
395}
396
397/*
398 * dspam_destroy()
399 *
400 *     The dspam_destroy() function should be called when the  context  is  no
401 *     longer  needed.  If a connection was established to storage internally,
402 *     the connection is closed and all data is flushed and written. If a han-
403 *     dle was attached, the handle will remain open.
404 */
405
406void
407dspam_destroy (DSPAM_CTX * CTX)
408{
409  if (CTX->storage != NULL)
410    dspam_detach(CTX);
411
412  _ds_factor_destroy(CTX->factors);
413  if (CTX->config && CTX->config->attributes)
414    _ds_destroy_config (CTX->config->attributes);
415
416  free (CTX->config);
417  free (CTX->username);
418  free (CTX->group);
419  free (CTX->home);
420
421  if (! CTX->_sig_provided && CTX->signature != NULL) {
422    if (CTX->signature->data != NULL)
423      free (CTX->signature->data);
424    free (CTX->signature);
425  }
426
427  if (CTX->message)
428    _ds_destroy_message(CTX->message);
429  free (CTX);
430  return;
431}
432
/*
 * dspam_process()
 *
 * DESCRIPTION
 *   The dspam_process() function performs analysis of the message passed
 *   into it and will return zero on successful completion. If successful,
 *   CTX->result will be set to one of two classification results:
 *
 *    DSR_ISSPAM        Message was classified as spam
 *    DSR_ISINNOCENT    Message was classified as nonspam
 *
 * RETURN VALUES
 *   returns 0 on success
 *
 *   EINVAL    An invalid call or invalid parameter used.
 *   EUNKNOWN  Unexpected error, such as malloc() failure
 *   EFILE     Error opening or writing to a file or file handle
 *   ELOCK     Locking failure
 *   EFAILURE  The operation itself has failed
 */
453
454int
455dspam_process (DSPAM_CTX * CTX, const char *message)
456{
457#ifdef DEBUG
458  struct timeval tp1, tp2;
459  struct timezone tzp;
460#endif
461  buffer *header, *body;
462  int spam_result = 0, is_toe = 0, is_undertrain = 0, retcode = 0;
463
464#ifdef DEBUG
465  gettimeofday(&tp1, &tzp);
466#endif
467
468  if (CTX->signature != NULL)
469    CTX->_sig_provided = 1;
470
471  /* Sanity check context behavior */
472
473  if (CTX->operating_mode == DSM_CLASSIFY && CTX->classification != DSR_NONE)
474  {
475    LOG(LOG_WARNING, "DSM_CLASSIFY can't be used with a classification");
476    return EINVAL;
477  }
478
479  if (CTX->algorithms == 0)
480  {
481    LOG(LOG_WARNING, "No algorithms configured. Use CTX->algorithms and DSA_");
482    return EINVAL;
483  }
484
485  if (CTX->classification != DSR_NONE && CTX->source == DSS_NONE)
486  {
487    LOG(LOG_WARNING, "A classification requires a source be specified");
488    return EINVAL;
489  }
490
491  if (CTX->classification == DSR_NONE && CTX->source != DSS_NONE)
492  {
493    LOG(LOG_WARNING, "A source requires a classification be specified");
494    return EINVAL;
495  }
496
497  /* Set TOE mode pretrain option if we haven't seen many messages yet */
498  if (CTX->training_mode == DST_TOE
499  && (CTX->totals.innocent_learned <= 100 || CTX->totals.spam_learned <= 100)
500  && (!(CTX->algorithms & DSP_MARKOV)))
501  {
502    is_undertrain = 1;
503    CTX->training_mode = DST_TEFT;
504  }
505
506  /* Classify only for TOE / NOTRAIN mode setting if data is mature enough */
507  if ( CTX->operating_mode == DSM_PROCESS
508    && CTX->classification == DSR_NONE
509    && (CTX->training_mode == DST_TOE || CTX->training_mode == DST_NOTRAIN))
510  {
511    CTX->operating_mode = DSM_CLASSIFY;
512    is_toe = 1;
513  }
514
515  /* A signature has been presented for training; process it */
516  /* Non-SPBH Signature */
517  if (CTX->operating_mode == DSM_PROCESS
518   && CTX->classification != DSR_NONE
519   && CTX->flags & DSF_SIGNATURE
520   && (CTX->tokenizer != DSZ_SBPH))
521  {
522    retcode = _ds_process_signature (CTX);
523    goto restore_mode;
524  }
525
526  header = buffer_create (NULL);
527  body   = buffer_create (NULL);
528  if (header == NULL || body == NULL)
529  {
530    LOG (LOG_CRIT, ERR_MEM_ALLOC);
531    buffer_destroy (header);
532    buffer_destroy (body);
533    retcode = EUNKNOWN;
534    goto restore_mode;
535  }
536
537  /* Parse the message if it hasn't already been by the client app */
538  if (!CTX->message && message)
539    CTX->message = _ds_actualize_message (message);
540
541  /* Analyze and filter (unless it's a signature based classification) */
542  if (! (CTX->flags & DSF_SIGNATURE
543     && CTX->operating_mode == DSM_CLASSIFY
544      && CTX->signature != NULL))
545  {
546    _ds_degenerate_message(CTX, header, body);
547  }
548
549  /*** Perform statistical operations and get a classification result ***/
550
551  /* Initialize */
552  CTX->result = DSR_NONE;
553
554  /* If SBPH reclassification, recall and operate on saved SBPH text */
555
556  if ( CTX->tokenizer == DSZ_SBPH
557    && CTX->operating_mode != DSM_CLASSIFY
558    && CTX->classification != DSR_NONE
559    && CTX->flags & DSF_SIGNATURE)
560  {
561    char *y, *h, *b;
562    char *ptrptr = NULL;
563
564    y = strdup((const char *) CTX->signature->data);
565    h = strtok_r(y, "\001", &ptrptr);
566    b = strtok_r(NULL, "\001", &ptrptr);
567    spam_result = _ds_operate (CTX, h, b);
568    free(y);
569
570  /* Otherwise, operate on the input message */
571
572  } else {
573    spam_result = _ds_operate (CTX, header->data, body->data);
574  }
575
576  /* Clean up */
577  buffer_destroy (header);
578  buffer_destroy (body);
579
580  /* _ds_operate() was unable to process message. Restore operating and training mode. */
581  if (spam_result != DSR_ISSPAM && spam_result != DSR_ISINNOCENT) {
582    LOG(LOG_WARNING, "received invalid result (!DSR_ISSPAM && !DSR_ISINNOCENT)"
583                     ": %d", spam_result);
584    retcode = EFAILURE;
585    goto restore_mode;
586  }
587
588  /* Force decision if a classification was specified */
589  if (CTX->classification != DSR_NONE) {
590    if (CTX->classification == DSR_ISINNOCENT)
591      spam_result = DSR_ISINNOCENT;
592    else if (CTX->classification == DSR_ISSPAM)
593      spam_result = DSR_ISSPAM;
594  }
595
596  /* Apply results to context */
597  CTX->result = spam_result;
598  if (CTX->class[0] == 0) {
599    if (spam_result == DSR_ISSPAM)
600      strcpy(CTX->class, LANG_CLASS_SPAM);
601    else if (spam_result == DSR_ISINNOCENT)
602      strcpy(CTX->class, LANG_CLASS_INNOCENT);
603  }
604
605/* Restore operating mode and/or training mode */
606restore_mode:
607
608  if (is_toe)
609    CTX->operating_mode = DSM_PROCESS;
610  if (is_undertrain)
611    CTX->training_mode = DST_TOE;
612
613#ifdef DEBUG
614  if (DO_DEBUG) {
615    if (CTX->source == DSS_NONE) {
616      gettimeofday(&tp2, &tzp);
617      LOGDEBUG("total processing time: %01.5fs",
618         (double) (tp2.tv_sec + (tp2.tv_usec / 1000000.0)) -
619         (double) (tp1.tv_sec + (tp1.tv_usec / 1000000.0)));
620    }
621  }
622#endif
623
624  return retcode;
625}
626
627/*
628 * dspam_getsource()
629 *
630 * DESCRIPTION
631 *
632 *   The dspam_getsource() function extracts the source sender from the mes-
633 *   sage  passed  in  during  a call to dspam_process() and writes not more
634 *   than size bytes to buf.
635 *
636 * RETURN VALUES
637 *   returns 0 on success, standard errors on failure
638 */
639
640int
641dspam_getsource (
642  DSPAM_CTX * CTX,
643  char *buf,
644  size_t size)
645{
646  ds_message_part_t current_block;
647  ds_header_t current_heading = NULL;
648  struct nt_node *node_nt;
649  struct nt_c c;
650  char qmailmode = 0;
651
652  if (CTX->message == NULL)
653    return EINVAL;
654
655  node_nt = c_nt_first (CTX->message->components, &c);
656  if (node_nt == NULL)
657    return EINVAL;
658
659  current_block = (ds_message_part_t) node_nt->ptr;
660
661  node_nt = c_nt_first (current_block->headers, &c);
662  while (node_nt != NULL)
663  {
664    current_heading = (ds_header_t) node_nt->ptr;
665    if (!strcmp (current_heading->heading, "Received"))
666    {
667      char *data, *ptr, *tok;
668
669      // detect and skip "Received: (qmail..." lines
670      if (!strncmp(current_heading->data, "(qmail", 6))
671      {
672        qmailmode = 1;
673        node_nt = c_nt_next (current_block->headers, &c);
674        continue;
675      }
676
677      data = strdup (current_heading->data);
678      ptr = strstr (data, "from");
679
680      if (ptr != NULL)
681      {
682        if (strchr(data, '['))  // found a non-qmail header
683        {
684          qmailmode = 0;
685        }
686
687        // qmail puts the sending IP inside the last "()" pair of the line
688        if (qmailmode)
689        {
690          tok = strrchr(data, ')');
691
692          if (tok != NULL)
693          {
694              *tok = 0;
695              tok = strrchr(data, '(');
696              if (tok != NULL)
697                tok++;
698          }
699        }
700        else
701        {
702          char *ptrptr = NULL;
703          tok = strtok_r (ptr, "[", &ptrptr);
704
705          if (tok != NULL)
706          {
707            tok = strtok_r (NULL, "]", &ptrptr);
708          }
709        }
710        if (tok != NULL)
711        {
712          int whitelisted = 0;
713          if (!strncmp (tok, "127.",4) ||        // ignore localhost
714              !strncmp (tok, "10.", 3) ||        // ignore RFC 1918 private addresses
715              !strncmp (tok, "172.16.", 7) ||
716              !strncmp (tok, "192.168.", 8) ||
717              !strncmp (tok, "169.254.", 8))     // ignore local-link
718            whitelisted = 1;
719
720          if (_ds_match_attribute(CTX->config->attributes, "LocalMX", tok))
721            whitelisted = 1;
722
723          if (!whitelisted)
724          {
725            strlcpy (buf, tok, size);
726            free (data);
727            return 0;
728          }
729        }
730      }
731      free (data);
732    }
733    node_nt = c_nt_next (current_block->headers, &c);
734  }
735
736  return EFAILURE;
737}
738
/*
 * _ds_operate() - operate on the message
 *
 * DESCRIPTION
 *    calculate the statistical probability the email is spam
 *    update tokens in dictionary according to result/mode
 *
 * INPUT ARGUMENTS
 *     DSPAM_CTX *CTX    pointer to context
 *     char *headers     pointer to message headers
 *     char *body        pointer to message body
 *
 * RETURN VALUES
 *   standard errors on failure
 *
 *     DSR_ISSPAM           message is spam
 *     DSR_ISINNOCENT       message is innocent
 */
757
758int
759_ds_operate (DSPAM_CTX * CTX, char *headers, char *body)
760{
761  int errcode = 0;
762
763  /* Create our diction (lexical data in message) and patterns */
764
765  ds_diction_t diction = ds_diction_create(24593ul);
766  ds_diction_t bnr_patterns = NULL;
767  ds_term_t ds_term;
768  ds_cursor_t ds_c;
769
770  ds_heap_t heap_sort = NULL;    /* Heap sort for top N tokens */
771
772#ifdef LIBBNR_DEBUG
773  ds_heap_t heap_nobnr = NULL;
774#endif
775
776  unsigned long long whitelist_token = 0;
777  int do_whitelist = 0;
778  int result;
779  unsigned int heap_sort_items = 0;
780
781  if (CTX->algorithms & DSA_BURTON)
782    heap_sort = ds_heap_create(BURTON_WINDOW_SIZE, HP_DELTA);
783  else if (CTX->algorithms & DSA_ROBINSON)
784    heap_sort = ds_heap_create(25, HP_DELTA);
785  else
786    heap_sort = ds_heap_create(15, HP_DELTA);
787
788  /* Allocate SBPH signature (stored as message text) */
789
790  if ( CTX->tokenizer == DSZ_SBPH
791    && CTX->flags & DSF_SIGNATURE
792    && ( (  CTX->operating_mode != DSM_CLASSIFY
793         && CTX->classification == DSR_NONE)
794       || ! (CTX->_sig_provided))
795    && CTX->source != DSS_CORPUS)
796  {
797    if (CTX->signature) {
798      if (CTX->signature->data)
799        free(CTX->signature->data);
800      free(CTX->signature);
801      CTX->signature = NULL;
802    }
803    CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
804    if (CTX->signature == NULL)
805    {
806      LOG (LOG_CRIT, "memory allocation error");
807      errcode = EUNKNOWN;
808      goto bail;
809    }
810                                                                               
811    CTX->signature->length = strlen(headers)+strlen(body)+2;
812    CTX->signature->data = malloc(CTX->signature->length);
813
814    if (CTX->signature->data == NULL)
815    {
816      LOG (LOG_CRIT, "memory allocation error");
817      free (CTX->signature);
818      CTX->signature = NULL;
819      errcode = EUNKNOWN;
820      goto bail;
821    }
822
823    strcpy(CTX->signature->data, headers);
824    strcat(CTX->signature->data, "\001");
825    strcat(CTX->signature->data, body);
826  }
827
828  if (!diction)
829  {
830    LOG (LOG_CRIT, ERR_MEM_ALLOC);
831    errcode = EUNKNOWN;
832    goto bail;
833  }
834
835#ifdef LIBBNR_DEBUG
836  heap_nobnr = ds_heap_create (heap_sort->size, HP_DELTA);
837  if (heap_nobnr == NULL) {
838    LOG (LOG_CRIT, ERR_MEM_ALLOC);
839    errcode = EUNKNOWN;
840    goto bail;
841  }
842#endif
843
844  CTX->result =
845    (CTX->classification == DSR_ISSPAM) ? DSR_ISSPAM : DSR_ISINNOCENT;
846
847  /* If we are classifying based on a signature, preprogram the tree */
848
849  if (CTX->flags & DSF_SIGNATURE          &&
850      CTX->operating_mode == DSM_CLASSIFY &&
851      CTX->_sig_provided)
852  {
853    int num_tokens =
854      CTX->signature->length / sizeof (struct _ds_signature_token);
855    struct _ds_signature_token t;
856
857    int i;
858    for (i = 0; i < num_tokens; i++)
859    {
860      char x[128];
861      memcpy (&t,
862              (char *) CTX->signature->data +
863              (i * sizeof (struct _ds_signature_token)),
864              sizeof (struct _ds_signature_token));
865      snprintf (x, sizeof (x), "E: %" LLU_FMT_SPEC, t.token);
866      ds_term = ds_diction_touch(diction, t.token, x, 0);
867      if (ds_term)
868        ds_term->frequency = t.frequency;
869    }
870  }
871
872  /* Otherwise, tokenize the message and propagate the tree */
873
874  else
875  {
876    if (_ds_tokenize(CTX, headers, body, diction)) {
877      LOG(LOG_CRIT, "tokenizer failed");
878    }
879    whitelist_token = diction->whitelist_token;
880  }
881
882
883  /* Load all token statistics */
884  if (_ds_getall_spamrecords (CTX, diction))
885  {
886    LOGDEBUG ("_ds_getall_spamrecords() failed");
887    errcode = EUNKNOWN;
888    goto bail;
889  }
890
891  /* Apply Bayesian Noise Reduction */
892  if (CTX->flags & DSF_NOISE)
893  {
894    ds_diction_t p = _ds_apply_bnr(CTX, diction);
895    if (p)
896      ds_diction_destroy(p);
897  }
898
899  if (CTX->flags & DSF_WHITELIST)
900  {
901    LOGDEBUG("Whitelist threshold: %d", CTX->wh_threshold);
902  }
903
904  /* Create a heap sort based on the token's delta from .5 */
905  ds_c = ds_diction_cursor(diction);
906  ds_term = ds_diction_next(ds_c);
907  while(ds_term)
908  {
909
910    if (ds_term->key == CONTROL_TOKEN) {
911      ds_term = ds_diction_next(ds_c);
912      continue;
913    }
914
915    if (ds_term->s.probability == 0.00000 || CTX->classification != DSR_NONE)
916      _ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL);
917
918    if (CTX->flags & DSF_WHITELIST) {
919      if (ds_term->key == whitelist_token              &&
920          ds_term->s.spam_hits <= (ds_term->s.innocent_hits / 15) &&
921          ds_term->s.innocent_hits > CTX->wh_threshold &&
922          CTX->classification == DSR_NONE)
923      {
924        do_whitelist = 1;
925      }
926    }
927
928    if (ds_term->frequency > 0 && ds_term->type == 'D')
929    {
930      ds_heap_insert (heap_sort, ds_term->s.probability, ds_term->key,
931             ds_term->frequency, _ds_compute_complexity(ds_term->name));
932    }
933
934#ifdef LIBBNR_DEBUG
935    if (ds_term->type == 'D')
936    {
937      ds_heap_insert (heap_nobnr, ds_term->s.probability, ds_term->key,
938             ds_term->frequency, _ds_compute_complexity(ds_term->name));
939    }
940#endif
941
942#ifdef VERBOSE
943    LOGDEBUG ("Token: %s [%f] SH %ld IH %ld", ds_term->name, ds_term->s.probability, ds_term->s.spam_hits, ds_term->s.innocent_hits);
944#endif
945
946    ds_term = ds_diction_next(ds_c);
947  }
948  ds_diction_close(ds_c);
949
950  /* Keep track of items in heap_sort. We need that info later on when freeing the signature */
951  heap_sort_items = heap_sort->items;
952
953  /* Take the 15 most interesting tokens and generate a score */
954
955  if (heap_sort->items == 0)
956  {
957    LOGDEBUG ("no tokens found in message");
958    errcode = EINVAL;
959    goto bail;
960  }
961
962  /* Initialize Non-SBPH signature, if requested */
963
964  if ( CTX->tokenizer != DSZ_SBPH
965    && CTX->flags & DSF_SIGNATURE
966    && (CTX->operating_mode != DSM_CLASSIFY || ! CTX->_sig_provided))
967  {
968    if (CTX->signature) {
969      if (CTX->signature->data)
970        free(CTX->signature->data);
971      free(CTX->signature);
972      CTX->signature = NULL;
973    }
974    CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
975    if (CTX->signature == NULL)
976    {
977      LOG (LOG_CRIT, "memory allocation error");
978      errcode = EUNKNOWN;
979      goto bail;
980    }
981
982    CTX->signature->length =
983      sizeof (struct _ds_signature_token) * diction->items;
984    CTX->signature->data = malloc (CTX->signature->length);
985    if (CTX->signature->data == NULL)
986    {
987      LOG (LOG_CRIT, "memory allocation error");
988      free (CTX->signature);
989      CTX->signature = NULL;
990      errcode = EUNKNOWN;
991      goto bail;
992    }
993  }
994
995#ifdef LIBBNR_DEBUG
996  {
997    int x = CTX->result;
998    int nobnr_result = 0;
999
1000    if (CTX->flags & DSF_NOISE) {
1001      nobnr_result = _ds_calc_result(CTX, heap_nobnr, diction);
1002
1003      if (CTX->factors) {
1004        _ds_factor_destroy(CTX->factors);
1005        CTX->factors = NULL;
1006      }
1007      CTX->result = x;
1008      CTX->probability = DSP_UNCALCULATED;
1009    }
1010#endif
1011
1012    result = _ds_calc_result(CTX, heap_sort, diction);
1013
1014#ifdef LIBBNR_DEBUG
1015    if (CTX->flags & DSF_NOISE) {
1016      if (nobnr_result == result) {
1017        LOGDEBUG("BNR Decision Concurs");
1018      } else {
1019        LOGDEBUG("BNR Decision Conflicts: %d (BNR) / %d (No BNR)", result, nobnr_result);
1020      }
1021    }
1022  }
1023#endif
1024
1025  if (CTX->flags & DSF_WHITELIST && do_whitelist) {
1026    LOGDEBUG("auto-whitelisting this message");
1027    CTX->result = DSR_ISINNOCENT;
1028    strcpy(CTX->class, LANG_CLASS_WHITELISTED);
1029  }
1030
1031  /* Update Totals */
1032
1033  /* SPAM */
1034  if (CTX->result == DSR_ISSPAM && CTX->operating_mode != DSM_CLASSIFY)
1035  {
1036    if (!(CTX->flags & DSF_UNLEARN)) {
1037      CTX->totals.spam_learned++;
1038      CTX->learned = 1;
1039    }
1040
1041    if (CTX->classification == DSR_ISSPAM)
1042    {
1043      if (CTX->flags & DSF_UNLEARN) {
1044        CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1045      } else if (CTX->source == DSS_CORPUS || CTX->source == DSS_INOCULATION) {
1046        CTX->totals.spam_corpusfed++;
1047      }
1048      else if (SPAM_MISS(CTX))
1049      {
1050        CTX->totals.spam_misclassified++;
1051        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1052        {
1053          CTX->totals.innocent_learned -=
1054            (CTX->totals.innocent_learned > 0) ? 1 : 0;
1055        }
1056      }
1057    }
1058
1059    /* INNOCENT */
1060  }
1061  else if ((CTX->result == DSR_ISINNOCENT) &&
1062            CTX->operating_mode != DSM_CLASSIFY)
1063  {
1064    if (!(CTX->flags & DSF_UNLEARN)) {
1065      CTX->totals.innocent_learned++;
1066      CTX->learned = 1;
1067    }
1068
1069    if (CTX->source == DSS_CORPUS || CTX->source == DSS_INOCULATION)
1070    {
1071      CTX->totals.innocent_corpusfed++;
1072    }
1073    else if (FALSE_POSITIVE(CTX))
1074    {
1075      if (CTX->flags & DSF_UNLEARN) {
1076        CTX->totals.innocent_learned -= (CTX->totals.innocent_learned >0) ? 1:0;
1077      } else {
1078        CTX->totals.innocent_misclassified++;
1079        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1080        {
1081          CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1082        }
1083      }
1084    }
1085  }
1086
1087  /* TOE mode increments 'classified' totals */
1088  if (CTX->training_mode == DST_TOE && CTX->operating_mode == DSM_CLASSIFY) {
1089    if (CTX->result == DSR_ISSPAM)
1090      CTX->totals.spam_classified++;
1091    else if (CTX->result == DSR_ISINNOCENT)
1092      CTX->totals.innocent_classified++;
1093  }
1094
1095  _ds_increment_tokens(CTX, diction);
1096
1097  /* Store all tokens */
1098  if (CTX->training_mode != DST_NOTRAIN) {
1099    if (_ds_setall_spamrecords (CTX, diction))
1100    {
1101      LOGDEBUG ("_ds_setall_spamrecords() failed");
1102      errcode = EUNKNOWN;
1103      goto bail;
1104    }
1105  }
1106
1107  ds_diction_destroy (diction);
1108  ds_heap_destroy (heap_sort);
1109#ifdef LIBBNR_DEBUG
1110  ds_heap_destroy (heap_nobnr);
1111#endif
1112
1113  /* One final sanity check */
1114
1115  if (CTX->classification == DSR_ISINNOCENT)
1116  {
1117    CTX->probability = 0.0;
1118    CTX->result = DSR_ISINNOCENT;
1119  }
1120  else if (CTX->classification == DSR_ISSPAM)
1121  {
1122    CTX->probability = 1.0;
1123    CTX->result = DSR_ISSPAM;
1124  }
1125
1126  return CTX->result;
1127
1128bail:
1129  LOG(LOG_ERR, "bailing on error %d", errcode);
1130  ds_heap_destroy (heap_sort);
1131#ifdef LIBBNR_DEBUG
1132  ds_heap_destroy (heap_nobnr);
1133#endif
1134  ds_diction_destroy(diction);
1135  ds_diction_destroy(bnr_patterns);
1136  if (CTX->signature != NULL) {
1137    if (CTX->signature->data != NULL) {
1138      free(CTX->signature->data);
1139      CTX->signature->data = NULL;
1140    }
1141    if (CTX->signature != NULL && heap_sort_items > 0)
1142      free (CTX->signature);
1143    CTX->signature = NULL;
1144  }
1145  return errcode;
1146}
1147
1148/*
1149 * _ds_process_signature()
1150 *
1151 * DESCRIPTION
1152 *   process an erroneously classified message processing based on signature
1153 *
1154 * INPUT ARGUMENTS
1155 *   parameters: DSPAM_CTX *CTX         Pointer to context containing signature
1156 */
1157
1158int
1159_ds_process_signature (DSPAM_CTX * CTX)
1160{
1161  struct _ds_signature_token t;
1162  int num_tokens, i;
1163  ds_diction_t diction = ds_diction_create(24593ul);
1164  ds_term_t ds_term;
1165  ds_cursor_t ds_c;
1166  int occurrence = _ds_match_attribute(CTX->config->attributes,
1167     "ProcessorWordFrequency", "occurrence");
1168
1169  if (diction == NULL) {
1170    LOG (LOG_CRIT, ERR_MEM_ALLOC);
1171    return EUNKNOWN;
1172  }
1173
1174  if (CTX->signature == NULL) {
1175    LOG(LOG_WARNING, "DSF_SIGNATURE specified, but no signature provided.");
1176    ds_diction_destroy(diction);
1177    return EINVAL;
1178  }
1179
1180  LOGDEBUG ("processing signature.  length: %ld", CTX->signature->length);
1181
1182  CTX->result = DSR_NONE;
1183
1184  if (!(CTX->flags & DSF_UNLEARN))
1185    CTX->learned = 1;
1186
1187  /* INNOCENT */
1188  if (CTX->classification == DSR_ISINNOCENT &&
1189      CTX->operating_mode != DSM_CLASSIFY)
1190  {
1191    if (CTX->flags & DSF_UNLEARN) {
1192      CTX->totals.innocent_learned -= (CTX->totals.innocent_learned) > 0 ? 1:0;
1193    } else {
1194      if (CTX->source == DSS_ERROR) {
1195        CTX->totals.innocent_misclassified++;
1196        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1197        {
1198          CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1:0;
1199        }
1200      } else {
1201        CTX->totals.innocent_corpusfed++;
1202      }
1203
1204      CTX->totals.innocent_learned++;
1205    }
1206  }
1207
1208  /* SPAM */
1209  else if (CTX->classification == DSR_ISSPAM &&
1210           CTX->operating_mode != DSM_CLASSIFY)
1211  {
1212    if (CTX->flags & DSF_UNLEARN) {
1213      CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1214    } else {
1215      if (CTX->source == DSS_ERROR) {
1216        CTX->totals.spam_misclassified++;
1217        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1218        {
1219          CTX->totals.innocent_learned -= (CTX->totals.innocent_learned > 0) ? 1:0;
1220        }
1221      } else {
1222        CTX->totals.spam_corpusfed++;
1223      }
1224      CTX->totals.spam_learned++;
1225    }
1226  }
1227
1228  num_tokens = CTX->signature->length / sizeof (struct _ds_signature_token);
1229
1230  if (CTX->class[0] == 0) {
1231    if (CTX->classification == DSR_ISSPAM)
1232      strcpy(CTX->class, LANG_CLASS_SPAM);
1233    else if (CTX->classification == DSR_ISINNOCENT)
1234      strcpy(CTX->class, LANG_CLASS_INNOCENT);
1235  }
1236
1237  /* Don't retrain if no tokens where loaded from the signature */
1238  if (num_tokens == 0)
1239  {
1240    LOG (LOG_WARNING, "Skipping retraining for signature with %d tokens", num_tokens);
1241    LOGDEBUG ("Skipping retraining for signature with %d tokens", num_tokens);
1242  } else {
1243    LOGDEBUG ("Reversing %d tokens", num_tokens);
1244    for (i = 0; i < num_tokens; i++)
1245    {
1246      memcpy (&t,
1247              (char *) CTX->signature->data +
1248              (i * sizeof (struct _ds_signature_token)),
1249              sizeof (struct _ds_signature_token));
1250      ds_term = ds_diction_touch (diction, t.token, "-", 0);
1251      if (ds_term)
1252      {
1253        ds_term->frequency = t.frequency;
1254      }
1255    }
1256
1257    if (_ds_getall_spamrecords (CTX, diction)) {
1258      ds_diction_destroy(diction);
1259      return EUNKNOWN;
1260    }
1261
1262    ds_c = ds_diction_cursor(diction);
1263    ds_term = ds_diction_next(ds_c);
1264    while(ds_term)
1265    {
1266      /* INNOCENT */
1267      if (CTX->classification == DSR_ISINNOCENT)
1268      {
1269        if (CTX->flags & DSF_UNLEARN)
1270        {
1271          if (occurrence)
1272          {
1273            ds_term->s.innocent_hits -= ds_term->frequency;
1274            if (ds_term->s.innocent_hits < 0)
1275              ds_term->s.innocent_hits = 0;
1276          } else {
1277            ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
1278          }
1279        } else {
1280          if (CTX->source == DSS_ERROR          &&
1281              CTX->training_mode != DST_NOTRAIN &&
1282              CTX->training_mode != DST_TOE)
1283          {
1284            if (occurrence)
1285            {
1286              ds_term->s.spam_hits -= ds_term->frequency;
1287              if (ds_term->s.spam_hits < 0)
1288                ds_term->s.spam_hits = 0;
1289            } else {
1290              ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
1291            }
1292          }
1293
1294          if (CTX->source == DSS_INOCULATION)
1295          {
1296            if (ds_term->s.spam_hits < 2 && ds_term->s.innocent_hits < 5)
1297            {
1298              ds_term->s.innocent_hits += 5;
1299            }
1300            else
1301            {
1302              ds_term->s.innocent_hits += 2;
1303            }
1304          } else /* ERROR or CORPUS */
1305          {
1306            if (occurrence)
1307            {
1308              ds_term->s.innocent_hits += ds_term->frequency;
1309            } else {
1310              ds_term->s.innocent_hits++;
1311            }
1312          }
1313        }
1314      }
1315
1316      /* SPAM */
1317      else if (CTX->classification == DSR_ISSPAM)
1318      {
1319        if (CTX->flags & DSF_UNLEARN)
1320        {
1321          if (occurrence)
1322          {
1323            ds_term->s.spam_hits -= ds_term->frequency;
1324            if (ds_term->s.spam_hits < 0)
1325              ds_term->s.spam_hits = 0;
1326          } else {
1327            ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
1328          }
1329        } else {
1330          if (CTX->source == DSS_ERROR          &&
1331              CTX->training_mode != DST_NOTRAIN &&
1332              CTX->training_mode != DST_TOE)
1333          {
1334            if (occurrence)
1335            {
1336              ds_term->s.innocent_hits -= ds_term->frequency;
1337              if (ds_term->s.innocent_hits < 0)
1338                ds_term->s.innocent_hits = 0;
1339            } else {
1340              ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
1341            }
1342          }
1343
1344          if (CTX->source == DSS_INOCULATION)
1345          {
1346            if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5)
1347            {
1348              ds_term->s.spam_hits += 5;
1349            }
1350            else
1351            {
1352              ds_term->s.spam_hits += 2;
1353            }
1354          } else /* ERROR or CORPUS */
1355          {
1356            if (occurrence)
1357            {
1358              ds_term->s.spam_hits += ds_term->frequency;
1359            } else {
1360              ds_term->s.spam_hits++;
1361            }
1362          }
1363        }
1364      }
1365
1366      ds_term->s.status |= TST_DIRTY;
1367      ds_term = ds_diction_next(ds_c);
1368    }
1369    ds_diction_close(ds_c);
1370
1371    if (CTX->training_mode != DST_NOTRAIN) {
1372      if (_ds_setall_spamrecords (CTX, diction)) {
1373        ds_diction_destroy(diction);
1374        return EUNKNOWN;
1375      }
1376    }
1377  }
1378
1379  if (CTX->classification == DSR_ISSPAM)
1380  {
1381    CTX->probability = 1.0;
1382    CTX->result = DSR_ISSPAM;
1383    LOGDEBUG ("Message classification/result: SPAM");
1384  }
1385  else
1386  {
1387    CTX->probability = 0.0;
1388    CTX->result = DSR_ISINNOCENT;
1389    LOGDEBUG ("Message classification/result: INNOCENT");
1390  }
1391
1392  ds_diction_destroy(diction);
1393  return 0;
1394}
1395
1396/*
1397 *  _ds_calc_stat() - Calculate the probability of a token
1398 *
1399 * DESCRIPTION
1400 *
1401 *  Calculates the probability of an individual token based on  the
1402 *  pvalue algorithm chosen. The resulting value largely depends on
1403 *  the total  amount of ham/spam in the user's corpus. The result
1404 *  is written to s.
1405 *
1406 * INPUT ARGUMENTS
1407 *      CTX           DSPAM context
1408 *      term          ds_term_t
1409 *      token_type    DTT_ value specifying token type
1410 *      bnr_tot       BNR totals structure
1411 */
1412
1413int
1414_ds_calc_stat (
1415  DSPAM_CTX * CTX,
1416  ds_term_t term,
1417  struct _ds_spam_stat *s,
1418  int token_type,
1419  struct _ds_spam_stat *bnr_tot)
1420{
1421  int min_hits, sed_hits = 0;
1422  unsigned long ti, ts;
1423
1424  if (token_type == DTT_BNR) {
1425    min_hits = 25; /* Bayesian Noise Reduction patterns */
1426
1427  } else {
1428    min_hits = 5; /* "Standard" token threshold */
1429  }
1430
1431  /*  Statistical Sedation: Adjust hapaxial threshold to compensate for a
1432   *  spam corpus imbalance
1433   */
1434
1435  ti = CTX->totals.innocent_learned + CTX->totals.innocent_classified;
1436  ts = CTX->totals.spam_learned + CTX->totals.spam_classified;
1437  if (CTX->training_buffer>0) {
1438    if (ti < 1000 && ti < ts)
1439    {
1440      sed_hits = min_hits+(CTX->training_buffer/2)+
1441                   (CTX->training_buffer*((ts-ti)/200));
1442    }
1443
1444    if (ti < 2500 && ti >=1000 && ts > ti)
1445    {
1446      float spams = (ts * 1.0 / (ts * 1.0 + ti * 1.0)) * 100;
1447      sed_hits = min_hits+(CTX->training_buffer/2)+
1448                   (CTX->training_buffer*(spams/20));
1449    }
1450  } else if (! CTX->training_buffer) {
1451    min_hits = 5;
1452  }
1453
1454  if (token_type != DTT_DEFAULT || sed_hits > min_hits)
1455    min_hits = sed_hits;
1456
1457  /*  TUM mode training only records up to 20 hits so we need to make sure we
1458   *  don't require more than that.
1459   */
1460
1461  if (CTX->training_mode == DST_TUM && min_hits > 20)
1462    min_hits = 20;
1463
1464  if (CTX->classification == DSR_ISSPAM)
1465    s->probability = .7;
1466  else
1467    s->probability = (CTX->algorithms & DSP_MARKOV) ? .5 : .4;
1468
1469  /* Markovian Weighting */
1470
1471  if (CTX->algorithms & DSP_MARKOV) {
1472    unsigned int weight;
1473    long num, den;
1474
1475    /*  some utilities don't provide the token name, and so we can't compute
1476     *  a probability. just return something neutral.
1477     */
1478    if (term == NULL) {
1479      s->probability = .5;
1480      return 0;
1481    }
1482
1483    /* return neutral probability for BNR patterns */
1484    if (token_type == DTT_BNR || term->type == 'B' || !strncmp(term->name, "bnr.", 4)) {
1485      s->probability = .5;
1486      return 0;
1487    }
1488
1489    /* return neutral probability for frequency tokens */
1490    if (!strncmp(term->name, "E: ", 3)) {
1491      s->probability = .5;
1492      return 0;
1493    }
1494
1495    /* return neutral probability for "From" tokens (used for when whitelisting) */
1496    if (!strncmp(term->name, "From*", 5)) {
1497      s->probability = .5;
1498      return 0;
1499    }
1500
1501    /* return neutral probability for control tokens */
1502    if (!strncmp(term->name, "$$CONTROL$$", 11)) {
1503      s->probability = .5;
1504      return 0;
1505    }
1506
1507    weight = _ds_compute_weight(term->name);
1508
1509    if (CTX->flags & DSF_BIAS) {
1510      num = weight * (s->spam_hits - (s->innocent_hits*2));
1511      den = C1 * (s->spam_hits + (s->innocent_hits*2) + C2) * 256;
1512      s->probability = 0.49 + ((double) num / (double) den);
1513    } else {
1514      num = (s->spam_hits - s->innocent_hits) * weight;
1515      den = C1 * (s->spam_hits + s->innocent_hits + C2) * 256;
1516      s->probability = 0.5 + ((double) num / (double) den);
1517    }
1518
1519  /* Graham and Robinson Start Here */
1520
1521  } else {
1522    int ih = 1;
1523    if (CTX->flags & DSF_BIAS)
1524      ih = 2;
1525
1526    if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1527    {
1528      if (token_type == DTT_BNR) {
1529        s->probability =
1530          (s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) /
1531          ((s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) +
1532           (s->innocent_hits * 1.0 / bnr_tot->innocent_hits * 1.0));
1533      } else {
1534        s->probability =
1535          (s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) /
1536          ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) +
1537           (s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0));
1538      }
1539    }
1540
1541    if (s->spam_hits == 0 && s->innocent_hits > 0) {
1542      s->probability = 0.01;
1543      if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1544      {
1545        if ((1.0 / CTX->totals.spam_learned * 1.0) /
1546           ((1.0 / CTX->totals.spam_learned * 1.0) +
1547           (s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0))
1548          < 0.01)
1549        {
1550          s->probability = (1.0 / CTX->totals.spam_learned * 1.0) /
1551           ((1.0 / CTX->totals.spam_learned * 1.0) +
1552            (s->innocent_hits * ih *1.0 / CTX->totals.innocent_learned * 1.0));
1553        }
1554      }
1555    }
1556    else if (s->spam_hits > 0 && s->innocent_hits == 0) {
1557      s->probability = 0.99;
1558      if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1559      {
1560        if ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) /
1561           ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) +
1562           (ih * 1.0 / CTX->totals.innocent_learned * 1.0))
1563          > 0.99)
1564        {
1565          s->probability = (s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0)
1566           / ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0)
1567           + (ih * 1.0 / CTX->totals.innocent_learned * 1.0));
1568        }
1569      }
1570    }
1571
1572    if (  (CTX->flags & DSF_BIAS &&
1573          (s->spam_hits + (2 * s->innocent_hits) < min_hits))
1574       || (!(CTX->flags & DSF_BIAS) &&
1575          (s->spam_hits + s->innocent_hits < min_hits)))
1576    {
1577      s->probability = (CTX->algorithms & DSP_MARKOV) ? .5000 : .4;
1578    }
1579  }
1580
1581  if (s->probability < 0.0001)
1582    s->probability = 0.0001;
1583
1584  if (s->probability > 0.9999)
1585    s->probability = 0.9999;
1586
1587  /* Finish off Robinson */
1588
1589  if (token_type != DTT_BNR && CTX->algorithms & DSP_ROBINSON)
1590  {
1591    unsigned long n = s->spam_hits + s->innocent_hits;
1592    double fw = ((CHI_S * CHI_X) + (n * s->probability))/(CHI_S + n);
1593    s->probability = fw;
1594  }
1595
1596  return 0;
1597}
1598
1599/*
1600 *  _ds_calc_result()
1601 *
1602 * DESCRIPTION
1603 *   Perform statistical combination of the token index
1604 *
1605 *    Passed in an index of tokens, this function is responsible for choosing
1606 *    and combining the most relevant characteristics (based on the algorithms
1607 *    configured) and calculating libdspam's decision about the provided
1608 *    message sample.
1609 */
1610
1611int
1612_ds_calc_result(DSPAM_CTX *CTX, ds_heap_t heap_sort, ds_diction_t diction)
1613{
1614  struct _ds_spam_stat stat;
1615  ds_heap_element_t node_heap;
1616  ds_heap_element_t heap_list[heap_sort->items];
1617
1618  /* Naive-Bayesian */
1619  float nbay_top = 0.0;
1620  float nbay_bot = 0.0;
1621  float nbay_result = -1;
1622  long nbay_used = 0;            /* Total tokens used in naive bayes */
1623  struct nt *factor_nbayes = nt_create(NT_PTR);
1624
1625  /* Graham-Bayesian */
1626  float bay_top = 0.0;
1627  float bay_bot = 0.0;
1628  float bay_result = -1;
1629  long bay_used = 0;            /* Total tokens used in bayes */
1630  struct nt *factor_bayes = nt_create(NT_PTR);
1631
1632  /* Burton-Bayesian */
1633  double abay_top = 0.0;
1634  double abay_bot = 0.0;
1635  double abay_result = -1;
1636  long abay_used = 0;           /* Total tokens used in altbayes */
1637  struct nt *factor_altbayes = nt_create(NT_PTR);
1638
1639  /* Robinson's Geometric Mean, used to calculate confidence */
1640  float rob_top = 0.0;                  /* Robinson's Geometric Mean */
1641  float rob_bot = 0.0;
1642  float rob_result = -1;
1643  double p = 0.0, q = 0.0, s = 0.0;     /* Robinson PQS Calculations */
1644  long rob_used = 0;                    /* Total tokens used in Robinson's GM */
1645  struct nt *factor_rob = nt_create(NT_PTR);
1646
1647  /* Fisher-Robinson's Chi-Square */
1648  float chi_result = -1;
1649  long chi_used  = 0, chi_sx = 0, chi_hx = 0;
1650  double chi_s = 1.0, chi_h = 1.0;
1651  struct nt *factor_chi = nt_create(NT_PTR);
1652  unsigned int i;
1653
1654  /* Invert the heap */
1655  node_heap = heap_sort->root;
1656  for(i=0;i<heap_sort->items;i++) {
1657    heap_list[(heap_sort->items-i)-1] = node_heap;
1658    node_heap = node_heap->next;
1659  }
1660
1661  /* BEGIN Combine Token Values */
1662  for(i=0;i<heap_sort->items;i++)
1663  {
1664    char *token_name;
1665    ds_term_t ds_term;
1666
1667    node_heap = heap_list[i];
1668    ds_term = ds_diction_find(diction, node_heap->token);
1669
1670    if (!ds_term)
1671      continue;
1672
1673    /* Skip BNR patterns */
1674    if (ds_term->type == 'B')
1675      continue;
1676
1677    token_name = ds_term->name;
1678
1679    if (ds_diction_getstat(diction, node_heap->token, &stat) || !token_name)
1680      continue;
1681
1682    /* Set the probability if we've provided a classification */
1683    if (CTX->classification == DSR_ISSPAM)
1684      stat.probability = 1.00;
1685    else if (CTX->classification == DSR_ISINNOCENT)
1686      stat.probability = 0.00;
1687
1688    /* Graham-Bayesian */
1689    if (CTX->algorithms & DSA_GRAHAM && bay_used < 15)
1690    {
1691        LOGDEBUG ("[graham] [%2.6f] %s (%dfrq, %lds, %ldi)",
1692                  stat.probability, token_name, ds_term->frequency,
1693                  stat.spam_hits, stat.innocent_hits);
1694
1695      _ds_factor(factor_bayes, token_name, stat.probability);
1696
1697      if (bay_used == 0)
1698      {
1699        bay_top = stat.probability;
1700        bay_bot = 1 - stat.probability;
1701      }
1702      else
1703      {
1704        bay_top *= stat.probability;
1705        bay_bot *= (1 - stat.probability);
1706      }
1707
1708      bay_used++;
1709    }
1710
1711    /* Burton Bayesian */
1712    if (CTX->algorithms & DSA_BURTON && abay_used < BURTON_WINDOW_SIZE)
1713    {
1714        LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)",
1715                  stat.probability, token_name, ds_term->frequency,
1716                  stat.spam_hits, stat.innocent_hits);
1717
1718      _ds_factor(factor_altbayes, token_name, stat.probability);
1719
1720      if (abay_used == 0)
1721      {
1722        abay_top = stat.probability;
1723        abay_bot = (1 - stat.probability);
1724      }
1725      else
1726      {
1727        abay_top *= stat.probability;
1728        abay_bot *= (1 - stat.probability);
1729      }
1730
1731      abay_used++;
1732
1733      if (abay_used < BURTON_WINDOW_SIZE && ds_term->frequency > 1 )
1734      {
1735          LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)",
1736                    stat.probability, token_name, ds_term->frequency,
1737                    stat.spam_hits, stat.innocent_hits);
1738
1739        _ds_factor(factor_altbayes, token_name, stat.probability);
1740
1741        abay_used++;
1742        abay_top *= stat.probability;
1743        abay_bot *= (1 - stat.probability);
1744      }
1745
1746    }
1747
1748    /* Robinson's Geometric Mean Definitions */
1749
1750//#define ROB_S 0.010           /* Sensitivity */
1751//#define ROB_X 0.415           /* Value to use when N = 0 */
1752//#define ROB_CUTOFF    0.54
1753
1754
1755#define ROB_S   0.010           /* Sensitivity */
1756#define ROB_X   0.500           /* Value to use when N = 0 */
1757#define ROB_CUTOFF      0.50
1758
1759
1760    if (rob_used < 25)
1761    {
1762      float probability;
1763      long n = (heap_sort->items > 25) ? 25 : heap_sort->items;
1764
1765      probability = ((ROB_S * ROB_X) + (n * stat.probability)) / (ROB_S + n);
1766
1767#ifdef ROBINSON
1768#ifndef VERBOSE
1769      if (CTX->operating_mode != DSM_CLASSIFY)
1770      {
1771#endif
1772        LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)",
1773                  stat.probability, token_name, ds_term->frequency,
1774                  stat.spam_hits, stat.innocent_hits);
1775#ifndef VERBOSE
1776      }
1777#endif
1778#endif
1779
1780      _ds_factor(factor_rob, token_name, stat.probability);
1781
1782      if (probability < 0.3 || probability > 0.7)
1783      {
1784
1785        if (rob_used == 0)
1786        {
1787          rob_top = probability;
1788          rob_bot = (1 - probability);
1789        }
1790        else
1791        {
1792          rob_top *= probability;
1793          rob_bot *= (1 - probability);
1794        }
1795
1796        rob_used++;
1797
1798        if (rob_used < 25 && ds_term->frequency > 1)
1799        {
1800#ifdef ROBINSON
1801#ifndef VERBOSE
1802          if (CTX->operating_mode != DSM_CLASSIFY)
1803          {
1804#endif
1805            LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)",
1806                      stat.probability, token_name, ds_term->frequency,
1807                      stat.spam_hits, stat.innocent_hits);
1808
1809#ifndef VERBOSE
1810          }
1811#endif
1812#endif
1813
1814          _ds_factor(factor_rob, token_name, stat.probability);
1815
1816          rob_used++;
1817          rob_top *= probability;
1818          rob_bot *= (1 - probability);
1819        }
1820      }
1821    }
1822  }
1823
1824  /* END Combine Token Values */
1825
1826  /* Fisher-Robinson's Inverse Chi-Square */
1827#define CHI_CUTOFF      0.5010  /* Ham/Spam Cutoff */
1828#define CHI_EXCR        0.4500  /* Exclusionary Radius */
1829#define LN2             0.69314718055994530942 /* log e2 */
1830
1831  if (CTX->algorithms & DSA_CHI_SQUARE || CTX->algorithms & DSA_NAIVE)
1832  {
1833    ds_term_t ds_term;
1834    ds_cursor_t ds_c;
1835    double fw;
1836    int n, exp;
1837
1838    ds_c = ds_diction_cursor(diction);
1839    ds_term = ds_diction_next(ds_c);
1840    while(ds_term) {
1841
1842      if (ds_term->key == CONTROL_TOKEN) {
1843        ds_term = ds_diction_next(ds_c);
1844        continue;
1845      }
1846
1847      /* Naive-Bayesian */
1848      if (CTX->algorithms & DSA_NAIVE)
1849      {
1850          LOGDEBUG ("[naive] [%2.6f] %s (%dfrq, %lds, %ldi)",
1851                    ds_term->s.probability, ds_term->name, ds_term->frequency,
1852                    ds_term->s.spam_hits, ds_term->s.innocent_hits);
1853
1854        _ds_factor(factor_nbayes, ds_term->name, stat.probability);
1855
1856        if (nbay_used == 0)
1857        {
1858          nbay_top = stat.probability;
1859          nbay_bot = 1 - stat.probability;
1860        }
1861        else
1862        {
1863          nbay_top *= stat.probability;
1864          nbay_bot *= (1 - stat.probability);
1865        }
1866
1867        nbay_used++;
1868      }
1869
1870      if (CTX->algorithms & DSA_CHI_SQUARE) {
1871
1872        /* Skip BNR Tokens */
1873        if (ds_term->type == 'B')
1874          goto CHI_NEXT;
1875
1876        /* Convert the p-value */
1877
1878      if (CTX->algorithms & DSP_ROBINSON) {
1879          fw = ds_term->s.probability;
1880        } else {
1881          n = ds_term->s.spam_hits + ds_term->s.innocent_hits;
1882          fw = ((CHI_S * CHI_X) + (n * ds_term->s.probability))/(CHI_S + n);
1883        }
1884
1885        if (fabs(0.5-fw)>CHI_EXCR) {
1886          int iter = 1;
1887
1888          while(iter>0) {
1889            iter --;
1890
1891#ifndef VERBOSE
1892            if (CTX->operating_mode != DSM_CLASSIFY)
1893            {
1894#endif
1895              LOGDEBUG ("[chi-sq] [%2.6f] %s (%dfrq, %lds, %ldi)",
1896                        fw, ds_term->name, ds_term->frequency,
1897                        ds_term->s.spam_hits, ds_term->s.innocent_hits);
1898#ifndef VERBOSE
1899            }
1900#endif
1901
1902            _ds_factor(factor_chi, ds_term->name, ds_term->s.probability);
1903
1904            chi_used++;
1905            chi_s *= (1.0 - fw);
1906            chi_h *= fw;
1907            if (chi_s < 1e-200) {
1908              chi_s = frexp(chi_s, &exp);
1909              chi_sx += exp;
1910            }
1911            if (chi_h < 1e-200) {
1912              chi_h = frexp(chi_h, &exp);
1913              chi_hx += exp;
1914            }
1915          }
1916        }
1917      }
1918
1919CHI_NEXT:
1920      ds_term = ds_diction_next(ds_c);
1921    }
1922    ds_diction_close(ds_c);
1923  }
1924
1925  /* BEGIN Calculate Individual Probabilities */
1926
1927  if (CTX->algorithms & DSA_NAIVE) {
1928    nbay_result = (nbay_top) / (nbay_top + nbay_bot);
1929    LOGDEBUG ("Naive-Bayesian Probability: %f Samples: %ld", nbay_result,
1930              nbay_used);
1931  }
1932
1933  if (CTX->algorithms & DSA_GRAHAM) {
1934    bay_result = (bay_top) / (bay_top + bay_bot);
1935    LOGDEBUG ("Graham-Bayesian Probability: %f Samples: %ld", bay_result,
1936              bay_used);
1937  }
1938
1939  if (CTX->algorithms & DSA_BURTON) {
1940    abay_result = (abay_top) / (abay_top + abay_bot);
1941    LOGDEBUG ("Burton-Bayesian Probability: %f Samples: %ld", abay_result,
1942              abay_used);
1943  }
1944
1945  /* Robinson's */
1946  if (rob_used == 0)
1947  {
1948    p = q = s = 0;
1949  }
1950  else
1951  {
1952    p = 1.0 - pow (rob_bot, 1.0 / rob_used);
1953    q = 1.0 - pow (rob_top, 1.0 / rob_used);
1954    s = (p - q) / (p + q);
1955    s = (s + 1.0) / 2.0;
1956  }
1957
1958  rob_result = s;
1959
1960  if (CTX->algorithms & DSA_ROBINSON) {
1961    LOGDEBUG("Robinson's Geometric Confidence: %f (Spamminess: %f, "
1962      "Non-Spamminess: %f, Samples: %ld)", rob_result, p, q, rob_used);
1963  }
1964
1965  if (CTX->algorithms & DSA_CHI_SQUARE) {
1966    chi_s = log(chi_s) + chi_sx * LN2;
1967    chi_h = log(chi_h) + chi_hx * LN2;
1968
1969    if (chi_used) {
1970      chi_s = 1.0 - chi2Q(-2.0 * chi_s, 2 * chi_used);
1971      chi_h = 1.0 - chi2Q(-2.0 * chi_h, 2 * chi_used);
1972
1973      chi_result = ((chi_s-chi_h)+1.0) / 2.0;
1974    } else {
1975      chi_result = (float)(CHI_CUTOFF-0.1);
1976    }
1977
1978    LOGDEBUG("Chi-Square Confidence: %f", chi_result);
1979  }
1980
1981/* END Calculate Individual Probabilities */
1982
1983/* BEGIN Determine Result */
1984
1985  if (CTX->classification == DSR_ISSPAM) {
1986    CTX->result = DSR_ISSPAM;
1987    CTX->probability = 1.0;
1988  } else if (CTX->classification == DSR_ISINNOCENT) {
1989    CTX->result = DSR_ISINNOCENT;
1990    CTX->probability = 0.0;
1991  } else {
1992    struct nt *factor = NULL;
1993
1994    if (CTX->algorithms & DSA_NAIVE) {
1995      factor = factor_nbayes;
1996      if (((CTX->algorithms & DSP_MARKOV) && nbay_result > 0.5000) ||
1997          (!(CTX->algorithms & DSP_MARKOV) && nbay_result >= 0.9))
1998      {
1999        CTX->result = DSR_ISSPAM;
2000        CTX->probability = nbay_result;
2001        CTX->factors = factor;
2002        LOGDEBUG("using Naive-Bayes factors");
2003      }
2004    }
2005
2006    if (CTX->algorithms & DSA_GRAHAM) {
2007      factor = factor_bayes;
2008      if (((CTX->algorithms & DSP_MARKOV) && bay_result > 0.5000) ||
2009          (!(CTX->algorithms & DSP_MARKOV) && bay_result >= 0.9))
2010      {
2011        CTX->result = DSR_ISSPAM;
2012        CTX->probability = bay_result;
2013        CTX->factors = factor;
2014        LOGDEBUG("using Graham factors");
2015      }
2016    }
2017
2018    if (CTX->algorithms & DSA_BURTON) {
2019      factor = factor_altbayes;
2020      if (((CTX->algorithms & DSP_MARKOV) && abay_result > 0.5000) ||
2021          (!(CTX->algorithms & DSP_MARKOV) && abay_result >= 0.9))
2022      {
2023        CTX->result = DSR_ISSPAM;
2024        CTX->probability = abay_result;
2025        if (!CTX->factors) {
2026          CTX->factors = factor;
2027          LOGDEBUG("using Burton factors");
2028        }
2029      }
2030    }
2031
2032    if (CTX->algorithms & DSA_ROBINSON) {
2033      factor = factor_rob;
2034      if (((CTX->algorithms & DSP_MARKOV) && rob_result > 0.5000) ||
2035          (!(CTX->algorithms & DSP_MARKOV) && rob_result >= ROB_CUTOFF))
2036      {
2037        CTX->result = DSR_ISSPAM;
2038        if (CTX->probability < 0)
2039          CTX->probability = rob_result;
2040        if (!CTX->factors) {
2041          CTX->factors = factor;
2042          LOGDEBUG("using Robinson-Geom factors");
2043        }
2044      }
2045    }
2046
2047    if (CTX->algorithms & DSA_CHI_SQUARE) {
2048     factor = factor_chi;
2049     if (((CTX->algorithms & DSP_MARKOV) && chi_result > 0.5000) ||
2050         (!(CTX->algorithms & DSP_MARKOV) && chi_result >= CHI_CUTOFF))
2051     {
2052       CTX->result = DSR_ISSPAM;
2053       if (CTX->probability < 0)
2054         CTX->probability = chi_result;
2055       if (!CTX->factors) {
2056         CTX->factors = factor;
2057         LOGDEBUG("using Chi-Square factors");
2058       }
2059      }
2060    }
2061
2062    if (!CTX->factors) {
2063      CTX->factors = factor;
2064      LOGDEBUG("no factors specified; using default");
2065    }
2066  }
2067
2068  if (CTX->factors != factor_nbayes)
2069    _ds_factor_destroy(factor_nbayes);
2070  if (CTX->factors != factor_bayes)
2071    _ds_factor_destroy(factor_bayes);
2072  if (CTX->factors != factor_altbayes)
2073    _ds_factor_destroy(factor_altbayes);
2074  if (CTX->factors != factor_rob)
2075    _ds_factor_destroy(factor_rob);
2076  if (CTX->factors != factor_chi)
2077    _ds_factor_destroy(factor_chi);
2078
2079  /* If somehow we haven't yet assigned a probability, assign one */
2080  if (CTX->probability == DSP_UNCALCULATED)
2081  {
2082    if (CTX->algorithms & DSA_GRAHAM)
2083      CTX->probability = bay_result;
2084
2085    if (CTX->algorithms & DSA_NAIVE)
2086      CTX->probability = nbay_result;
2087
2088    if (CTX->probability < 0 && CTX->algorithms & DSA_BURTON)
2089      CTX->probability = abay_result;
2090
2091    if (CTX->probability < 0 && CTX->algorithms & DSA_ROBINSON)
2092      CTX->probability = rob_result;
2093
2094    if (CTX->probability < 0 && CTX->algorithms & DSA_CHI_SQUARE)
2095      CTX->probability = chi_result;
2096  }
2097
2098#ifdef VERBOSE
2099  if (DO_DEBUG && (!(CTX->algorithms & DSP_MARKOV))) {
2100    if (abay_result >= 0.9 && bay_result < 0.9)
2101    {
2102      LOGDEBUG ("CATCH: Burton Bayesian");
2103    }
2104    else if (abay_result < 0.9 && bay_result >= 0.9)
2105    {
2106      LOGDEBUG ("MISS: Burton Bayesian");
2107    }
2108
2109    if (rob_result >= ROB_CUTOFF && bay_result < 0.9)
2110    {
2111      LOGDEBUG ("CATCH: Robinson's");
2112    }
2113    else if (rob_result < ROB_CUTOFF && bay_result >= 0.9)
2114    {
2115      LOGDEBUG ("MISS: Robinson's");
2116    }
2117
2118    if (chi_result >= CHI_CUTOFF && bay_result < 0.9)
2119    {
2120      LOGDEBUG("CATCH: Chi-Square");
2121    }
2122    else if (chi_result < CHI_CUTOFF && bay_result >= 0.9)
2123    {
2124      LOGDEBUG("MISS: Chi-Square");
2125    }
2126  }
2127#endif
2128
2129  /* Calculate Confidence */
2130
2131  if (CTX->algorithms & DSP_MARKOV) {
2132    if (CTX->result == DSR_ISSPAM)
2133    {
2134      CTX->confidence = CTX->probability;
2135    }
2136    else
2137    {
2138      CTX->confidence = 1.0 - CTX->probability;
2139    }
2140  } else {
2141    if (CTX->result == DSR_ISSPAM)
2142    {
2143      CTX->confidence = rob_result;
2144    }
2145    else
2146    {
2147      CTX->confidence = 1.0 - rob_result;
2148    }
2149  }
2150
2151  LOGDEBUG("Result Confidence: %1.2f", CTX->confidence);
2152  return CTX->result;
2153}
2154
2155/*
2156 *  _ds_factor()
2157 *
2158 * DESCRIPTION
2159 *   Factors a token/value into a set
2160 *
2161 *    Adds a token/value pair to a factor set. The factor set of the dominant
2162 *    calculation  is provided to the  client in order to explain  libdspam's
2163 *    final decision about the message's classification.
2164 */
2165 
2166int _ds_factor(struct nt *set, char *token_name, float value) {
2167  struct dspam_factor *f;
2168  f = calloc(1, sizeof(struct dspam_factor));
2169  if (!f)
2170    return EUNKNOWN;
2171  f->token_name = strdup(token_name);
2172  f->value = value;
2173  nt_add(set, (void *) f);
2174  return 0;
2175}
2176
2177/*
2178 *  _ds_factor_destroy - destroy a factor tree
2179 *
2180 */
2181
2182void _ds_factor_destroy(struct nt *factors) {
2183  struct dspam_factor *f;
2184  struct nt_node *node;
2185  struct nt_c c;
2186
2187  if (factors == NULL)
2188        return;
2189 
2190  node = c_nt_first(factors, &c);
2191  while(node != NULL) {
2192    f = (struct dspam_factor *) node->ptr;
2193    if (f)
2194        free(f->token_name);
2195    node = c_nt_next(factors, &c);
2196  }
2197  nt_destroy(factors);
2198
2199  return;
2200}
2201
2202int libdspam_init(const char *driver) {
2203
2204#ifndef STATIC_DRIVER
2205  if (driver == NULL) {
2206      LOG(LOG_CRIT, "dlopen() failed: Can not load NULL driver");
2207      return EFAILURE;
2208  } else if (driver) {
2209    if ((_drv_handle = dlopen(driver, RTLD_NOW))==NULL) {
2210      LOG(LOG_CRIT, "dlopen() failed: %s: %s", driver, dlerror());
2211      return EFAILURE;
2212    }
2213  }
2214#endif
2215
2216  return 0;
2217}
2218
2219int libdspam_shutdown(void) {
2220
2221#ifndef STATIC_DRIVER
2222  if (_drv_handle) {
2223    int r;
2224    if ((r=dlclose(_drv_handle))) {
2225      LOG(LOG_CRIT, "dlclose() failed: %s", dlerror());
2226      return r;
2227    }
2228  }
2229#endif
2230
2231  return 0;
2232}
2233
2234int _ds_instantiate_bnr(
2235  DSPAM_CTX *CTX,
2236  ds_diction_t patterns,
2237  struct nt *stream,
2238  char identifier)
2239{
2240  float previous_bnr_probs[BNR_SIZE];
2241  ds_term_t ds_term, ds_touch;
2242  struct nt_node *node_nt;
2243  struct nt_c c_nt;
2244  unsigned long long crc;
2245  char bnr_token[64];
2246  int i;
2247
2248  for(i=0;i<BNR_SIZE;i++)
2249    previous_bnr_probs[i] = 0.00000;
2250
2251  node_nt = c_nt_first(stream, &c_nt);
2252  while(node_nt != NULL) {
2253    ds_term = node_nt->ptr;
2254
2255    _ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL);
2256
2257    for(i=0;i<BNR_SIZE-1;i++)
2258      previous_bnr_probs[i] = previous_bnr_probs[i+1];
2259
2260    previous_bnr_probs[BNR_SIZE-1] = _ds_round(ds_term->s.probability);
2261    sprintf(bnr_token, "bnr.%c|", identifier);
2262    for(i=0;i<BNR_SIZE;i++) {
2263      char x[6];
2264      snprintf(x, 6, "%01.2f_", previous_bnr_probs[i]);
2265      strlcat(bnr_token, x, sizeof(bnr_token));
2266    }
2267
2268    crc = _ds_getcrc64 (bnr_token);
2269#ifdef VERBOSE
2270    LOGDEBUG ("BNR pattern instantiated: '%s'", bnr_token);
2271#endif
2272    ds_touch = ds_diction_touch(patterns, crc, bnr_token, 0);
2273    ds_touch->type = 'B';
2274    node_nt = c_nt_next(stream, &c_nt);
2275  }
2276  return 0;
2277}
2278
2279ds_diction_t _ds_apply_bnr (DSPAM_CTX *CTX, ds_diction_t diction) {
2280
2281  /*
2282     Bayesian Noise Reduction - Contextual Symmetry Logic
2283     http://bnr.nuclearelephant.com
2284  */
2285
2286  ds_diction_t bnr_patterns = ds_diction_create(3079);
2287  struct _ds_spam_stat bnr_tot;
2288  unsigned long long crc;
2289  BNR_CTX *BTX_S, *BTX_C;
2290  struct nt_node *node_nt;
2291  struct nt_c c_nt;
2292  ds_term_t ds_term, ds_touch;
2293  ds_cursor_t ds_c;
2294
2295  if (!bnr_patterns)
2296  {
2297    LOG (LOG_CRIT, ERR_MEM_ALLOC);
2298    return NULL;
2299  }
2300
2301  BTX_S = bnr_init(BNR_INDEX, 's');
2302  BTX_C = bnr_init(BNR_INDEX, 'c');
2303
2304  if (!BTX_S || !BTX_C) {
2305    LOGDEBUG("bnr_init() failed");
2306    bnr_destroy(BTX_S);
2307    bnr_destroy(BTX_C);
2308    ds_diction_destroy(bnr_patterns);
2309    return NULL;
2310  }
2311
2312  BTX_S->window_size = BNR_SIZE;
2313  BTX_C->window_size = BNR_SIZE;
2314
2315  _ds_instantiate_bnr(CTX, bnr_patterns, diction->order, 's');
2316  _ds_instantiate_bnr(CTX, bnr_patterns, diction->chained_order, 'c');
2317
2318  /* Add BNR totals to the list of load elements */
2319  memset(&bnr_tot, 0, sizeof(struct _ds_spam_stat));
2320  crc = _ds_getcrc64("bnr.t|");
2321  ds_touch = ds_diction_touch(bnr_patterns, crc, "bnr.t|", 0);
2322  ds_touch->type = 'B';
2323
2324  /* Load BNR patterns */
2325  LOGDEBUG("Loading %ld BNR patterns", bnr_patterns->items);
2326  if (_ds_getall_spamrecords (CTX, bnr_patterns)) {
2327    LOGDEBUG ("_ds_getall_spamrecords() failed");
2328    ds_diction_destroy(bnr_patterns);
2329    return NULL;
2330  }
2331
2332  /* Perform BNR Processing */
2333
2334  if (CTX->classification == DSR_NONE   &&
2335      CTX->_sig_provided == 0           &&
2336      CTX->totals.innocent_learned + CTX->totals.innocent_classified > 2500)
2337  {
2338    int elim;
2339#ifdef LIBBNR_DEBUG
2340    char fn[MAX_FILENAME_LENGTH];
2341    FILE *file;
2342#endif
2343
2344    node_nt = c_nt_first(diction->order, &c_nt);
2345    while(node_nt != NULL) {
2346      ds_term = node_nt->ptr;
2347      bnr_add(BTX_S, ds_term->name, ds_term->s.probability);
2348      node_nt = c_nt_next(diction->order, &c_nt);
2349    }
2350
2351    node_nt = c_nt_first(diction->chained_order, &c_nt);
2352    while(node_nt != NULL) {
2353      ds_term = node_nt->ptr;
2354      bnr_add(BTX_C, ds_term->name, ds_term->s.probability);
2355      node_nt = c_nt_next(diction->chained_order, &c_nt);
2356    }
2357
2358    bnr_instantiate(BTX_S);
2359    bnr_instantiate(BTX_C);
2360
2361    /* Calculate pattern p-values */
2362    ds_diction_getstat(bnr_patterns, crc, &bnr_tot);
2363    ds_c = ds_diction_cursor(bnr_patterns);
2364    ds_term = ds_diction_next(ds_c);
2365    while(ds_term) {
2366      _ds_calc_stat(CTX, ds_term, &ds_term->s, DTT_BNR, &bnr_tot);
2367      if (ds_term->name[4] == 's')
2368        bnr_set_pattern(BTX_S, ds_term->name, ds_term->s.probability);
2369      else if (ds_term->name[4] == 'c')
2370        bnr_set_pattern(BTX_C, ds_term->name, ds_term->s.probability);
2371      ds_term = ds_diction_next(ds_c);
2372    }
2373    ds_diction_close(ds_c);
2374
2375    bnr_finalize(BTX_S);
2376    bnr_finalize(BTX_C);
2377
2378    /* Propagate eliminations to DSPAM */
2379
2380    node_nt = c_nt_first(diction->order, &c_nt);
2381    while(node_nt != NULL) {
2382      ds_term = node_nt->ptr;
2383      bnr_get_token(BTX_S, &elim);
2384      if (elim)
2385        ds_term->frequency--;
2386      node_nt = c_nt_next(diction->order, &c_nt);
2387    }
2388
2389    node_nt = c_nt_first(diction->chained_order, &c_nt);
2390    while(node_nt != NULL) {
2391      ds_term = node_nt->ptr;
2392      bnr_get_token(BTX_C, &elim);
2393      if (elim)
2394        ds_term->frequency--;
2395      node_nt = c_nt_next(diction->chained_order, &c_nt);
2396    }
2397
2398#ifdef LIBBNR_DEBUG
2399    float snr;
2400    if (BTX_S->stream->items + BTX_C->stream->items +
2401        BTX_S->eliminations  + BTX_C->eliminations > 0)
2402    {
2403      snr = 100.0*((BTX_S->eliminations + BTX_C->eliminations + 0.0)/
2404            (BTX_S->stream->items + BTX_C->stream->items +
2405             BTX_S->eliminations  + BTX_C->eliminations));
2406    } else {
2407      snr = 0;
2408    }
2409
2410    LOGDEBUG("bnr reported snr of %02.3f", snr);
2411
2412#ifdef LIBBNR_GRAPH_OUTPUT
2413    printf("BEFORE\n\n");
2414    node_nt = c_nt_first(diction->order, &c_nt);
2415    while(node_nt != NULL) {
2416      ds_term = node_nt->ptr;
2417      printf("%1.5f\n", ds_term->s.probability);
2418      node_nt = c_nt_next(diction->order, &c_nt);
2419    }
2420
2421    printf("\n\nAFTER\n\n");
2422    node_nt = c_nt_first(diction->order, &c_nt);
2423    while(node_nt != NULL) {
2424      ds_term = node_nt->ptr;
2425      if (ds_term->frequency > 0)
2426        printf("%1.5f\n", ds_term->s.probability);
2427      node_nt = c_nt_next(diction->order, &c_nt);
2428    }
2429    printf("\n");
2430#endif
2431         
2432
2433    snprintf(fn, sizeof(fn), "%s/bnr.log", LOGDIR);
2434    file = fopen(fn, "a");
2435    if (file != NULL) {
2436      fprintf(file, "-- BNR Filter Process Results --\n");
2437      fprintf(file, "Eliminations:\n");
2438      node_nt = c_nt_first(diction->order, &c_nt);
2439      while(node_nt != NULL) {
2440        ds_term = node_nt->ptr;
2441        if (ds_term->frequency <= 0)
2442          fprintf(file, "%s ", ds_term->name);
2443        node_nt = c_nt_next(diction->order, &c_nt);
2444      }
2445      fprintf(file, "\n[");
2446      node_nt = c_nt_first(diction->order, &c_nt);
2447      while(node_nt != NULL) {
2448        ds_term = node_nt->ptr;
2449        if (ds_term->frequency <= 0)
2450          fprintf(file, "%1.2f ", ds_term->s.probability);
2451        node_nt = c_nt_next(diction->order, &c_nt);
2452      }
2453
2454      fprintf(file, "]\n\nRemaining:\n");
2455      node_nt = c_nt_first(diction->order, &c_nt);
2456      while(node_nt != NULL) {
2457        ds_term = node_nt->ptr;
2458        if (ds_term->frequency > 0)
2459          fprintf(file, "%s ", ds_term->name);
2460        node_nt = c_nt_next(diction->order, &c_nt);
2461      }
2462      fprintf(file, "\n[");
2463      node_nt = c_nt_first(diction->order, &c_nt);
2464      while(node_nt != NULL) {
2465        ds_term = node_nt->ptr;
2466        if (ds_term->frequency > 0)
2467          fprintf(file, "%1.2f ", ds_term->s.probability);
2468        node_nt = c_nt_next(diction->order, &c_nt);
2469      }
2470
2471      fprintf(file, "]\nProcessed for: %s\n\n", CTX->username);
2472
2473      fprintf(file, "-- Chained Tokens --\n");
2474      fprintf(file, "Eliminations:\n");
2475      node_nt = c_nt_first(diction->chained_order, &c_nt);
2476      while(node_nt != NULL) {
2477        ds_term = node_nt->ptr;
2478        if (ds_term->frequency <= 0)
2479          fprintf(file, "%s ", ds_term->name);
2480        node_nt = c_nt_next(diction->chained_order, &c_nt);
2481      }
2482      fprintf(file, "\n[");
2483      node_nt = c_nt_first(diction->chained_order, &c_nt);
2484      while(node_nt != NULL) {
2485        ds_term = node_nt->ptr;
2486        if (ds_term->frequency <= 0)
2487          fprintf(file, "%1.2f ", ds_term->s.probability);
2488        node_nt = c_nt_next(diction->chained_order, &c_nt);
2489      }
2490
2491      fprintf(file, "]\n\nRemaining:\n");
2492      node_nt = c_nt_first(diction->chained_order, &c_nt);
2493      while(node_nt != NULL) {
2494        ds_term = node_nt->ptr;
2495        if (ds_term->frequency > 0)
2496          fprintf(file, "%s ", ds_term->name);
2497        node_nt = c_nt_next(diction->chained_order, &c_nt);
2498      }
2499      fprintf(file, "\n[");
2500      node_nt = c_nt_first(diction->chained_order, &c_nt);
2501      while(node_nt != NULL) {
2502        ds_term = node_nt->ptr;
2503        if (ds_term->frequency > 0)
2504          fprintf(file, "%1.2f ", ds_term->s.probability);
2505        node_nt = c_nt_next(diction->chained_order, &c_nt);
2506      }
2507
2508
2509      fprintf(file, "]\nProcessed for: %s\n\n", CTX->username);
2510      fclose(file);
2511    }
2512#endif
2513
2514  }
2515
2516  bnr_destroy(BTX_S);
2517  bnr_destroy(BTX_C);
2518
2519  /* Add BNR pattern to token hash */
2520  if (CTX->totals.innocent_learned + CTX->totals.innocent_classified > 1000) {
2521    ds_c = ds_diction_cursor(bnr_patterns);
2522    ds_term = ds_diction_next(ds_c);
2523    while(ds_term) {
2524      ds_term_t t = ds_diction_touch(diction, ds_term->key, ds_term->name, 0);
2525      t->type = 'B';
2526      ds_diction_setstat(diction, ds_term->key, &ds_term->s);
2527      if (t)
2528        t->frequency = 1;
2529 
2530#ifdef LIBBNR_DEBUG
2531      if (fabs(0.5-ds_term->s.probability)>0.25) {
2532        LOGDEBUG("Interesting BNR Pattern: %s %01.5f %lds %ldi",
2533                 ds_term->name,
2534                 ds_term->s.probability,
2535                 ds_term->s.spam_hits,
2536                 ds_term->s.innocent_hits);
2537      }
2538#endif
2539 
2540      ds_term = ds_diction_next(ds_c);
2541    }
2542    ds_diction_close(ds_c);
2543  }
2544
2545  return bnr_patterns;
2546}
2547
2548int _ds_increment_tokens(DSPAM_CTX *CTX, ds_diction_t diction) {
2549  ds_cursor_t ds_c;
2550  ds_term_t ds_term;
2551  int i = 0;
2552  int occurrence = _ds_match_attribute(CTX->config->attributes,
2553     "ProcessorWordFrequency", "occurrence");
2554
2555  ds_c = ds_diction_cursor(diction);
2556  ds_term = ds_diction_next(ds_c);
2557  while(ds_term) {
2558    unsigned long long crc;
2559
2560    crc = ds_term->key;
2561
2562    /* Create a signature if we're processing a message */
2563
2564    if (CTX->tokenizer != DSZ_SBPH
2565      && CTX->flags & DSF_SIGNATURE
2566      && (CTX->operating_mode != DSM_CLASSIFY || !(CTX->_sig_provided)))
2567    {
2568      struct _ds_signature_token t;
2569
2570      memset(&t, 0, sizeof(t));
2571      t.token = crc;
2572      t.frequency = ds_term->frequency;
2573      memcpy ((char *) CTX->signature->data +
2574              (i * sizeof (struct _ds_signature_token)), &t,
2575              sizeof (struct _ds_signature_token));
2576    }
2577
2578    /* If classification was provided, force probabilities */
2579    if (CTX->classification == DSR_ISSPAM)
2580      ds_term->s.probability = 1.00;
2581    else if (CTX->classification == DSR_ISINNOCENT)
2582      ds_term->s.probability = 0.00;
2583
2584    if (ds_term->type == 'D' &&
2585        ( CTX->training_mode != DST_TUM  ||
2586          CTX->source == DSS_ERROR       ||
2587          CTX->source == DSS_INOCULATION ||
2588          ds_term->s.spam_hits + ds_term->s.innocent_hits < 50 ||
2589          ds_term->key == diction->whitelist_token             ||
2590          CTX->confidence < 0.70))
2591    {
2592        ds_term->s.status |= TST_DIRTY;
2593    }
2594
2595    if (ds_term->type == 'B' &&
2596        CTX->totals.innocent_learned + CTX->totals.innocent_classified > 500 &&
2597        CTX->flags & DSF_NOISE &&
2598        CTX->_sig_provided == 0)
2599    {
2600        ds_term->s.status |= TST_DIRTY;
2601    }
2602
2603    /* SPAM */
2604    if (CTX->result == DSR_ISSPAM)
2605    {
2606      /* Inoculations increase token count considerably */
2607      if (CTX->source == DSS_INOCULATION)
2608      {
2609        if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5)
2610          ds_term->s.spam_hits += 5;
2611        else
2612          ds_term->s.spam_hits += 2;
2613      }
2614
2615      /* Standard increase */
2616      else
2617      {
2618        if (CTX->flags & DSF_UNLEARN) {
2619          if (CTX->classification == DSR_ISSPAM)
2620          {
2621            if (occurrence)
2622            {
2623              ds_term->s.spam_hits -= ds_term->frequency;
2624              if (ds_term->s.spam_hits < 0)
2625                ds_term->s.spam_hits = 0;
2626            } else {
2627              ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
2628            }
2629          }
2630        } else {
2631          if (occurrence)
2632          {
2633            ds_term->s.spam_hits += ds_term->frequency;
2634          } else {
2635            ds_term->s.spam_hits++;
2636          }
2637        }
2638      }
2639
2640      if (SPAM_MISS(CTX) &&
2641          !(CTX->flags & DSF_UNLEARN) &&
2642          CTX->training_mode != DST_TOE &&
2643          CTX->training_mode != DST_NOTRAIN)
2644      {
2645        if (occurrence)
2646        {
2647          ds_term->s.innocent_hits -= ds_term->frequency;
2648          if (ds_term->s.innocent_hits < 0)
2649            ds_term->s.innocent_hits = 0;
2650        } else {
2651          ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
2652        }
2653      }
2654    }
2655
2656    /* INNOCENT */
2657    else
2658    {
2659      if (CTX->flags & DSF_UNLEARN) {
2660        if (CTX->classification == DSR_ISINNOCENT)
2661        {
2662          if (occurrence)
2663          {
2664            ds_term->s.innocent_hits -= ds_term->frequency;
2665            if (ds_term->s.innocent_hits < 0)
2666              ds_term->s.innocent_hits = 0;
2667          } else {
2668            ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
2669          }
2670        }
2671      } else {
2672        if (occurrence)
2673        {
2674          ds_term->s.innocent_hits += ds_term->frequency;
2675        } else {
2676          ds_term->s.innocent_hits++;
2677        }
2678      }
2679
2680      if (FALSE_POSITIVE(CTX)         &&
2681          !(CTX->flags & DSF_UNLEARN) &&
2682          CTX->training_mode != DST_TOE &&
2683          CTX->training_mode != DST_NOTRAIN)
2684      {
2685
2686        if (occurrence)
2687        {
2688          ds_term->s.spam_hits -= ds_term->frequency;
2689          if (ds_term->s.spam_hits < 0)
2690            ds_term->s.spam_hits = 0;
2691        } else {
2692          ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
2693        }
2694
2695      }
2696    }
2697
2698    ds_term = ds_diction_next(ds_c);
2699    i++;
2700  }
2701  ds_diction_close(ds_c);
2702  return 0;
2703}
Note: See TracBrowser for help on using the repository browser.