/* $Id: libdspam.c,v 1.205 2011/07/13 00:51:46 sbajic Exp $ */ /* DSPAM COPYRIGHT (C) 2002-2012 DSPAM PROJECT This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ /* * libdspam.c - DSPAM core analytical engine * * DESCRIPTION * libdspam is at the core of the decision making process and is called * by the agent to perform all tasks related to message classification. * The libdspam API functions are documented in libdspam(1). */ #ifndef STATIC_DRIVER void *_drv_handle; #endif #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #ifdef HAVE_UNISTD_H #include #endif #include #include #include #include #ifdef TIME_WITH_SYS_TIME # include # include #else # ifdef HAVE_SYS_TIME_H # include # else # include # endif #endif #include "config.h" #include "libdspam_objects.h" #include "libdspam.h" #include "nodetree.h" #include "config.h" #include "base64.h" #include "bnr.h" #include "util.h" #include "storage_driver.h" #include "buffer.h" #include "heap.h" #include "error.h" #include "decode.h" #include "language.h" #define CHI_S 0.1 /* Chi-Sq Strength */ #define CHI_X 0.5000 /* Chi-Sq Assumed Probability */ #define C1 16 /* Markov C1 */ #define C2 1 /* Markov C2 */ #ifdef DEBUG int DO_DEBUG = 0; #endif /* * dspam_init() * * DESCRIPTION * The dspam_init() function creates and initializes a new classification * context and attaches the context to whatever backend storage facility * was configured. 
The user and group arguments provided are used to read * and write information stored for the user and group specified. The home * argument is used to configure libdspam's storage around the base direc- * tory specified. The mode specifies the operating mode to initialize the * classification context with and may be one of: * * DSM_PROCESS Process the message and return a result * DSM_CLASSIFY Classify message only, no learning * DSM_TOOLS No processing, attach to storage only * * The flags provided further tune the classification context for a spe- * cific function. Multiple flags may be OR'd together. * * DSF_SIGNATURE A binary signature is requested/provided * DSF_NOISE Apply Bayesian Noise Reduction logic * DSF_WHITELIST Use automatic whitelisting logic * DSF_MERGED Merge group metadata with user's in memory * * RETURN VALUES * Upon successful completion, dspam_init() will return a pointer to a new * classification context structure containing a copy of the configuration * passed into dspam_init(), a connected storage driver handle, and a set * of preliminary user control data read from storage. */ DSPAM_CTX * dspam_init ( const char *username, const char *group, const char *home, int operating_mode, u_int32_t flags) { DSPAM_CTX *CTX = dspam_create(username, group, home, operating_mode, flags); if (CTX == NULL) return NULL; if (!dspam_attach(CTX, NULL)) return CTX; dspam_destroy(CTX); return NULL; } /* dspam_create() * * DESCRIPTION * The dspam_create() function performs in exactly the same manner as the * dspam_init() function, but does not attach to storage. Instead, the * caller must also call dspam_attach() after setting any storage- spe- * cific attributes using dspam_addattribute(). This is useful for cases * where the implementor would prefer to configure storage internally * rather than having libdspam read a configuration from a file. 
* * RETURN VALUES * Upon successful completion, dspam_create() will return a pointer to a new * classification context structure containing a copy of the configuration * passed into dspam_create(). At this point, dspam_attach() must be called * for further processing. */ DSPAM_CTX * dspam_create ( const char *username, const char *group, const char *home, int operating_mode, u_int32_t flags) { DSPAM_CTX *CTX; CTX = calloc (1, sizeof (DSPAM_CTX)); if (CTX == NULL) { LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context"); return NULL; } CTX->config = calloc(1, sizeof(struct _ds_config)); if (CTX->config == NULL) { LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context configuration"); LOG(LOG_CRIT, ERR_MEM_ALLOC); goto bail; } CTX->config->size = 128; CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size)); if (CTX->config->attributes == NULL) { LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context attributes"); LOG(LOG_CRIT, ERR_MEM_ALLOC); goto bail; } if (home != NULL && home[0] != 0) CTX->home = strdup (home); else { #ifdef DSPAM_HOME CTX->home = strdup(DSPAM_HOME); #else CTX->home = NULL; #endif } if (username != NULL && username[0] != 0) CTX->username = strdup (username); else CTX->username = NULL; if (group != NULL && group[0] != 0) CTX->group = strdup (group); else CTX->group = NULL; CTX->probability = DSP_UNCALCULATED; CTX->operating_mode = operating_mode; CTX->flags = flags; CTX->message = NULL; CTX->confidence = 0; CTX->training_mode = DST_TEFT; CTX->wh_threshold = 10; CTX->training_buffer = 0; CTX->classification = DSR_NONE; CTX->source = DSS_NONE; CTX->_sig_provided = 0; CTX->factors = NULL; CTX->algorithms = 0; CTX->tokenizer = DSZ_WORD; return CTX; bail: if (CTX != NULL) { if (CTX->config != NULL) { if (CTX->config->attributes != NULL) _ds_destroy_config(CTX->config->attributes); free(CTX->config); } if (CTX->username != NULL) 
free(CTX->username); if (CTX->group != NULL) free(CTX->group); if (CTX->home != NULL) free(CTX->home); free(CTX); } return NULL; } /* * dspam_clearattributes() * * DESCRIPTION * The dspam_clearattributes() function is called to clear any attributes * previously set using dspam_addattribute() within the classification * context. It is necessary to call this function prior to replacing any * attributes already written. * * RETURN VALUES * returns 0 on success, standard errors on failure * */ int dspam_clearattributes (DSPAM_CTX * CTX) { if (CTX->config) { _ds_destroy_config(CTX->config->attributes); free(CTX->config); } else { return EFAILURE; } CTX->config = calloc(1, sizeof(struct _ds_config)); if (CTX->config == NULL) goto bail; CTX->config->size = 128; CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size)); if (CTX->config->attributes == NULL) goto bail; return 0; bail: if (CTX->config != NULL) { free(CTX->config); CTX->config = NULL; } LOG(LOG_CRIT, ERR_MEM_ALLOC); return EUNKNOWN; } /* * dspam_addattribute() * * DESCRIPTION * The dspam_addattribute() function is called to set attributes within * the classification context. Some storage drivers support the use of * passing specific attributes such as server connect information. The * driver-independent attributes supported by DSPAM include: * * IgnoreHeader Specify a specific header to ignore * LocalMX Specify a local mail exchanger to assist in * correct results from dspam_getsource(). * * Only driver-dependent attributes need be set prior to a call to * dspam_attach(). Driver-independent attributes may be set both before * and after storage has been attached. 
*
 * RETURN VALUES
 *    returns 0 on success, standard errors on failure
 *
 *    EINVAL    The context has no attribute table (for example after a
 *              failed dspam_clearattributes())
 *    EFAILURE  The attribute table could not be enlarged
 */

int dspam_addattribute (DSPAM_CTX * CTX, const char *key, const char *value) {
  int i, j = 0;

  if (CTX->config == NULL || CTX->config->attributes == NULL)
    return EINVAL;

  /* Key already present: the original code hands off without resizing,
   * presumably because no new slot is consumed -- TODO confirm against
   * _ds_add_attribute(). */
  if (_ds_find_attribute(CTX->config->attributes, key))
    return _ds_add_attribute(CTX->config->attributes, key, value);

  /* The table is NULL-terminated; count the entries in use */
  for(i=0;CTX->config->attributes[i];i++)
    j++;

  /* After the insert the table must hold j+1 entries plus the terminating
   * NULL, so grow as soon as fewer than j+2 slots are available. */
  if (j + 1 >= CTX->config->size)
  {
    config_t ptr;
    size_t old_size = (size_t) CTX->config->size;
    size_t new_size = (old_size > 0) ? old_size * 2 : 2;

    while (new_size < (size_t) j + 2)
      new_size *= 2;

    ptr = realloc(CTX->config->attributes, sizeof(attribute_t) * new_size);
    if (ptr == NULL)
    {
      /* The old table is still valid and config->size still describes it */
      LOG(LOG_CRIT, ERR_MEM_ALLOC);
      return EFAILURE;
    }

    /* realloc() does not clear the added tail; zero it so every unused
     * slot reads as a terminator. */
    if (new_size > old_size)
      memset(&ptr[old_size], 0, sizeof(attribute_t) * (new_size - old_size));

    CTX->config->attributes = ptr;
    CTX->config->size = new_size;
  }

  return _ds_add_attribute(CTX->config->attributes, key, value);
}

/*
 * dspam_attach()
 *
 * DESCRIPTION
 *    The dspam_attach() function attaches the storage interface to the clas-
 *    sification context and alternatively establishes an initial connection
 *    with storage if dbh is NULL. Some storage drivers support only a NULL
 *    value for dbh, while others (such as mysql_drv, pgsql_drv, and
 *    sqlite_drv) allow an open database handle to be attached. This function
 *    should only be called after an initial call to dspam_create() and
 *    should never be called if using dspam_init(), as storage is automati-
 *    cally attached by a call to dspam_init().
 *
 * RETURN VALUES
 *    returns 0 on success, EFAILURE if _ds_init_storage() reports an error
 */

int dspam_attach (DSPAM_CTX *CTX, void *dbh) {
  if (!_ds_init_storage (CTX, dbh))
    return 0;

  return EFAILURE;
}

/*
 * dspam_detach()
 *
 * DESCRIPTION
 *    The dspam_detach() function can be called when a detachment from stor-
 *    age is desired, but the context is still needed. The storage driver is
 *    closed, leaving the classification context in place. Once the context
 *    is no longer needed, another call to dspam_destroy() should be made. If
 *    you are closing storage and destroying the context at the same time, it
 *    is not necessary to call this function. Instead you may call
 *    dspam_destroy() directly.
*
 * RETURN VALUES
 *    returns 0 on success, standard errors on failure
 *    (the implementation below always returns 0)
 */

int
dspam_detach (DSPAM_CTX * CTX)
{
  /* Not attached: nothing to flush or release */
  if (CTX->storage == NULL)
    return 0;

  /* Sanity check totals before our shutdown call writes them:
   * clamp any counter that has gone negative back to zero. */
  if (CTX->totals.spam_learned < 0)
    CTX->totals.spam_learned = 0;
  if (CTX->totals.spam_misclassified < 0)
    CTX->totals.spam_misclassified = 0;
  if (CTX->totals.spam_classified < 0)
    CTX->totals.spam_classified = 0;
  if (CTX->totals.innocent_learned < 0)
    CTX->totals.innocent_learned = 0;
  if (CTX->totals.innocent_misclassified < 0)
    CTX->totals.innocent_misclassified = 0;
  if (CTX->totals.innocent_classified < 0)
    CTX->totals.innocent_classified = 0;

  _ds_shutdown_storage (CTX);

  free(CTX->storage);
  CTX->storage = NULL;

  return 0;
}

/*
 * dspam_destroy()
 *
 * DESCRIPTION
 *    Releases a classification context and everything it owns. Storage is
 *    detached first (see dspam_detach()), then the factor list, the
 *    attribute table and configuration, the username/group/home strings,
 *    the parsed message and finally the context itself are freed.
 *
 *    A signature is freed only when the context does not have
 *    _sig_provided set; otherwise it is left untouched.
 *
 *    The original documentation adds: if a connection was established to
 *    storage internally, the connection is closed and all data is flushed
 *    and written. If a handle was attached, the handle will remain open.
 */

void
dspam_destroy (DSPAM_CTX * CTX)
{
  /* dspam_detach() is a no-op when no storage is attached */
  dspam_detach(CTX);

  _ds_factor_destroy(CTX->factors);

  if (CTX->config != NULL)
  {
    if (CTX->config->attributes != NULL)
      _ds_destroy_config (CTX->config->attributes);
    free (CTX->config);
  }

  free (CTX->home);
  free (CTX->group);
  free (CTX->username);

  if (CTX->signature != NULL && !CTX->_sig_provided)
  {
    free (CTX->signature->data);
    free (CTX->signature);
  }

  if (CTX->message)
    _ds_destroy_message(CTX->message);

  free (CTX);
}

/*
 * dspam_process()
 *
 * DESCRIPTION
 *    The dspam_process() function performs analysis of the message passed
 *    into it and will return zero on successful completion. If successful,
 *    CTX->result will be set to one of two classification results:
 *
 *    DSR_ISSPAM        Message was classified as spam
 *    DSR_ISINNOCENT    Message was classified as nonspam
 *
 * RETURN VALUES
 *    returns 0 on success
 *
 *    EINVAL    An invalid call or invalid parameter used.
* EUNKNOWN Unexpected error, such as malloc() failure * EFILE Error opening or writing to a file or file handle * ELOCK Locking failure * EFAILURE The operation itself has failed */ int dspam_process (DSPAM_CTX * CTX, const char *message) { #ifdef DEBUG struct timeval tp1, tp2; struct timezone tzp; #endif buffer *header, *body; int spam_result = 0, is_toe = 0, is_undertrain = 0, retcode = 0; #ifdef DEBUG gettimeofday(&tp1, &tzp); #endif if (CTX->signature != NULL) CTX->_sig_provided = 1; /* Sanity check context behavior */ if (CTX->operating_mode == DSM_CLASSIFY && CTX->classification != DSR_NONE) { LOG(LOG_WARNING, "DSM_CLASSIFY can't be used with a classification"); return EINVAL; } if (CTX->algorithms == 0) { LOG(LOG_WARNING, "No algorithms configured. Use CTX->algorithms and DSA_"); return EINVAL; } if (CTX->classification != DSR_NONE && CTX->source == DSS_NONE) { LOG(LOG_WARNING, "A classification requires a source be specified"); return EINVAL; } if (CTX->classification == DSR_NONE && CTX->source != DSS_NONE) { LOG(LOG_WARNING, "A source requires a classification be specified"); return EINVAL; } /* Set TOE mode pretrain option if we haven't seen many messages yet */ if (CTX->training_mode == DST_TOE && (CTX->totals.innocent_learned <= 100 || CTX->totals.spam_learned <= 100) && (!(CTX->algorithms & DSP_MARKOV))) { is_undertrain = 1; CTX->training_mode = DST_TEFT; } /* Classify only for TOE / NOTRAIN mode setting if data is mature enough */ if ( CTX->operating_mode == DSM_PROCESS && CTX->classification == DSR_NONE && (CTX->training_mode == DST_TOE || CTX->training_mode == DST_NOTRAIN)) { CTX->operating_mode = DSM_CLASSIFY; is_toe = 1; } /* A signature has been presented for training; process it */ /* Non-SPBH Signature */ if (CTX->operating_mode == DSM_PROCESS && CTX->classification != DSR_NONE && CTX->flags & DSF_SIGNATURE && (CTX->tokenizer != DSZ_SBPH)) { retcode = _ds_process_signature (CTX); goto restore_mode; } header = buffer_create (NULL); body = 
buffer_create (NULL); if (header == NULL || body == NULL) { LOG (LOG_CRIT, ERR_MEM_ALLOC); buffer_destroy (header); buffer_destroy (body); retcode = EUNKNOWN; goto restore_mode; } /* Parse the message if it hasn't already been by the client app */ if (!CTX->message && message) CTX->message = _ds_actualize_message (message); /* Analyze and filter (unless it's a signature based classification) */ if (! (CTX->flags & DSF_SIGNATURE && CTX->operating_mode == DSM_CLASSIFY && CTX->signature != NULL)) { _ds_degenerate_message(CTX, header, body); } /*** Perform statistical operations and get a classification result ***/ /* Initialize */ CTX->result = DSR_NONE; /* If SBPH reclassification, recall and operate on saved SBPH text */ if ( CTX->tokenizer == DSZ_SBPH && CTX->operating_mode != DSM_CLASSIFY && CTX->classification != DSR_NONE && CTX->flags & DSF_SIGNATURE) { char *y, *h, *b; char *ptrptr = NULL; y = strdup((const char *) CTX->signature->data); h = strtok_r(y, "\001", &ptrptr); b = strtok_r(NULL, "\001", &ptrptr); spam_result = _ds_operate (CTX, h, b); free(y); /* Otherwise, operate on the input message */ } else { spam_result = _ds_operate (CTX, header->data, body->data); } /* Clean up */ buffer_destroy (header); buffer_destroy (body); /* _ds_operate() was unable to process message. Restore operating and training mode. 
*/ if (spam_result != DSR_ISSPAM && spam_result != DSR_ISINNOCENT) { LOG(LOG_WARNING, "received invalid result (!DSR_ISSPAM && !DSR_ISINNOCENT)" ": %d", spam_result); retcode = EFAILURE; goto restore_mode; } /* Force decision if a classification was specified */ if (CTX->classification != DSR_NONE) { if (CTX->classification == DSR_ISINNOCENT) spam_result = DSR_ISINNOCENT; else if (CTX->classification == DSR_ISSPAM) spam_result = DSR_ISSPAM; } /* Apply results to context */ CTX->result = spam_result; if (CTX->class[0] == 0) { if (spam_result == DSR_ISSPAM) strcpy(CTX->class, LANG_CLASS_SPAM); else if (spam_result == DSR_ISINNOCENT) strcpy(CTX->class, LANG_CLASS_INNOCENT); } /* Restore operating mode and/or training mode */ restore_mode: if (is_toe) CTX->operating_mode = DSM_PROCESS; if (is_undertrain) CTX->training_mode = DST_TOE; #ifdef DEBUG if (DO_DEBUG) { if (CTX->source == DSS_NONE) { gettimeofday(&tp2, &tzp); LOGDEBUG("total processing time: %01.5fs", (double) (tp2.tv_sec + (tp2.tv_usec / 1000000.0)) - (double) (tp1.tv_sec + (tp1.tv_usec / 1000000.0))); } } #endif return retcode; } /* * dspam_getsource() * * DESCRIPTION * * The dspam_getsource() function extracts the source sender from the mes- * sage passed in during a call to dspam_process() and writes not more * than size bytes to buf. 
* * RETURN VALUES * returns 0 on success, standard errors on failure */ int dspam_getsource ( DSPAM_CTX * CTX, char *buf, size_t size) { ds_message_part_t current_block; ds_header_t current_heading = NULL; struct nt_node *node_nt; struct nt_c c; char qmailmode = 0; if (CTX->message == NULL) return EINVAL; node_nt = c_nt_first (CTX->message->components, &c); if (node_nt == NULL) return EINVAL; current_block = (ds_message_part_t) node_nt->ptr; node_nt = c_nt_first (current_block->headers, &c); while (node_nt != NULL) { current_heading = (ds_header_t) node_nt->ptr; if (!strcmp (current_heading->heading, "Received")) { char *data, *ptr, *tok; // detect and skip "Received: (qmail..." lines if (!strncmp(current_heading->data, "(qmail", 6)) { qmailmode = 1; node_nt = c_nt_next (current_block->headers, &c); continue; } data = strdup (current_heading->data); ptr = strstr (data, "from"); if (ptr != NULL) { if (strchr(data, '[')) // found a non-qmail header { qmailmode = 0; } // qmail puts the sending IP inside the last "()" pair of the line if (qmailmode) { tok = strrchr(data, ')'); if (tok != NULL) { *tok = 0; tok = strrchr(data, '('); if (tok != NULL) tok++; } } else { char *ptrptr = NULL; tok = strtok_r (ptr, "[", &ptrptr); if (tok != NULL) { tok = strtok_r (NULL, "]", &ptrptr); } } if (tok != NULL) { int whitelisted = 0; if (!strncmp (tok, "127.",4) || // ignore localhost !strncmp (tok, "10.", 3) || // ignore RFC 1918 private addresses !strncmp (tok, "172.16.", 7) || !strncmp (tok, "192.168.", 8) || !strncmp (tok, "169.254.", 8)) // ignore local-link whitelisted = 1; if (_ds_match_attribute(CTX->config->attributes, "LocalMX", tok)) whitelisted = 1; if (!whitelisted) { strlcpy (buf, tok, size); free (data); return 0; } } } free (data); } node_nt = c_nt_next (current_block->headers, &c); } return EFAILURE; } /* * _ds_operate() - operate on the message * * DESCRIPTION * calculate the statistical probability the email is spam * update tokens in dictionary according to 
result/mode
 *
 * INPUT ARGUMENTS
 *     DSPAM_CTX *CTX    pointer to context
 *     char *header      pointer to message header
 *     char *body        pointer to message body
 *
 * RETURN VALUES
 *   standard errors on failure
 *
 *   DSR_ISSPAM           message is spam
 *   DSR_ISINNOCENT       message is innocent
 *
 *   On the failure path (label bail) the function returns EUNKNOWN or
 *   EINVAL and clears CTX->signature.
 */

int
_ds_operate (DSPAM_CTX * CTX, char *headers, char *body)
{
  int errcode = 0;

  /* Create our diction (lexical data in message) and patterns */

  ds_diction_t diction = ds_diction_create(24593ul);
  ds_diction_t bnr_patterns = NULL;  /* never assigned below; only destroyed at bail */
  ds_term_t ds_term;
  ds_cursor_t ds_c;

  ds_heap_t heap_sort = NULL; /* Heap sort for top N tokens */
#ifdef LIBBNR_DEBUG
  ds_heap_t heap_nobnr = NULL;
#endif

  unsigned long long whitelist_token = 0;
  int do_whitelist = 0;
  int result;                 /* only read inside the LIBBNR_DEBUG block */
  unsigned int heap_sort_items = 0;

  /* Heap capacity depends on the configured algorithm */
  /* NOTE(review): heap_sort is dereferenced further down without a NULL
   * check -- confirm ds_heap_create() cannot fail here. */
  if (CTX->algorithms & DSA_BURTON)
    heap_sort = ds_heap_create(BURTON_WINDOW_SIZE, HP_DELTA);
  else if (CTX->algorithms & DSA_ROBINSON)
    heap_sort = ds_heap_create(25, HP_DELTA);
  else
    heap_sort = ds_heap_create(15, HP_DELTA);

  /* Allocate SBPH signature (stored as message text) */

  if (   CTX->tokenizer == DSZ_SBPH
      && CTX->flags & DSF_SIGNATURE
      && (   (   CTX->operating_mode != DSM_CLASSIFY
              && CTX->classification == DSR_NONE)
          || ! (CTX->_sig_provided))
      && CTX->source != DSS_CORPUS)
  {
    /* Drop any signature already on the context before building a new one */
    if (CTX->signature)
    {
      if (CTX->signature->data)
        free(CTX->signature->data);
      free(CTX->signature);
      CTX->signature = NULL;
    }
    CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
    if (CTX->signature == NULL)
    {
      LOG (LOG_CRIT, "memory allocation error");
      errcode = EUNKNOWN;
      goto bail;
    }

    /* headers + one \001 separator + body + terminating NUL */
    CTX->signature->length = strlen(headers)+strlen(body)+2;
    CTX->signature->data = malloc(CTX->signature->length);

    if (CTX->signature->data == NULL)
    {
      LOG (LOG_CRIT, "memory allocation error");
      free (CTX->signature);
      CTX->signature = NULL;
      errcode = EUNKNOWN;
      goto bail;
    }

    strcpy(CTX->signature->data, headers);
    strcat(CTX->signature->data, "\001");
    strcat(CTX->signature->data, body);
  }

  if (!diction)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    errcode = EUNKNOWN;
    goto bail;
  }

#ifdef LIBBNR_DEBUG
  heap_nobnr = ds_heap_create (heap_sort->size, HP_DELTA);
  if (heap_nobnr == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    errcode = EUNKNOWN;
    goto bail;
  }
#endif

  /* Preliminary result: anything not classified as spam starts innocent */
  CTX->result =
    (CTX->classification == DSR_ISSPAM) ? DSR_ISSPAM : DSR_ISINNOCENT;

  /* If we are classifying based on a signature, preprogram the tree */

  if (   CTX->flags & DSF_SIGNATURE
      && CTX->operating_mode == DSM_CLASSIFY
      && CTX->_sig_provided)
  {
    /* The signature is read as a packed array of _ds_signature_token */
    int num_tokens =
      CTX->signature->length / sizeof (struct _ds_signature_token);
    struct _ds_signature_token t;
    int i;

    for (i = 0; i < num_tokens; i++)
    {
      char x[128];
      /* memcpy into a local: the record inside data may not be aligned */
      memcpy (&t,
              (char *) CTX->signature->data +
              (i * sizeof (struct _ds_signature_token)),
              sizeof (struct _ds_signature_token));
      snprintf (x, sizeof (x), "E: %" LLU_FMT_SPEC, t.token);
      ds_term = ds_diction_touch(diction, t.token, x, 0);
      if (ds_term)
        ds_term->frequency = t.frequency;
    }
  }

  /* Otherwise, tokenize the message and propagate the tree */

  else
  {
    /* NOTE(review): a tokenizer failure is logged but processing goes on */
    if (_ds_tokenize(CTX, headers, body, diction))
    {
      LOG(LOG_CRIT, "tokenizer failed");
    }
    whitelist_token = diction->whitelist_token;
  }

  /* Load all token statistics */
  if (_ds_getall_spamrecords (CTX, diction))
  {
    LOGDEBUG ("_ds_getall_spamrecords() failed");
    errcode = EUNKNOWN;
    goto bail;
  }

  /* Apply Bayesian Noise Reduction */
  if (CTX->flags & DSF_NOISE)
  {
    /* The returned diction is not used here and is destroyed at once */
    ds_diction_t p = _ds_apply_bnr(CTX, diction);
    if (p)
      ds_diction_destroy(p);
  }

  if (CTX->flags & DSF_WHITELIST)
  {
    LOGDEBUG("Whitelist threshold: %d", CTX->wh_threshold);
  }

  /* Create a heap sort based on the token's delta from .5 */
  ds_c = ds_diction_cursor(diction);
  ds_term = ds_diction_next(ds_c);
  while(ds_term)
  {
    /* The control token takes no part in scoring */
    if (ds_term->key == CONTROL_TOKEN)
    {
      ds_term = ds_diction_next(ds_c);
      continue;
    }

    /* (Re)compute the probability when it is still zero or when a
     * classification was supplied */
    if (ds_term->s.probability == 0.00000 || CTX->classification != DSR_NONE)
      _ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL);

    /* Auto-whitelist test: applies to the whitelist token only, needs more
     * innocent hits than wh_threshold and spam hits of at most 1/15 of the
     * innocent hits, and is skipped when a classification was supplied */
    if (CTX->flags & DSF_WHITELIST)
    {
      if (ds_term->key == whitelist_token
          && ds_term->s.spam_hits <= (ds_term->s.innocent_hits / 15)
          && ds_term->s.innocent_hits > CTX->wh_threshold
          && CTX->classification == DSR_NONE)
      {
        do_whitelist = 1;
      }
    }

    /* Only type 'D' terms that actually occurred enter the heap */
    if (ds_term->frequency > 0 && ds_term->type == 'D')
    {
      ds_heap_insert (heap_sort, ds_term->s.probability, ds_term->key,
                      ds_term->frequency,
                      _ds_compute_complexity(ds_term->name));
    }

#ifdef LIBBNR_DEBUG
    if (ds_term->type == 'D')
    {
      ds_heap_insert (heap_nobnr, ds_term->s.probability, ds_term->key,
                      ds_term->frequency,
                      _ds_compute_complexity(ds_term->name));
    }
#endif

#ifdef VERBOSE
    LOGDEBUG ("Token: %s [%f] SH %ld IH %ld", ds_term->name,
              ds_term->s.probability, ds_term->s.spam_hits,
              ds_term->s.innocent_hits);
#endif

    ds_term = ds_diction_next(ds_c);
  }
  ds_diction_close(ds_c);

  /* Keep track of items in heap_sort. We need that info later on when
     freeing the signature */
  heap_sort_items = heap_sort->items;

  /* Take the 15 most interesting tokens and generate a score */

  if (heap_sort->items == 0)
  {
    LOGDEBUG ("no tokens found in message");
    errcode = EINVAL;
    goto bail;
  }

  /* Initialize Non-SBPH signature, if requested */

  if (   CTX->tokenizer != DSZ_SBPH
      && CTX->flags & DSF_SIGNATURE
      && (CTX->operating_mode != DSM_CLASSIFY || ! CTX->_sig_provided))
  {
    /* Replace any signature already on the context */
    if (CTX->signature)
    {
      if (CTX->signature->data)
        free(CTX->signature->data);
      free(CTX->signature);
      CTX->signature = NULL;
    }
    CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
    if (CTX->signature == NULL)
    {
      LOG (LOG_CRIT, "memory allocation error");
      errcode = EUNKNOWN;
      goto bail;
    }

    /* One _ds_signature_token slot per term in the diction; the buffer is
     * allocated here but not filled in by this function */
    CTX->signature->length =
      sizeof (struct _ds_signature_token) * diction->items;
    CTX->signature->data = malloc (CTX->signature->length);
    if (CTX->signature->data == NULL)
    {
      LOG (LOG_CRIT, "memory allocation error");
      free (CTX->signature);
      CTX->signature = NULL;
      errcode = EUNKNOWN;
      goto bail;
    }
  }

  /* Under LIBBNR_DEBUG the brace opened in the block below is closed in
   * the second LIBBNR_DEBUG block, after the _ds_calc_result() call. */
#ifdef LIBBNR_DEBUG
  {
    int x = CTX->result;
    int nobnr_result = 0;

    if (CTX->flags & DSF_NOISE)
    {
      /* Score once from heap_nobnr, then undo its effect on the context */
      nobnr_result = _ds_calc_result(CTX, heap_nobnr, diction);

      if (CTX->factors)
      {
        _ds_factor_destroy(CTX->factors);
        CTX->factors = NULL;
      }
      CTX->result = x;
      CTX->probability = DSP_UNCALCULATED;
    }
#endif

    /* The return value is kept only for the debug comparison below; the
     * code that follows reads CTX->result -- presumably set inside
     * _ds_calc_result(), TODO confirm */
    result = _ds_calc_result(CTX, heap_sort, diction);

#ifdef LIBBNR_DEBUG
    if (CTX->flags & DSF_NOISE)
    {
      if (nobnr_result == result)
      {
        LOGDEBUG("BNR Decision Concurs");
      }
      else
      {
        LOGDEBUG("BNR Decision Conflicts: %d (BNR) / %d (No BNR)",
                 result, nobnr_result);
      }
    }
  }
#endif

  /* An auto-whitelist match overrides the computed result */
  if (CTX->flags & DSF_WHITELIST && do_whitelist)
  {
    LOGDEBUG("auto-whitelisting this message");
    CTX->result = DSR_ISINNOCENT;
    strcpy(CTX->class, LANG_CLASS_WHITELISTED);
  }

  /* Update Totals */

  /* SPAM */
  if (CTX->result == DSR_ISSPAM && CTX->operating_mode != DSM_CLASSIFY)
  {
    if (!(CTX->flags & DSF_UNLEARN))
    {
      CTX->totals.spam_learned++;
      CTX->learned = 1;
    }

    if (CTX->classification == DSR_ISSPAM)
    {
      if (CTX->flags & DSF_UNLEARN)
      {
        /* decrement, but never below zero */
        CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
      }
      else if (CTX->source == DSS_CORPUS || CTX->source == DSS_INOCULATION)
      {
        CTX->totals.spam_corpusfed++;
      }
      else if (SPAM_MISS(CTX))
      {
        CTX->totals.spam_misclassified++;
        /* outside TOE/NOTRAIN, take back one innocent_learned count */
        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
        {
          CTX->totals.innocent_learned -=
            (CTX->totals.innocent_learned > 0) ? 1 : 0;
        }
      }
    }

  /* INNOCENT */
  }
  else if ((CTX->result == DSR_ISINNOCENT) && CTX->operating_mode != DSM_CLASSIFY)
  {
    if (!(CTX->flags & DSF_UNLEARN))
    {
      CTX->totals.innocent_learned++;
      CTX->learned = 1;
    }

    if (CTX->source == DSS_CORPUS || CTX->source == DSS_INOCULATION)
    {
      CTX->totals.innocent_corpusfed++;
    }
    else if (FALSE_POSITIVE(CTX))
    {
      if (CTX->flags & DSF_UNLEARN)
      {
        CTX->totals.innocent_learned -= (CTX->totals.innocent_learned >0) ? 1:0;
      }
      else
      {
        CTX->totals.innocent_misclassified++;
        /* outside TOE/NOTRAIN, take back one spam_learned count */
        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
        {
          CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
        }
      }
    }
  }

  /* TOE mode increments 'classified' totals */
  if (CTX->training_mode == DST_TOE && CTX->operating_mode == DSM_CLASSIFY)
  {
    if (CTX->result == DSR_ISSPAM)
      CTX->totals.spam_classified++;
    else if (CTX->result == DSR_ISINNOCENT)
      CTX->totals.innocent_classified++;
  }

  _ds_increment_tokens(CTX, diction);

  /* Store all tokens */
  if (CTX->training_mode != DST_NOTRAIN)
  {
    if (_ds_setall_spamrecords (CTX, diction))
    {
      LOGDEBUG ("_ds_setall_spamrecords() failed");
      errcode = EUNKNOWN;
      goto bail;
    }
  }

  ds_diction_destroy (diction);
  ds_heap_destroy (heap_sort);
#ifdef LIBBNR_DEBUG
  ds_heap_destroy (heap_nobnr);
#endif

  /* One final sanity check: a supplied classification always wins */

  if (CTX->classification == DSR_ISINNOCENT)
  {
    CTX->probability = 0.0;
    CTX->result = DSR_ISINNOCENT;
  }
  else if (CTX->classification == DSR_ISSPAM)
  {
    CTX->probability = 1.0;
    CTX->result = DSR_ISSPAM;
  }

  return CTX->result;

bail:
  LOG(LOG_ERR, "bailing on error %d", errcode);
  ds_heap_destroy (heap_sort);
#ifdef LIBBNR_DEBUG
  ds_heap_destroy (heap_nobnr);
#endif
  ds_diction_destroy(diction);
  ds_diction_destroy(bnr_patterns);

  /* NOTE(review): the signature's data is always freed here, but the
   * signature struct itself is freed only when heap_sort_items > 0; with
   * zero items the pointer is simply cleared. This also runs for a
   * signature the caller supplied (_sig_provided). Confirm the intended
   * ownership -- this looks like a possible leak or a free of caller-owned
   * memory. */
  if (CTX->signature != NULL)
  {
    if (CTX->signature->data != NULL)
    {
      free(CTX->signature->data);
      CTX->signature->data = NULL;
    }
    if (CTX->signature != NULL && heap_sort_items > 0)
      free (CTX->signature);
    CTX->signature = NULL;
  }
  return errcode;
}

/*
 * _ds_process_signature()
 *
 * DESCRIPTION
 *   process an erroneously classified message processing based on signature
 *
 *   The signature on the context is read as a packed array of
 *   struct _ds_signature_token. Each token is loaded into a diction, its
 *   stored hit counters are fetched, adjusted according to
 *   CTX->classification, CTX->source, CTX->flags and CTX->training_mode,
 *   and written back unless the training mode is DST_NOTRAIN. The context
 *   totals, class name, probability and result are updated as well.
 *
 * INPUT ARGUMENTS
 *   parameters: DSPAM_CTX *CTX    Pointer to context containing signature
 *
 * RETURN VALUES
 *   0         success
 *   EINVAL    no signature on the context
 *   EUNKNOWN  allocation failure, or loading/storing token records failed
 */

int
_ds_process_signature (DSPAM_CTX * CTX)
{
  struct _ds_signature_token t;
  int num_tokens, i;
  ds_diction_t diction = ds_diction_create(24593ul);
  ds_term_t ds_term;
  ds_cursor_t ds_c;
  /* Non-zero when ProcessorWordFrequency is "occurrence": counters then
   * move by the token's frequency instead of by one */
  int occurrence = _ds_match_attribute(CTX->config->attributes,
                                       "ProcessorWordFrequency", "occurrence");

  if (diction == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    return EUNKNOWN;
  }

  if (CTX->signature == NULL)
  {
    LOG(LOG_WARNING, "DSF_SIGNATURE specified, but no signature provided.");
    ds_diction_destroy(diction);
    return EINVAL;
  }

  LOGDEBUG ("processing signature.  length: %ld", CTX->signature->length);

  CTX->result = DSR_NONE;

  if (!(CTX->flags & DSF_UNLEARN))
    CTX->learned = 1;

  /* INNOCENT */
  if (CTX->classification == DSR_ISINNOCENT &&
      CTX->operating_mode != DSM_CLASSIFY)
  {
    if (CTX->flags & DSF_UNLEARN)
    {
      /* decrement, but never below zero */
      CTX->totals.innocent_learned -= (CTX->totals.innocent_learned) > 0 ? 1:0;
    }
    else
    {
      if (CTX->source == DSS_ERROR)
      {
        CTX->totals.innocent_misclassified++;
        /* outside TOE/NOTRAIN, take back one spam_learned count */
        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
        {
          CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1:0;
        }
      }
      else
      {
        CTX->totals.innocent_corpusfed++;
      }

      CTX->totals.innocent_learned++;
    }
  }

  /* SPAM */
  else if (CTX->classification == DSR_ISSPAM &&
           CTX->operating_mode != DSM_CLASSIFY)
  {
    if (CTX->flags & DSF_UNLEARN)
    {
      CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
    }
    else
    {
      if (CTX->source == DSS_ERROR)
      {
        CTX->totals.spam_misclassified++;
        /* outside TOE/NOTRAIN, take back one innocent_learned count */
        if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
        {
          CTX->totals.innocent_learned -= (CTX->totals.innocent_learned > 0) ? 1:0;
        }
      }
      else
      {
        CTX->totals.spam_corpusfed++;
      }
      CTX->totals.spam_learned++;
    }
  }

  num_tokens = CTX->signature->length / sizeof (struct _ds_signature_token);

  /* Set the class name only if the caller has not set one already */
  if (CTX->class[0] == 0)
  {
    if (CTX->classification == DSR_ISSPAM)
      strcpy(CTX->class, LANG_CLASS_SPAM);
    else if (CTX->classification == DSR_ISINNOCENT)
      strcpy(CTX->class, LANG_CLASS_INNOCENT);
  }

  /* Don't retrain if no tokens were loaded from the signature */
  if (num_tokens == 0)
  {
    LOG (LOG_WARNING, "Skipping retraining for signature with %d tokens", num_tokens);
    LOGDEBUG ("Skipping retraining for signature with %d tokens", num_tokens);
  }
  else
  {
    LOGDEBUG ("Reversing %d tokens", num_tokens);

    /* Load every token of the signature into the diction */
    for (i = 0; i < num_tokens; i++)
    {
      /* memcpy into a local: the record inside data may not be aligned */
      memcpy (&t,
              (char *) CTX->signature->data +
              (i * sizeof (struct _ds_signature_token)),
              sizeof (struct _ds_signature_token));
      ds_term = ds_diction_touch (diction, t.token, "-", 0);
      if (ds_term)
      {
        ds_term->frequency = t.frequency;
      }
    }

    /* Fetch the stored counters for those tokens */
    if (_ds_getall_spamrecords (CTX, diction))
    {
      ds_diction_destroy(diction);
      return EUNKNOWN;
    }

    ds_c = ds_diction_cursor(diction);
    ds_term = ds_diction_next(ds_c);
    while(ds_term)
    {
      /* INNOCENT */
      if (CTX->classification == DSR_ISINNOCENT)
      {
        if (CTX->flags & DSF_UNLEARN)
        {
          /* Unlearn: lower the innocent counter, never below zero */
          if (occurrence)
          {
            ds_term->s.innocent_hits -= ds_term->frequency;
            if (ds_term->s.innocent_hits < 0)
              ds_term->s.innocent_hits = 0;
          }
          else
          {
            ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
          }
        }
        else
        {
          /* Error correction outside TOE/NOTRAIN: lower the spam counter */
          if (CTX->source == DSS_ERROR &&
              CTX->training_mode != DST_NOTRAIN &&
              CTX->training_mode != DST_TOE)
          {
            if (occurrence)
            {
              ds_term->s.spam_hits -= ds_term->frequency;
              if (ds_term->s.spam_hits < 0)
                ds_term->s.spam_hits = 0;
            }
            else
            {
              ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
            }
          }

          /* Inoculation adds 5 to a little-seen token, otherwise 2 */
          if (CTX->source == DSS_INOCULATION)
          {
            if (ds_term->s.spam_hits < 2 && ds_term->s.innocent_hits < 5)
            {
              ds_term->s.innocent_hits += 5;
            }
            else
            {
              ds_term->s.innocent_hits += 2;
            }
          }
          else /* ERROR or CORPUS */
          {
            if (occurrence)
            {
              ds_term->s.innocent_hits += ds_term->frequency;
            }
            else
            {
              ds_term->s.innocent_hits++;
            }
          }
        }
      }

      /* SPAM */
      else if (CTX->classification == DSR_ISSPAM)
      {
        if (CTX->flags & DSF_UNLEARN)
        {
          /* Unlearn: lower the spam counter, never below zero */
          if (occurrence)
          {
            ds_term->s.spam_hits -= ds_term->frequency;
            if (ds_term->s.spam_hits < 0)
              ds_term->s.spam_hits = 0;
          }
          else
          {
            ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
          }
        }
        else
        {
          /* Error correction outside TOE/NOTRAIN: lower the innocent counter */
          if (CTX->source == DSS_ERROR &&
              CTX->training_mode != DST_NOTRAIN &&
              CTX->training_mode != DST_TOE)
          {
            if (occurrence)
            {
              ds_term->s.innocent_hits -= ds_term->frequency;
              if (ds_term->s.innocent_hits < 0)
                ds_term->s.innocent_hits = 0;
            }
            else
            {
              ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
            }
          }

          /* Inoculation adds 5 to a little-seen token, otherwise 2 */
          if (CTX->source == DSS_INOCULATION)
          {
            if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5)
            {
              ds_term->s.spam_hits += 5;
            }
            else
            {
              ds_term->s.spam_hits += 2;
            }
          }
          else /* ERROR or CORPUS */
          {
            if (occurrence)
            {
              ds_term->s.spam_hits += ds_term->frequency;
            }
            else
            {
              ds_term->s.spam_hits++;
            }
          }
        }
      }

      /* Flag the record as changed (TST_DIRTY) on every visited term */
      ds_term->s.status |= TST_DIRTY;
      ds_term = ds_diction_next(ds_c);
    }
    ds_diction_close(ds_c);

    /* Write the adjusted counters back unless training is disabled */
    if (CTX->training_mode != DST_NOTRAIN)
    {
      if (_ds_setall_spamrecords (CTX, diction))
      {
        ds_diction_destroy(diction);
        return EUNKNOWN;
      }
    }
  }

  /* The result mirrors the supplied classification; anything other than
   * DSR_ISSPAM is reported as innocent */
  if (CTX->classification == DSR_ISSPAM)
  {
    CTX->probability = 1.0;
    CTX->result = DSR_ISSPAM;
    LOGDEBUG ("Message classification/result: SPAM");
  }
  else
  {
    CTX->probability = 0.0;
    CTX->result = DSR_ISINNOCENT;
    LOGDEBUG ("Message classification/result: INNOCENT");
  }

  ds_diction_destroy(diction);
  return 0;
}

/*
 *  _ds_calc_stat() - Calculate the probability of a token
 *
 * DESCRIPTION
 *
 *  Calculates the probability of an individual token based on the
 *  pvalue algorithm chosen.
The resulting value largely depends on * the total amount of ham/spam in the user's corpus. The result * is written to s. * * INPUT ARGUMENTS * CTX DSPAM context * term ds_term_t * token_type DTT_ value specifying token type * bnr_tot BNR totals structure */ int _ds_calc_stat ( DSPAM_CTX * CTX, ds_term_t term, struct _ds_spam_stat *s, int token_type, struct _ds_spam_stat *bnr_tot) { int min_hits, sed_hits = 0; unsigned long ti, ts; if (token_type == DTT_BNR) { min_hits = 25; /* Bayesian Noise Reduction patterns */ } else { min_hits = 5; /* "Standard" token threshold */ } /* Statistical Sedation: Adjust hapaxial threshold to compensate for a * spam corpus imbalance */ ti = CTX->totals.innocent_learned + CTX->totals.innocent_classified; ts = CTX->totals.spam_learned + CTX->totals.spam_classified; if (CTX->training_buffer>0) { if (ti < 1000 && ti < ts) { sed_hits = min_hits+(CTX->training_buffer/2)+ (CTX->training_buffer*((ts-ti)/200)); } if (ti < 2500 && ti >=1000 && ts > ti) { float spams = (ts * 1.0 / (ts * 1.0 + ti * 1.0)) * 100; sed_hits = min_hits+(CTX->training_buffer/2)+ (CTX->training_buffer*(spams/20)); } } else if (! CTX->training_buffer) { min_hits = 5; } if (token_type != DTT_DEFAULT || sed_hits > min_hits) min_hits = sed_hits; /* TUM mode training only records up to 20 hits so we need to make sure we * don't require more than that. */ if (CTX->training_mode == DST_TUM && min_hits > 20) min_hits = 20; if (CTX->classification == DSR_ISSPAM) s->probability = .7; else s->probability = (CTX->algorithms & DSP_MARKOV) ? .5 : .4; /* Markovian Weighting */ if (CTX->algorithms & DSP_MARKOV) { unsigned int weight; long num, den; /* some utilities don't provide the token name, and so we can't compute * a probability. just return something neutral. 
*/ if (term == NULL) { s->probability = .5; return 0; } /* return neutral probability for BNR patterns */ if (token_type == DTT_BNR || term->type == 'B' || !strncmp(term->name, "bnr.", 4)) { s->probability = .5; return 0; } /* return neutral probability for frequency tokens */ if (!strncmp(term->name, "E: ", 3)) { s->probability = .5; return 0; } /* return neutral probability for "From" tokens (used for when whitelisting) */ if (!strncmp(term->name, "From*", 5)) { s->probability = .5; return 0; } /* return neutral probability for control tokens */ if (!strncmp(term->name, "$$CONTROL$$", 11)) { s->probability = .5; return 0; } weight = _ds_compute_weight(term->name); if (CTX->flags & DSF_BIAS) { num = weight * (s->spam_hits - (s->innocent_hits*2)); den = C1 * (s->spam_hits + (s->innocent_hits*2) + C2) * 256; s->probability = 0.49 + ((double) num / (double) den); } else { num = (s->spam_hits - s->innocent_hits) * weight; den = C1 * (s->spam_hits + s->innocent_hits + C2) * 256; s->probability = 0.5 + ((double) num / (double) den); } /* Graham and Robinson Start Here */ } else { int ih = 1; if (CTX->flags & DSF_BIAS) ih = 2; if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0) { if (token_type == DTT_BNR) { s->probability = (s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) / ((s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) + (s->innocent_hits * 1.0 / bnr_tot->innocent_hits * 1.0)); } else { s->probability = (s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) / ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) + (s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0)); } } if (s->spam_hits == 0 && s->innocent_hits > 0) { s->probability = 0.01; if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0) { if ((1.0 / CTX->totals.spam_learned * 1.0) / ((1.0 / CTX->totals.spam_learned * 1.0) + (s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0)) < 0.01) { s->probability = (1.0 / CTX->totals.spam_learned * 1.0) / 
((1.0 / CTX->totals.spam_learned * 1.0) + (s->innocent_hits * ih *1.0 / CTX->totals.innocent_learned * 1.0)); } } } else if (s->spam_hits > 0 && s->innocent_hits == 0) { s->probability = 0.99; if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0) { if ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) / ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) + (ih * 1.0 / CTX->totals.innocent_learned * 1.0)) > 0.99) { s->probability = (s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) / ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) + (ih * 1.0 / CTX->totals.innocent_learned * 1.0)); } } } if ( (CTX->flags & DSF_BIAS && (s->spam_hits + (2 * s->innocent_hits) < min_hits)) || (!(CTX->flags & DSF_BIAS) && (s->spam_hits + s->innocent_hits < min_hits))) { s->probability = (CTX->algorithms & DSP_MARKOV) ? .5000 : .4; } } if (s->probability < 0.0001) s->probability = 0.0001; if (s->probability > 0.9999) s->probability = 0.9999; /* Finish off Robinson */ if (token_type != DTT_BNR && CTX->algorithms & DSP_ROBINSON) { unsigned long n = s->spam_hits + s->innocent_hits; double fw = ((CHI_S * CHI_X) + (n * s->probability))/(CHI_S + n); s->probability = fw; } return 0; } /* * _ds_calc_result() * * DESCRIPTION * Perform statistical combination of the token index * * Passed in an index of tokens, this function is responsible for choosing * and combining the most relevant characteristics (based on the algorithms * configured) and calculating libdspam's decision about the provided * message sample. 
*/ int _ds_calc_result(DSPAM_CTX *CTX, ds_heap_t heap_sort, ds_diction_t diction) { struct _ds_spam_stat stat; ds_heap_element_t node_heap; ds_heap_element_t heap_list[heap_sort->items]; /* Naive-Bayesian */ float nbay_top = 0.0; float nbay_bot = 0.0; float nbay_result = -1; long nbay_used = 0; /* Total tokens used in naive bayes */ struct nt *factor_nbayes = nt_create(NT_PTR); /* Graham-Bayesian */ float bay_top = 0.0; float bay_bot = 0.0; float bay_result = -1; long bay_used = 0; /* Total tokens used in bayes */ struct nt *factor_bayes = nt_create(NT_PTR); /* Burton-Bayesian */ double abay_top = 0.0; double abay_bot = 0.0; double abay_result = -1; long abay_used = 0; /* Total tokens used in altbayes */ struct nt *factor_altbayes = nt_create(NT_PTR); /* Robinson's Geometric Mean, used to calculate confidence */ float rob_top = 0.0; /* Robinson's Geometric Mean */ float rob_bot = 0.0; float rob_result = -1; double p = 0.0, q = 0.0, s = 0.0; /* Robinson PQS Calculations */ long rob_used = 0; /* Total tokens used in Robinson's GM */ struct nt *factor_rob = nt_create(NT_PTR); /* Fisher-Robinson's Chi-Square */ float chi_result = -1; long chi_used = 0, chi_sx = 0, chi_hx = 0; double chi_s = 1.0, chi_h = 1.0; struct nt *factor_chi = nt_create(NT_PTR); unsigned int i; /* Invert the heap */ node_heap = heap_sort->root; for(i=0;iitems;i++) { heap_list[(heap_sort->items-i)-1] = node_heap; node_heap = node_heap->next; } /* BEGIN Combine Token Values */ for(i=0;iitems;i++) { char *token_name; ds_term_t ds_term; node_heap = heap_list[i]; ds_term = ds_diction_find(diction, node_heap->token); if (!ds_term) continue; /* Skip BNR patterns */ if (ds_term->type == 'B') continue; token_name = ds_term->name; if (ds_diction_getstat(diction, node_heap->token, &stat) || !token_name) continue; /* Set the probability if we've provided a classification */ if (CTX->classification == DSR_ISSPAM) stat.probability = 1.00; else if (CTX->classification == DSR_ISINNOCENT) stat.probability = 0.00; 
/* Graham-Bayesian */ if (CTX->algorithms & DSA_GRAHAM && bay_used < 15) { LOGDEBUG ("[graham] [%2.6f] %s (%dfrq, %lds, %ldi)", stat.probability, token_name, ds_term->frequency, stat.spam_hits, stat.innocent_hits); _ds_factor(factor_bayes, token_name, stat.probability); if (bay_used == 0) { bay_top = stat.probability; bay_bot = 1 - stat.probability; } else { bay_top *= stat.probability; bay_bot *= (1 - stat.probability); } bay_used++; } /* Burton Bayesian */ if (CTX->algorithms & DSA_BURTON && abay_used < BURTON_WINDOW_SIZE) { LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)", stat.probability, token_name, ds_term->frequency, stat.spam_hits, stat.innocent_hits); _ds_factor(factor_altbayes, token_name, stat.probability); if (abay_used == 0) { abay_top = stat.probability; abay_bot = (1 - stat.probability); } else { abay_top *= stat.probability; abay_bot *= (1 - stat.probability); } abay_used++; if (abay_used < BURTON_WINDOW_SIZE && ds_term->frequency > 1 ) { LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)", stat.probability, token_name, ds_term->frequency, stat.spam_hits, stat.innocent_hits); _ds_factor(factor_altbayes, token_name, stat.probability); abay_used++; abay_top *= stat.probability; abay_bot *= (1 - stat.probability); } } /* Robinson's Geometric Mean Definitions */ //#define ROB_S 0.010 /* Sensitivity */ //#define ROB_X 0.415 /* Value to use when N = 0 */ //#define ROB_CUTOFF 0.54 #define ROB_S 0.010 /* Sensitivity */ #define ROB_X 0.500 /* Value to use when N = 0 */ #define ROB_CUTOFF 0.50 if (rob_used < 25) { float probability; long n = (heap_sort->items > 25) ? 
25 : heap_sort->items; probability = ((ROB_S * ROB_X) + (n * stat.probability)) / (ROB_S + n); #ifdef ROBINSON #ifndef VERBOSE if (CTX->operating_mode != DSM_CLASSIFY) { #endif LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)", stat.probability, token_name, ds_term->frequency, stat.spam_hits, stat.innocent_hits); #ifndef VERBOSE } #endif #endif _ds_factor(factor_rob, token_name, stat.probability); if (probability < 0.3 || probability > 0.7) { if (rob_used == 0) { rob_top = probability; rob_bot = (1 - probability); } else { rob_top *= probability; rob_bot *= (1 - probability); } rob_used++; if (rob_used < 25 && ds_term->frequency > 1) { #ifdef ROBINSON #ifndef VERBOSE if (CTX->operating_mode != DSM_CLASSIFY) { #endif LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)", stat.probability, token_name, ds_term->frequency, stat.spam_hits, stat.innocent_hits); #ifndef VERBOSE } #endif #endif _ds_factor(factor_rob, token_name, stat.probability); rob_used++; rob_top *= probability; rob_bot *= (1 - probability); } } } } /* END Combine Token Values */ /* Fisher-Robinson's Inverse Chi-Square */ #define CHI_CUTOFF 0.5010 /* Ham/Spam Cutoff */ #define CHI_EXCR 0.4500 /* Exclusionary Radius */ #define LN2 0.69314718055994530942 /* log e2 */ if (CTX->algorithms & DSA_CHI_SQUARE || CTX->algorithms & DSA_NAIVE) { ds_term_t ds_term; ds_cursor_t ds_c; double fw; int n, exp; ds_c = ds_diction_cursor(diction); ds_term = ds_diction_next(ds_c); while(ds_term) { if (ds_term->key == CONTROL_TOKEN) { ds_term = ds_diction_next(ds_c); continue; } /* Naive-Bayesian */ if (CTX->algorithms & DSA_NAIVE) { LOGDEBUG ("[naive] [%2.6f] %s (%dfrq, %lds, %ldi)", ds_term->s.probability, ds_term->name, ds_term->frequency, ds_term->s.spam_hits, ds_term->s.innocent_hits); _ds_factor(factor_nbayes, ds_term->name, stat.probability); if (nbay_used == 0) { nbay_top = stat.probability; nbay_bot = 1 - stat.probability; } else { nbay_top *= stat.probability; nbay_bot *= (1 - stat.probability); } nbay_used++; } if 
(CTX->algorithms & DSA_CHI_SQUARE) { /* Skip BNR Tokens */ if (ds_term->type == 'B') goto CHI_NEXT; /* Convert the p-value */ if (CTX->algorithms & DSP_ROBINSON) { fw = ds_term->s.probability; } else { n = ds_term->s.spam_hits + ds_term->s.innocent_hits; fw = ((CHI_S * CHI_X) + (n * ds_term->s.probability))/(CHI_S + n); } if (fabs(0.5-fw)>CHI_EXCR) { int iter = 1; while(iter>0) { iter --; #ifndef VERBOSE if (CTX->operating_mode != DSM_CLASSIFY) { #endif LOGDEBUG ("[chi-sq] [%2.6f] %s (%dfrq, %lds, %ldi)", fw, ds_term->name, ds_term->frequency, ds_term->s.spam_hits, ds_term->s.innocent_hits); #ifndef VERBOSE } #endif _ds_factor(factor_chi, ds_term->name, ds_term->s.probability); chi_used++; chi_s *= (1.0 - fw); chi_h *= fw; if (chi_s < 1e-200) { chi_s = frexp(chi_s, &exp); chi_sx += exp; } if (chi_h < 1e-200) { chi_h = frexp(chi_h, &exp); chi_hx += exp; } } } } CHI_NEXT: ds_term = ds_diction_next(ds_c); } ds_diction_close(ds_c); } /* BEGIN Calculate Individual Probabilities */ if (CTX->algorithms & DSA_NAIVE) { nbay_result = (nbay_top) / (nbay_top + nbay_bot); LOGDEBUG ("Naive-Bayesian Probability: %f Samples: %ld", nbay_result, nbay_used); } if (CTX->algorithms & DSA_GRAHAM) { bay_result = (bay_top) / (bay_top + bay_bot); LOGDEBUG ("Graham-Bayesian Probability: %f Samples: %ld", bay_result, bay_used); } if (CTX->algorithms & DSA_BURTON) { abay_result = (abay_top) / (abay_top + abay_bot); LOGDEBUG ("Burton-Bayesian Probability: %f Samples: %ld", abay_result, abay_used); } /* Robinson's */ if (rob_used == 0) { p = q = s = 0; } else { p = 1.0 - pow (rob_bot, 1.0 / rob_used); q = 1.0 - pow (rob_top, 1.0 / rob_used); s = (p - q) / (p + q); s = (s + 1.0) / 2.0; } rob_result = s; if (CTX->algorithms & DSA_ROBINSON) { LOGDEBUG("Robinson's Geometric Confidence: %f (Spamminess: %f, " "Non-Spamminess: %f, Samples: %ld)", rob_result, p, q, rob_used); } if (CTX->algorithms & DSA_CHI_SQUARE) { chi_s = log(chi_s) + chi_sx * LN2; chi_h = log(chi_h) + chi_hx * LN2; if (chi_used) { 
chi_s = 1.0 - chi2Q(-2.0 * chi_s, 2 * chi_used); chi_h = 1.0 - chi2Q(-2.0 * chi_h, 2 * chi_used); chi_result = ((chi_s-chi_h)+1.0) / 2.0; } else { chi_result = (float)(CHI_CUTOFF-0.1); } LOGDEBUG("Chi-Square Confidence: %f", chi_result); } /* END Calculate Individual Probabilities */ /* BEGIN Determine Result */ if (CTX->classification == DSR_ISSPAM) { CTX->result = DSR_ISSPAM; CTX->probability = 1.0; } else if (CTX->classification == DSR_ISINNOCENT) { CTX->result = DSR_ISINNOCENT; CTX->probability = 0.0; } else { struct nt *factor = NULL; if (CTX->algorithms & DSA_NAIVE) { factor = factor_nbayes; if (((CTX->algorithms & DSP_MARKOV) && nbay_result > 0.5000) || (!(CTX->algorithms & DSP_MARKOV) && nbay_result >= 0.9)) { CTX->result = DSR_ISSPAM; CTX->probability = nbay_result; CTX->factors = factor; LOGDEBUG("using Naive-Bayes factors"); } } if (CTX->algorithms & DSA_GRAHAM) { factor = factor_bayes; if (((CTX->algorithms & DSP_MARKOV) && bay_result > 0.5000) || (!(CTX->algorithms & DSP_MARKOV) && bay_result >= 0.9)) { CTX->result = DSR_ISSPAM; CTX->probability = bay_result; CTX->factors = factor; LOGDEBUG("using Graham factors"); } } if (CTX->algorithms & DSA_BURTON) { factor = factor_altbayes; if (((CTX->algorithms & DSP_MARKOV) && abay_result > 0.5000) || (!(CTX->algorithms & DSP_MARKOV) && abay_result >= 0.9)) { CTX->result = DSR_ISSPAM; CTX->probability = abay_result; if (!CTX->factors) { CTX->factors = factor; LOGDEBUG("using Burton factors"); } } } if (CTX->algorithms & DSA_ROBINSON) { factor = factor_rob; if (((CTX->algorithms & DSP_MARKOV) && rob_result > 0.5000) || (!(CTX->algorithms & DSP_MARKOV) && rob_result >= ROB_CUTOFF)) { CTX->result = DSR_ISSPAM; if (CTX->probability < 0) CTX->probability = rob_result; if (!CTX->factors) { CTX->factors = factor; LOGDEBUG("using Robinson-Geom factors"); } } } if (CTX->algorithms & DSA_CHI_SQUARE) { factor = factor_chi; if (((CTX->algorithms & DSP_MARKOV) && chi_result > 0.5000) || (!(CTX->algorithms & DSP_MARKOV) && 
chi_result >= CHI_CUTOFF)) { CTX->result = DSR_ISSPAM; if (CTX->probability < 0) CTX->probability = chi_result; if (!CTX->factors) { CTX->factors = factor; LOGDEBUG("using Chi-Square factors"); } } } if (!CTX->factors) { CTX->factors = factor; LOGDEBUG("no factors specified; using default"); } } if (CTX->factors != factor_nbayes) _ds_factor_destroy(factor_nbayes); if (CTX->factors != factor_bayes) _ds_factor_destroy(factor_bayes); if (CTX->factors != factor_altbayes) _ds_factor_destroy(factor_altbayes); if (CTX->factors != factor_rob) _ds_factor_destroy(factor_rob); if (CTX->factors != factor_chi) _ds_factor_destroy(factor_chi); /* If somehow we haven't yet assigned a probability, assign one */ if (CTX->probability == DSP_UNCALCULATED) { if (CTX->algorithms & DSA_GRAHAM) CTX->probability = bay_result; if (CTX->algorithms & DSA_NAIVE) CTX->probability = nbay_result; if (CTX->probability < 0 && CTX->algorithms & DSA_BURTON) CTX->probability = abay_result; if (CTX->probability < 0 && CTX->algorithms & DSA_ROBINSON) CTX->probability = rob_result; if (CTX->probability < 0 && CTX->algorithms & DSA_CHI_SQUARE) CTX->probability = chi_result; } #ifdef VERBOSE if (DO_DEBUG && (!(CTX->algorithms & DSP_MARKOV))) { if (abay_result >= 0.9 && bay_result < 0.9) { LOGDEBUG ("CATCH: Burton Bayesian"); } else if (abay_result < 0.9 && bay_result >= 0.9) { LOGDEBUG ("MISS: Burton Bayesian"); } if (rob_result >= ROB_CUTOFF && bay_result < 0.9) { LOGDEBUG ("CATCH: Robinson's"); } else if (rob_result < ROB_CUTOFF && bay_result >= 0.9) { LOGDEBUG ("MISS: Robinson's"); } if (chi_result >= CHI_CUTOFF && bay_result < 0.9) { LOGDEBUG("CATCH: Chi-Square"); } else if (chi_result < CHI_CUTOFF && bay_result >= 0.9) { LOGDEBUG("MISS: Chi-Square"); } } #endif /* Calculate Confidence */ if (CTX->algorithms & DSP_MARKOV) { if (CTX->result == DSR_ISSPAM) { CTX->confidence = CTX->probability; } else { CTX->confidence = 1.0 - CTX->probability; } } else { if (CTX->result == DSR_ISSPAM) { CTX->confidence = 
rob_result; } else { CTX->confidence = 1.0 - rob_result; } } LOGDEBUG("Result Confidence: %1.2f", CTX->confidence); return CTX->result; } /* * _ds_factor() * * DESCRIPTION * Factors a token/value into a set * * Adds a token/value pair to a factor set. The factor set of the dominant * calculation is provided to the client in order to explain libdspam's * final decision about the message's classification. */ int _ds_factor(struct nt *set, char *token_name, float value) { struct dspam_factor *f; f = calloc(1, sizeof(struct dspam_factor)); if (!f) return EUNKNOWN; f->token_name = strdup(token_name); f->value = value; nt_add(set, (void *) f); return 0; } /* * _ds_factor_destroy - destroy a factor tree * */ void _ds_factor_destroy(struct nt *factors) { struct dspam_factor *f; struct nt_node *node; struct nt_c c; if (factors == NULL) return; node = c_nt_first(factors, &c); while(node != NULL) { f = (struct dspam_factor *) node->ptr; if (f) free(f->token_name); node = c_nt_next(factors, &c); } nt_destroy(factors); return; } int libdspam_init(const char *driver) { #ifndef STATIC_DRIVER if (driver == NULL) { LOG(LOG_CRIT, "dlopen() failed: Can not load NULL driver"); return EFAILURE; } else if (driver) { if ((_drv_handle = dlopen(driver, RTLD_NOW))==NULL) { LOG(LOG_CRIT, "dlopen() failed: %s: %s", driver, dlerror()); return EFAILURE; } } #endif return 0; } int libdspam_shutdown(void) { #ifndef STATIC_DRIVER if (_drv_handle) { int r; if ((r=dlclose(_drv_handle))) { LOG(LOG_CRIT, "dlclose() failed: %s", dlerror()); return r; } } #endif return 0; } int _ds_instantiate_bnr( DSPAM_CTX *CTX, ds_diction_t patterns, struct nt *stream, char identifier) { float previous_bnr_probs[BNR_SIZE]; ds_term_t ds_term, ds_touch; struct nt_node *node_nt; struct nt_c c_nt; unsigned long long crc; char bnr_token[64]; int i; for(i=0;iptr; _ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL); for(i=0;is.probability); sprintf(bnr_token, "bnr.%c|", identifier); for(i=0;itype = 'B'; node_nt = 
c_nt_next(stream, &c_nt); } return 0; } ds_diction_t _ds_apply_bnr (DSPAM_CTX *CTX, ds_diction_t diction) { /* Bayesian Noise Reduction - Contextual Symmetry Logic http://bnr.nuclearelephant.com */ ds_diction_t bnr_patterns = ds_diction_create(3079); struct _ds_spam_stat bnr_tot; unsigned long long crc; BNR_CTX *BTX_S, *BTX_C; struct nt_node *node_nt; struct nt_c c_nt; ds_term_t ds_term, ds_touch; ds_cursor_t ds_c; if (!bnr_patterns) { LOG (LOG_CRIT, ERR_MEM_ALLOC); return NULL; } BTX_S = bnr_init(BNR_INDEX, 's'); BTX_C = bnr_init(BNR_INDEX, 'c'); if (!BTX_S || !BTX_C) { LOGDEBUG("bnr_init() failed"); bnr_destroy(BTX_S); bnr_destroy(BTX_C); ds_diction_destroy(bnr_patterns); return NULL; } BTX_S->window_size = BNR_SIZE; BTX_C->window_size = BNR_SIZE; _ds_instantiate_bnr(CTX, bnr_patterns, diction->order, 's'); _ds_instantiate_bnr(CTX, bnr_patterns, diction->chained_order, 'c'); /* Add BNR totals to the list of load elements */ memset(&bnr_tot, 0, sizeof(struct _ds_spam_stat)); crc = _ds_getcrc64("bnr.t|"); ds_touch = ds_diction_touch(bnr_patterns, crc, "bnr.t|", 0); ds_touch->type = 'B'; /* Load BNR patterns */ LOGDEBUG("Loading %ld BNR patterns", bnr_patterns->items); if (_ds_getall_spamrecords (CTX, bnr_patterns)) { LOGDEBUG ("_ds_getall_spamrecords() failed"); ds_diction_destroy(bnr_patterns); return NULL; } /* Perform BNR Processing */ if (CTX->classification == DSR_NONE && CTX->_sig_provided == 0 && CTX->totals.innocent_learned + CTX->totals.innocent_classified > 2500) { int elim; #ifdef LIBBNR_DEBUG char fn[MAX_FILENAME_LENGTH]; FILE *file; #endif node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; bnr_add(BTX_S, ds_term->name, ds_term->s.probability); node_nt = c_nt_next(diction->order, &c_nt); } node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; bnr_add(BTX_C, ds_term->name, ds_term->s.probability); node_nt = c_nt_next(diction->chained_order, &c_nt); } 
bnr_instantiate(BTX_S); bnr_instantiate(BTX_C); /* Calculate pattern p-values */ ds_diction_getstat(bnr_patterns, crc, &bnr_tot); ds_c = ds_diction_cursor(bnr_patterns); ds_term = ds_diction_next(ds_c); while(ds_term) { _ds_calc_stat(CTX, ds_term, &ds_term->s, DTT_BNR, &bnr_tot); if (ds_term->name[4] == 's') bnr_set_pattern(BTX_S, ds_term->name, ds_term->s.probability); else if (ds_term->name[4] == 'c') bnr_set_pattern(BTX_C, ds_term->name, ds_term->s.probability); ds_term = ds_diction_next(ds_c); } ds_diction_close(ds_c); bnr_finalize(BTX_S); bnr_finalize(BTX_C); /* Propagate eliminations to DSPAM */ node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; bnr_get_token(BTX_S, &elim); if (elim) ds_term->frequency--; node_nt = c_nt_next(diction->order, &c_nt); } node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; bnr_get_token(BTX_C, &elim); if (elim) ds_term->frequency--; node_nt = c_nt_next(diction->chained_order, &c_nt); } #ifdef LIBBNR_DEBUG float snr; if (BTX_S->stream->items + BTX_C->stream->items + BTX_S->eliminations + BTX_C->eliminations > 0) { snr = 100.0*((BTX_S->eliminations + BTX_C->eliminations + 0.0)/ (BTX_S->stream->items + BTX_C->stream->items + BTX_S->eliminations + BTX_C->eliminations)); } else { snr = 0; } LOGDEBUG("bnr reported snr of %02.3f", snr); #ifdef LIBBNR_GRAPH_OUTPUT printf("BEFORE\n\n"); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; printf("%1.5f\n", ds_term->s.probability); node_nt = c_nt_next(diction->order, &c_nt); } printf("\n\nAFTER\n\n"); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency > 0) printf("%1.5f\n", ds_term->s.probability); node_nt = c_nt_next(diction->order, &c_nt); } printf("\n"); #endif snprintf(fn, sizeof(fn), "%s/bnr.log", LOGDIR); file = fopen(fn, "a"); if (file != NULL) { fprintf(file, "-- BNR Filter Process 
Results --\n"); fprintf(file, "Eliminations:\n"); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency <= 0) fprintf(file, "%s ", ds_term->name); node_nt = c_nt_next(diction->order, &c_nt); } fprintf(file, "\n["); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency <= 0) fprintf(file, "%1.2f ", ds_term->s.probability); node_nt = c_nt_next(diction->order, &c_nt); } fprintf(file, "]\n\nRemaining:\n"); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency > 0) fprintf(file, "%s ", ds_term->name); node_nt = c_nt_next(diction->order, &c_nt); } fprintf(file, "\n["); node_nt = c_nt_first(diction->order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency > 0) fprintf(file, "%1.2f ", ds_term->s.probability); node_nt = c_nt_next(diction->order, &c_nt); } fprintf(file, "]\nProcessed for: %s\n\n", CTX->username); fprintf(file, "-- Chained Tokens --\n"); fprintf(file, "Eliminations:\n"); node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency <= 0) fprintf(file, "%s ", ds_term->name); node_nt = c_nt_next(diction->chained_order, &c_nt); } fprintf(file, "\n["); node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency <= 0) fprintf(file, "%1.2f ", ds_term->s.probability); node_nt = c_nt_next(diction->chained_order, &c_nt); } fprintf(file, "]\n\nRemaining:\n"); node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency > 0) fprintf(file, "%s ", ds_term->name); node_nt = c_nt_next(diction->chained_order, &c_nt); } fprintf(file, "\n["); node_nt = c_nt_first(diction->chained_order, &c_nt); while(node_nt != NULL) { ds_term = node_nt->ptr; if (ds_term->frequency > 0) 
fprintf(file, "%1.2f ", ds_term->s.probability); node_nt = c_nt_next(diction->chained_order, &c_nt); } fprintf(file, "]\nProcessed for: %s\n\n", CTX->username); fclose(file); } #endif } bnr_destroy(BTX_S); bnr_destroy(BTX_C); /* Add BNR pattern to token hash */ if (CTX->totals.innocent_learned + CTX->totals.innocent_classified > 1000) { ds_c = ds_diction_cursor(bnr_patterns); ds_term = ds_diction_next(ds_c); while(ds_term) { ds_term_t t = ds_diction_touch(diction, ds_term->key, ds_term->name, 0); t->type = 'B'; ds_diction_setstat(diction, ds_term->key, &ds_term->s); if (t) t->frequency = 1; #ifdef LIBBNR_DEBUG if (fabs(0.5-ds_term->s.probability)>0.25) { LOGDEBUG("Interesting BNR Pattern: %s %01.5f %lds %ldi", ds_term->name, ds_term->s.probability, ds_term->s.spam_hits, ds_term->s.innocent_hits); } #endif ds_term = ds_diction_next(ds_c); } ds_diction_close(ds_c); } return bnr_patterns; } int _ds_increment_tokens(DSPAM_CTX *CTX, ds_diction_t diction) { ds_cursor_t ds_c; ds_term_t ds_term; int i = 0; int occurrence = _ds_match_attribute(CTX->config->attributes, "ProcessorWordFrequency", "occurrence"); ds_c = ds_diction_cursor(diction); ds_term = ds_diction_next(ds_c); while(ds_term) { unsigned long long crc; crc = ds_term->key; /* Create a signature if we're processing a message */ if (CTX->tokenizer != DSZ_SBPH && CTX->flags & DSF_SIGNATURE && (CTX->operating_mode != DSM_CLASSIFY || !(CTX->_sig_provided))) { struct _ds_signature_token t; memset(&t, 0, sizeof(t)); t.token = crc; t.frequency = ds_term->frequency; memcpy ((char *) CTX->signature->data + (i * sizeof (struct _ds_signature_token)), &t, sizeof (struct _ds_signature_token)); } /* If classification was provided, force probabilities */ if (CTX->classification == DSR_ISSPAM) ds_term->s.probability = 1.00; else if (CTX->classification == DSR_ISINNOCENT) ds_term->s.probability = 0.00; if (ds_term->type == 'D' && ( CTX->training_mode != DST_TUM || CTX->source == DSS_ERROR || CTX->source == DSS_INOCULATION || 
ds_term->s.spam_hits + ds_term->s.innocent_hits < 50 || ds_term->key == diction->whitelist_token || CTX->confidence < 0.70)) { ds_term->s.status |= TST_DIRTY; } if (ds_term->type == 'B' && CTX->totals.innocent_learned + CTX->totals.innocent_classified > 500 && CTX->flags & DSF_NOISE && CTX->_sig_provided == 0) { ds_term->s.status |= TST_DIRTY; } /* SPAM */ if (CTX->result == DSR_ISSPAM) { /* Inoculations increase token count considerably */ if (CTX->source == DSS_INOCULATION) { if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5) ds_term->s.spam_hits += 5; else ds_term->s.spam_hits += 2; } /* Standard increase */ else { if (CTX->flags & DSF_UNLEARN) { if (CTX->classification == DSR_ISSPAM) { if (occurrence) { ds_term->s.spam_hits -= ds_term->frequency; if (ds_term->s.spam_hits < 0) ds_term->s.spam_hits = 0; } else { ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0; } } } else { if (occurrence) { ds_term->s.spam_hits += ds_term->frequency; } else { ds_term->s.spam_hits++; } } } if (SPAM_MISS(CTX) && !(CTX->flags & DSF_UNLEARN) && CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN) { if (occurrence) { ds_term->s.innocent_hits -= ds_term->frequency; if (ds_term->s.innocent_hits < 0) ds_term->s.innocent_hits = 0; } else { ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0; } } } /* INNOCENT */ else { if (CTX->flags & DSF_UNLEARN) { if (CTX->classification == DSR_ISINNOCENT) { if (occurrence) { ds_term->s.innocent_hits -= ds_term->frequency; if (ds_term->s.innocent_hits < 0) ds_term->s.innocent_hits = 0; } else { ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 
1:0; } } } else { if (occurrence) { ds_term->s.innocent_hits += ds_term->frequency; } else { ds_term->s.innocent_hits++; } } if (FALSE_POSITIVE(CTX) && !(CTX->flags & DSF_UNLEARN) && CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN) { if (occurrence) { ds_term->s.spam_hits -= ds_term->frequency; if (ds_term->s.spam_hits < 0) ds_term->s.spam_hits = 0; } else { ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0; } } } ds_term = ds_diction_next(ds_c); i++; } ds_diction_close(ds_c); return 0; }