/* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */ /* DSPAM COPYRIGHT (C) 2002-2012 DSPAM PROJECT This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ #ifndef _LIBDSPAM_OBJECTS_H # define _LIBDSPAM_OBJECTS_H #ifdef HAVE_CONFIG_H #include #endif #include #include "config.h" #include "config_shared.h" #include "decode.h" #if ((defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SUNPRO_C))) && !defined(u_int32_t) && !defined(__BIT_TYPES_DEFINED__) #define __BIT_TYPES_DEFINED__ typedef unsigned long long u_int64_t; typedef unsigned int u_int32_t; typedef unsigned short u_int16_t; typedef unsigned char u_int8_t; #endif #ifdef _WIN32 typedef unsigned int u_int32_t; typedef u_int32_t uid_t; #endif extern void *_drv_handle; /* Handle to storage driver library */ /* * struct dspam_factor - A single determining factor * * An element containing a determining factor in the dominant calculation of * a message. An array of these are returned to the calling application to * explain libdspam's final classification decision. */ struct dspam_factor { char *token_name; float value; }; /* * struct _ds_spam_totals - User spam totals * * Spam totals loaded into the user's filter context upon a call to * dspam_init(). This structure represents the user's cumulative statistics. * * spam_learned, innocent_learned * The total number of messages trained on. * * spam_misclassified, innocent_misclassified * The total number of messages that were misclassified by DSPAM, and * submitted for retraining. * * spam_classified, innocent_classified * The total number of messages that were classified by DSPAM, but not * learned. Used exclusively with Train-on-Error mode. * * spam_corpusfed, innocent_corpusfed * The total number of messages supplied by the end-user for training. * * NOTE: The ordering of the variables in the structure must remain * consistent to ensure backward-compatibility with some storage * drivers (such as the Berkeley DB drivers) */ struct _ds_spam_totals { long spam_learned; long innocent_learned; long spam_misclassified; long innocent_misclassified; long spam_corpusfed; long innocent_corpusfed; long spam_classified; long innocent_classified; }; /* * struct _ds_spam_stat - Statistics for a single token: * * probability * The calculated probability of the token based on the active pvalue * algorithm (selected at configure-time). * * spam_hits, innocent_hits * The total number of times the token has appeared in each class of * message. If Train-on-Error or Train-until-Mature training modes are * employed, these values will not necessarily be updated for every * message. * * status * TST_DISK Value was loaded from the storage interface * TST_DIRTY Statistic is dirty (not written to disk since last modified) */ typedef struct _ds_spam_stat { double probability; long spam_hits; long innocent_hits; char status; unsigned long offset; } *ds_spam_stat_t; /* * struct _ds_spam_signature - A historical classification signature * * A binary representation of the original training instance. The spam * signature contains all the metadata used in the original decision * about the message, so that a 1:1 retraining can take place if the * message is submitted for retraining (e.g. was misclassified). The * signature contains a series of _ds_signature_token structures, which * house the original set of tokens used and their frequency counts in * the message. A spam signature is a temporary piece of data that is * usually purged from disk after a short period of time. */ struct _ds_spam_signature { void *data; unsigned long length; }; /* * struct _ds_signature_token - An entry in the classification signature * * A signature token is a single entry in the binary _ds_spam_signature * data blob, representing a single data point from the original * training instance. * * token * The checksum of the original token in the message * * frequency * The token's frequency in the original message */ struct _ds_signature_token { unsigned long long token; unsigned char frequency; }; /* * struct _ds_config - libdspam attributes configuration * * Each classification context may have an attributes configuration * which is read by various components of libdspam. This structure * contains an array of attributes and the size of the array. */ struct _ds_config { config_t attributes; long size; }; /* * DSPAM_CTX - The DSPAM Classification Context * * A classification context is attached directly to a filter instance * and supplies the entire context for the filter instance to operate * under. This includes the user and group, operational flags, * training mode, and the message being operated on. The filter * instance also sets specific output variables within the context * such as the result of a classification, confidence level, and * etcetera. * * username, group (input) * The current username and group that is being operated on. * * totals (output) * The set of statistics loaded when dspam_init() is called. * * signature (input, output) * The signature represents a DSPAM signature, and can be supplied * as an input variable for retraining (e.g. in the event of a * misclassification) or used as an output variable to store a * signature generated by the filter instance during normal * classification. * * message (input) * The message being operated on, post-actualization. This can be * left NULL, and libdspam will automatically actualize the message * * probability (output) * The probability of the resulting operation. This is generally a * floating point number between 0 and 1, 1 being the highest * probability of high order classification. * * result (output) * The final result of the requested operation. This is generally * either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED. * * confidence (output) * The confidence that the filter has in its returned result. * NOTE: Confidence is not always supported, and may be zero. * * operating_mode (input) * Sets the operating mode of the filter instance. This can be one * of the following: * * DSM_PROCESS Classify and learn the supplied message using * whatever training mode is specified * * DSM_CLASSIFY Classify the supplied message only; do not * learn or update any counters. * * DSM_TOOLS Identifies that the calling function is from * a utility, and no operation will be requested. * * training_mode (input) * The training mode sets the type of training the filter instance * should apply to the process. This can be one of: * * DST_TEFT Train-on-Everything * Trains every single message processed * * DST_TOE Train-on-Error * Trains only on a misclassification or * corpus-fed message. * * DST_TUM Train-until-Mature * Trains individual tokens based on the * maturity of the user's dictionary * * DST_NOTRAIN No Training * Process the message but do not perform * any training. * training_buffer (input) * Sets the amount of training-loop buffering. This number is a * range from 0-10 and changes the amount of token sedation used * during the training loop. The higher the number, the more token * statistics are watered down during initial training to prevent * false positives. Setting this value to zero results in no * sedation being performed. * * flags (input) * Applies different fine-tuning behavior to the context: * * DSF_NOISE Apply Bayesian Noise Reduction logic * DSF_SIGNATURE Signature is provided/requested * DSF_WHITELIST Use automatic whitelisting logic * DSF_MERGED Merge user/group data in memory * DSF_UNLEARN Unlearn the message * DSF_BIAS Assign processor bias to unknown tokens * * tokenizer (input) * Specifies which tokenizer to use * * DSZ_WORD Use WORD (uniGram) tokenizer * DSZ_CHAIN Use CHAIN (biGram) tokenizer * DSZ_SBPH Use SBPH (Sparse Binary Polynomial Hashing) tokenizer * DSZ_OSB Use OSB (Orthogonal Sparse biGram) tokenizer * * algorithms (input) * Optional API to override the default algorithms. This value is set * with the default compiled values whenever dspam_create() is called. * * DSA_GRAHAM Graham-Bayesian * DSA_BURTON Burton-Bayesian * DSA_ROBINSON Robinson's Geometric Mean Test * DSA_CHI_SQUARE Fisher-Robinson's Chi-Square * DSA_NAIVE Naive-Bayesian * * P-Value Computations: * * DSP_ROBINSON Robinson's Technique * DSP_GRAHAM Graham's Technique * DSP_MARKOV Markov Weighted Technique * * locked (output) * Identifies that the user's storage is presently locked */ typedef struct { struct _ds_spam_totals totals; struct _ds_spam_signature * signature; struct _ds_message * message; struct _ds_config * config; char *username; char *group; char *home; /* DSPAM Home */ int operating_mode; /* DSM_ */ int training_mode; /* DST_ */ int training_buffer; /* 0-10 */ int wh_threshold; /* Whitelisting Threshold (default 10) */ int classification; /* DSR_ */ int source; /* DSS_ */ int learned; /* Did we actually learn something? */ int tokenizer; /* DSZ_ */ u_int32_t flags; u_int32_t algorithms; int result; char class[32]; float probability; float confidence; int locked; void * storage; time_t _process_start; int _sig_provided; struct nt * factors; } DSPAM_CTX; /* Processing Flags */ #define DSF_SIGNATURE 0x02 #define DSF_BIAS 0x04 #define DSF_NOISE 0x08 #define DSF_WHITELIST 0x10 #define DSF_MERGED 0x20 #define DSF_UNLEARN 0x80 /* Tokenizers */ #define DSZ_WORD 0x01 #define DSZ_CHAIN 0x02 #define DSZ_SBPH 0x03 #define DSZ_OSB 0x04 /* Algorithms */ #define DSA_GRAHAM 0x01 #define DSA_BURTON 0x02 #define DSA_ROBINSON 0x04 #define DSA_CHI_SQUARE 0x08 #define DSP_ROBINSON 0x10 #define DSP_GRAHAM 0x20 #define DSP_MARKOV 0x40 #define DSA_NAIVE 0x80 /* Operating Modes */ #define DSM_PROCESS 0x00 #define DSM_TOOLS 0x01 #define DSM_CLASSIFY 0x02 #define DSM_NONE 0xFF /* Training Modes */ #define DST_TEFT 0x00 #define DST_TOE 0x01 #define DST_TUM 0x02 #define DST_NOTRAIN 0xFE /* Classification Results */ #define DSR_ISSPAM 0x01 #define DSR_ISINNOCENT 0x02 #define DSR_NONE 0xFF /* Classification Sources */ #define DSS_ERROR 0x00 /* Retraining an error */ #define DSS_CORPUS 0x01 /* Training a message from corpus */ #define DSS_INOCULATION 0x02 /* Message is an inoculation */ #define DSS_NONE 0xFF /* Standard inbound processing */ /* Statuses for token-status bit */ #define TST_DISK 0x01 #define TST_DIRTY 0x02 /* Token Types */ #define DTT_DEFAULT 0x00 #define DTT_BNR 0x01 #define DSP_UNCALCULATED -1 #define BURTON_WINDOW_SIZE 27 #endif /* _LIBDSPAM_OBJECTS */