Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

source: npl/mailserver/dspam/dspam-3.10.2/src/libdspam_objects.h

Last change on this file was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 9 years ago
initial commit, transferred from cleaned syn3 svn tree
Property mode set to `100644`
File size: 11.7 KB

Rev	Line
[c5c522c]	1	/* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */
	2
	3	/*
	4	DSPAM
	5	COPYRIGHT (C) 2002-2012 DSPAM PROJECT
	6
	7	This program is free software: you can redistribute it and/or modify
	8	it under the terms of the GNU Affero General Public License as
	9	published by the Free Software Foundation, either version 3 of the
	10	License, or (at your option) any later version.
	11
	12	This program is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	GNU Affero General Public License for more details.
	16
	17	You should have received a copy of the GNU Affero General Public License
	18	along with this program. If not, see <http://www.gnu.org/licenses/>.
	19
	20	*/
	21
	22	#ifndef _LIBDSPAM_OBJECTS_H
	23	# define _LIBDSPAM_OBJECTS_H
	24
	25	#ifdef HAVE_CONFIG_H
	26	#include <auto-config.h>
	27	#endif
	28
	29	#include <time.h>
	30	#include "config.h"
	31	#include "config_shared.h"
	32	#include "decode.h"
	33
	34	#if ((defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SUNPRO_C))) && !defined(u_int32_t) && !defined(__BIT_TYPES_DEFINED__) /* Sun/Solaris toolchains only. NOTE(review): defined() sees macros only, so it cannot detect an existing u_int32_t typedef */
	35	#define __BIT_TYPES_DEFINED__ /* makes the guard above false if it is evaluated again */
	36	typedef unsigned long long u_int64_t; /* widths of these four are assumed from the platform, not checked here */
	37	typedef unsigned int u_int32_t;
	38	typedef unsigned short u_int16_t;
	39	typedef unsigned char u_int8_t;
	40	#endif
	41
	42	#ifdef _WIN32 /* Windows builds: declare the two type names below locally -- presumably absent from the Windows headers; confirm */
	43	typedef unsigned int u_int32_t; /* type used for DSPAM_CTX.flags and DSPAM_CTX.algorithms */
	44	typedef u_int32_t uid_t; /* uid_t becomes an alias of u_int32_t here */
	45	#endif
	46
	47	extern void *_drv_handle; /* Handle to storage driver library; declared only here, defined in another translation unit */
	48
	49	/*
	50	* struct dspam_factor - A single determining factor
	51	*
	52	* An element containing a determining factor in the dominant calculation of
	53	* a message. An array of these is returned to the calling application to
	54	* explain libdspam's final classification decision.
	55	*/
	56
	57	struct dspam_factor { /* one determining factor behind a classification decision */
	58	char *token_name; /* presumably the name of the token this factor refers to -- confirm; ownership of the string is not visible in this header */
	59	float value; /* presumably the token's weight in the decision -- confirm against the code that fills it */
	60	};
	61
	62	/*
	63	* struct _ds_spam_totals - User spam totals
	64	*
	65	* Spam totals loaded into the user's filter context upon a call to
	66	* dspam_init(). This structure represents the user's cumulative statistics.
	67	*
	68	* spam_learned, innocent_learned
	69	* The total number of messages trained on.
	70	*
	71	* spam_misclassified, innocent_misclassified
	72	* The total number of messages that were misclassified by DSPAM, and
	73	* submitted for retraining.
	74	*
	75	* spam_classified, innocent_classified
	76	* The total number of messages that were classified by DSPAM, but not
	77	* learned. Used exclusively with Train-on-Error mode.
	78	*
	79	* spam_corpusfed, innocent_corpusfed
	80	* The total number of messages supplied by the end-user for training.
	81	*
	82	* NOTE: The ordering of the variables in the structure must remain
	83	* consistent to ensure backward-compatibility with some storage
	84	* drivers (such as the Berkeley DB drivers)
	85	*/
	86
	87	struct _ds_spam_totals /* per-user cumulative counters; do NOT reorder the members (storage-driver compatibility) */
	88	{
	89	long spam_learned; /* spam messages trained on */
	90	long innocent_learned; /* innocent messages trained on */
	91	long spam_misclassified; /* spam misclassified by DSPAM and submitted for retraining */
	92	long innocent_misclassified; /* innocent misclassified by DSPAM and submitted for retraining */
	93	long spam_corpusfed; /* spam supplied by the end-user for training */
	94	long innocent_corpusfed; /* innocent supplied by the end-user for training */
	95	long spam_classified; /* spam classified but not learned (Train-on-Error mode only) */
	96	long innocent_classified; /* innocent classified but not learned (Train-on-Error mode only) */
	97	};
	98
	99	/*
	100	* struct _ds_spam_stat - Statistics for a single token:
	101	*
	102	* probability
	103	* The calculated probability of the token based on the active pvalue
	104	* algorithm (selected at configure-time).
	105	*
	106	* spam_hits, innocent_hits
	107	* The total number of times the token has appeared in each class of
	108	* message. If Train-on-Error or Train-until-Mature training modes are
	109	* employed, these values will not necessarily be updated for every
	110	* message.
	111	*
	112	* status
	113	* TST_DISK Value was loaded from the storage interface
	114	* TST_DIRTY Statistic is dirty (not written to disk since last modified)
	115	*/
	116
	117	typedef struct _ds_spam_stat /* statistics for a single token */
	118	{
	119	double probability; /* token probability from the active pvalue algorithm */
	120	long spam_hits; /* times the token appeared in spam; not necessarily updated for every message in TOE/TUM modes */
	121	long innocent_hits; /* times the token appeared in innocent mail; same caveat as spam_hits */
	122	char status; /* TST_DISK and/or TST_DIRTY */
	123	unsigned long offset; /* NOTE(review): not described in the header comment; presumably a storage position -- confirm */
	124	} *ds_spam_stat_t; /* note: ds_spam_stat_t is a POINTER to struct _ds_spam_stat */
	125
	126	/*
	127	* struct _ds_spam_signature - A historical classification signature
	128	*
	129	* A binary representation of the original training instance. The spam
	130	* signature contains all the metadata used in the original decision
	131	* about the message, so that a 1:1 retraining can take place if the
	132	* message is submitted for retraining (e.g. was misclassified). The
	133	* signature contains a series of _ds_signature_token structures, which
	134	* house the original set of tokens used and their frequency counts in
	135	* the message. A spam signature is a temporary piece of data that is
	136	* usually purged from disk after a short period of time.
	137	*/
	138
	139	struct _ds_spam_signature /* binary record of one training instance, used for later retraining */
	140	{
	141	void *data; /* signature blob; described above as a series of struct _ds_signature_token entries */
	142	unsigned long length; /* presumably the size of data in bytes -- confirm */
	143	};
	144
	145	/*
	146	* struct _ds_signature_token - An entry in the classification signature
	147	*
	148	* A signature token is a single entry in the binary _ds_spam_signature
	149	* data blob, representing a single data point from the original
	150	* training instance.
	151	*
	152	* token
	153	* The checksum of the original token in the message
	154	*
	155	* frequency
	156	* The token's frequency in the original message
	157	*/
	158
	159	struct _ds_signature_token /* one entry of the signature blob */
	160	{
	161	unsigned long long token; /* checksum of the original token in the message */
	162	unsigned char frequency; /* token's frequency in the original message; the type caps it at UCHAR_MAX */
	163	};
	164
	165	/*
	166	* struct _ds_config - libdspam attributes configuration
	167	*
	168	* Each classification context may have an attributes configuration
	169	* which is read by various components of libdspam. This structure
	170	* contains an array of attributes and the size of the array.
	171	*/
	172
	173	struct _ds_config /* attributes configuration attached to a classification context */
	174	{
	175	config_t attributes; /* array of attributes; config_t is not declared in this header (presumably from config_shared.h -- confirm) */
	176	long size; /* size of the attributes array */
	177	};
	178
	179	/*
	180	* DSPAM_CTX - The DSPAM Classification Context
	181	*
	182	* A classification context is attached directly to a filter instance
	183	* and supplies the entire context for the filter instance to operate
	184	* under. This includes the user and group, operational flags,
	185	* training mode, and the message being operated on. The filter
	186	* instance also sets specific output variables within the context
	187	* such as the result of a classification, confidence level, and
	188	* so on.
	189	*
	190	* username, group (input)
	191	* The current username and group that is being operated on.
	192	*
	193	* totals (output)
	194	* The set of statistics loaded when dspam_init() is called.
	195	*
	196	* signature (input, output)
	197	* The signature represents a DSPAM signature, and can be supplied
	198	* as an input variable for retraining (e.g. in the event of a
	199	* misclassification) or used as an output variable to store a
	200	* signature generated by the filter instance during normal
	201	* classification.
	202	*
	203	* message (input)
	204	* The message being operated on, post-actualization. This can be
	205	* left NULL, and libdspam will automatically actualize the message
	206	*
	207	* probability (output)
	208	* The probability of the resulting operation. This is generally a
	209	* floating point number between 0 and 1, 1 being the highest
	210	* probability of high order classification.
	211	*
	212	* result (output)
	213	* The final result of the requested operation. This is generally
	214	* either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED.
	215	*
	216	* confidence (output)
	217	* The confidence that the filter has in its returned result.
	218	* NOTE: Confidence is not always supported, and may be zero.
	219	*
	220	* operating_mode (input)
	221	* Sets the operating mode of the filter instance. This can be one
	222	* of the following:
	223	*
	224	* DSM_PROCESS Classify and learn the supplied message using
	225	* whatever training mode is specified
	226	*
	227	* DSM_CLASSIFY Classify the supplied message only; do not
	228	* learn or update any counters.
	229	*
	230	* DSM_TOOLS Identifies that the calling function is from
	231	* a utility, and no operation will be requested.
	232	*
	233	* training_mode (input)
	234	* The training mode sets the type of training the filter instance
	235	* should apply to the process. This can be one of:
	236	*
	237	* DST_TEFT Train-on-Everything
	238	* Trains every single message processed
	239	*
	240	* DST_TOE Train-on-Error
	241	* Trains only on a misclassification or
	242	* corpus-fed message.
	243	*
	244	* DST_TUM Train-until-Mature
	245	* Trains individual tokens based on the
	246	* maturity of the user's dictionary
	247	*
	248	* DST_NOTRAIN No Training
	249	* Process the message but do not perform
	250	* any training.
	251	* training_buffer (input)
	252	* Sets the amount of training-loop buffering. This number is a
	253	* range from 0-10 and changes the amount of token sedation used
	254	* during the training loop. The higher the number, the more token
	255	* statistics are watered down during initial training to prevent
	256	* false positives. Setting this value to zero results in no
	257	* sedation being performed.
	258	*
	259	* flags (input)
	260	* Applies different fine-tuning behavior to the context:
	261	*
	262	* DSF_NOISE Apply Bayesian Noise Reduction logic
	263	* DSF_SIGNATURE Signature is provided/requested
	264	* DSF_WHITELIST Use automatic whitelisting logic
	265	* DSF_MERGED Merge user/group data in memory
	266	* DSF_UNLEARN Unlearn the message
	267	* DSF_BIAS Assign processor bias to unknown tokens
	268	*
	269	* tokenizer (input)
	270	* Specifies which tokenizer to use
	271	*
	272	* DSZ_WORD Use WORD (uniGram) tokenizer
	273	* DSZ_CHAIN Use CHAIN (biGram) tokenizer
	274	* DSZ_SBPH Use SBPH (Sparse Binary Polynomial Hashing) tokenizer
	275	* DSZ_OSB Use OSB (Orthogonal Sparse biGram) tokenizer
	276	*
	277	* algorithms (input)
	278	* Optional API to override the default algorithms. This value is set
	279	* with the default compiled values whenever dspam_create() is called.
	280	*
	281	* DSA_GRAHAM Graham-Bayesian
	282	* DSA_BURTON Burton-Bayesian
	283	* DSA_ROBINSON Robinson's Geometric Mean Test
	284	* DSA_CHI_SQUARE Fisher-Robinson's Chi-Square
	285	* DSA_NAIVE Naive-Bayesian
	286	*
	287	* P-Value Computations:
	288	*
	289	* DSP_ROBINSON Robinson's Technique
	290	* DSP_GRAHAM Graham's Technique
	291	* DSP_MARKOV Markov Weighted Technique
	292	*
	293	* locked (output)
	294	* Identifies that the user's storage is presently locked
	295	*/
	296
	297	typedef struct /* DSPAM classification context: inputs and outputs of one filter instance */
	298	{
	299	struct _ds_spam_totals totals; /* (output) statistics loaded when dspam_init() is called */
	300	struct _ds_spam_signature * signature; /* (input, output) supplied for retraining, or generated during classification */
	301	struct _ds_message * message; /* (input) message being operated on; may be left NULL */
	302	struct _ds_config * config; /* attributes configuration for this context */
	303
	304	char *username; /* (input) user being operated on */
	305	char *group; /* (input) group being operated on */
	306	char *home; /* DSPAM Home */
	307	int operating_mode; /* DSM_ */
	308	int training_mode; /* DST_ */
	309	int training_buffer; /* 0-10; 0 disables token sedation */
	310	int wh_threshold; /* Whitelisting Threshold (default 10) */
	311	int classification; /* DSR_ */
	312	int source; /* DSS_ */
	313	int learned; /* Did we actually learn something? */
	314	int tokenizer; /* DSZ_ */
	315	u_int32_t flags; /* (input) DSF_ bits */
	316	u_int32_t algorithms; /* (input) DSA_ and DSP_ bits */
	317
	318	int result; /* (output) final result of the requested operation, a DSR_ value */
	319	char class[32]; /* NOTE(review): not described in the comment above; 'class' is a C++ keyword, so this header cannot be included from C++ as-is */
	320	float probability; /* (output) generally between 0 and 1 */
	321	float confidence; /* (output) not always supported, may be zero */
	322
	323	int locked; /* (output) the user's storage is presently locked */
	324	void * storage; /* NOTE(review): opaque here; presumably storage-driver state -- confirm */
	325	time_t _process_start; /* NOTE(review): undocumented here; presumably internal bookkeeping -- confirm */
	326	int _sig_provided; /* NOTE(review): undocumented here; presumably set when a signature was supplied as input -- confirm */
	327
	328	struct nt * factors; /* NOTE(review): struct nt is not declared in this header; presumably holds the struct dspam_factor entries -- confirm */
	329
	330	} DSPAM_CTX;
	331
	332	/* Processing Flags: values for DSPAM_CTX.flags; each is a distinct single bit (0x01 and 0x40 are not assigned here) */
	333
	334	#define DSF_SIGNATURE 0x02 /* Signature is provided/requested */
	335	#define DSF_BIAS 0x04 /* Assign processor bias to unknown tokens */
	336	#define DSF_NOISE 0x08 /* Apply Bayesian Noise Reduction logic */
	337	#define DSF_WHITELIST 0x10 /* Use automatic whitelisting logic */
	338	#define DSF_MERGED 0x20 /* Merge user/group data in memory */
	339	#define DSF_UNLEARN 0x80 /* Unlearn the message */
	340
	341	/* Tokenizers: values for DSPAM_CTX.tokenizer; consecutive values, not independent bits */
	342
	343	#define DSZ_WORD 0x01 /* WORD (uniGram) tokenizer */
	344	#define DSZ_CHAIN 0x02 /* CHAIN (biGram) tokenizer */
	345	#define DSZ_SBPH 0x03 /* SBPH (Sparse Binary Polynomial Hashing) tokenizer */
	346	#define DSZ_OSB 0x04 /* OSB (Orthogonal Sparse biGram) tokenizer */
	347
	348	/* Algorithms: values for DSPAM_CTX.algorithms; DSA_ (algorithm) and DSP_ (p-value computation) share one set of distinct single bits */
	349
	350	#define DSA_GRAHAM 0x01 /* Graham-Bayesian */
	351	#define DSA_BURTON 0x02 /* Burton-Bayesian */
	352	#define DSA_ROBINSON 0x04 /* Robinson's Geometric Mean Test */
	353	#define DSA_CHI_SQUARE 0x08 /* Fisher-Robinson's Chi-Square */
	354	#define DSP_ROBINSON 0x10 /* p-value: Robinson's Technique */
	355	#define DSP_GRAHAM 0x20 /* p-value: Graham's Technique */
	356	#define DSP_MARKOV 0x40 /* p-value: Markov Weighted Technique */
	357	#define DSA_NAIVE 0x80 /* Naive-Bayesian (a DSA_ value placed after the DSP_ bits) */
	358
	359	/* Operating Modes: values for DSPAM_CTX.operating_mode */
	360
	361	#define DSM_PROCESS 0x00 /* Classify and learn the message using whatever training mode is specified */
	362	#define DSM_TOOLS 0x01 /* Caller is a utility; no operation will be requested */
	363	#define DSM_CLASSIFY 0x02 /* Classify only; do not learn or update any counters */
	364	#define DSM_NONE 0xFF /* NOTE(review): not described in the DSPAM_CTX comment; presumably "no mode set" -- confirm */
	365
	366	/* Training Modes: values for DSPAM_CTX.training_mode */
	367
	368	#define DST_TEFT 0x00 /* Train-on-Everything: trains every message processed */
	369	#define DST_TOE 0x01 /* Train-on-Error: trains only on a misclassification or corpus-fed message */
	370	#define DST_TUM 0x02 /* Train-until-Mature: trains tokens based on the maturity of the user's dictionary */
	371	#define DST_NOTRAIN 0xFE /* No Training: process the message but do not train */
	372
	373	/* Classification Results: DSR_ values (see DSPAM_CTX.result / .classification). NOTE(review): the DSPAM_CTX comment also names DSR_WHITELISTED, which is not defined in this list */
	374
	375	#define DSR_ISSPAM 0x01 /* message is spam */
	376	#define DSR_ISINNOCENT 0x02 /* message is innocent */
	377	#define DSR_NONE 0xFF /* presumably "no result" -- confirm */
	378
	379	/* Classification Sources: values for DSPAM_CTX.source */
	380
	381	#define DSS_ERROR 0x00 /* Retraining an error */
	382	#define DSS_CORPUS 0x01 /* Training a message from corpus */
	383	#define DSS_INOCULATION 0x02 /* Message is an inoculation */
	384	#define DSS_NONE 0xFF /* Standard inbound processing */
	385
	386	/* Statuses for token-status bit (struct _ds_spam_stat.status); distinct single bits */
	387	#define TST_DISK 0x01 /* Value was loaded from the storage interface */
	388	#define TST_DIRTY 0x02 /* Statistic is dirty (not written to disk since last modified) */
	389
	390	/* Token Types. NOTE(review): the field these values are stored in is not visible in this header */
	391	#define DTT_DEFAULT 0x00 /* default token type */
	392	#define DTT_BNR 0x01 /* presumably a Bayesian Noise Reduction token (cf. DSF_NOISE) -- confirm */
	393
	394	#define DSP_UNCALCULATED -1 /* presumably marks a not-yet-computed probability -- confirm. NOTE(review): the negative replacement list is unparenthesized; consider (-1) once all uses are checked */
	395
	396	#define BURTON_WINDOW_SIZE 27 /* NOTE(review): meaning not documented here; presumably the window size used with DSA_BURTON -- confirm */
	397
	398	#endif /* _LIBDSPAM_OBJECTS_H */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: