source: npl/mailserver/dspam/dspam-3.10.2/src/libdspam_objects.h @ c5c522c

gcc484ntopperl-5.22
Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100644
File size: 11.7 KB
Line 
1/* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */
2
3/*
4 DSPAM
5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as
9 published by the Free Software Foundation, either version 3 of the
10 License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20*/
21
22#ifndef _LIBDSPAM_OBJECTS_H
23#  define _LIBDSPAM_OBJECTS_H
24
25#ifdef HAVE_CONFIG_H
26#include <auto-config.h>
27#endif
28
29#include <time.h>
30#include "config.h"
31#include "config_shared.h"
32#include "decode.h"
33
/*
 *  Fixed-width BSD-style integer names for Sun toolchains.
 *
 *  The typedefs are only emitted when one of the Sun macro pairs is present
 *  (__sun__ with __svr4__, or __sun with __SUNPRO_C) and nothing has already
 *  announced the types through __BIT_TYPES_DEFINED__.  Note that the
 *  !defined(u_int32_t) test can only detect a *macro* of that name, not an
 *  existing typedef.
 */
#if !defined(__BIT_TYPES_DEFINED__) && !defined(u_int32_t)
#  if (defined(__sun__) && defined(__svr4__)) || \
      (defined(__sun) && defined(__SUNPRO_C))
#    define __BIT_TYPES_DEFINED__
typedef unsigned char      u_int8_t;
typedef unsigned short     u_int16_t;
typedef unsigned int       u_int32_t;
typedef unsigned long long u_int64_t;
#  endif
#endif
41
/*
 *  Windows builds: provide u_int32_t (the type of DSPAM_CTX's flags and
 *  algorithms members) and a uid_t built on top of it.
 */
#if defined(_WIN32)
typedef unsigned int u_int32_t;
typedef u_int32_t    uid_t;
#endif
46
/*
 *  Handle to the storage driver library.  This is a declaration only; the
 *  object itself is defined outside this header.
 */
extern void *_drv_handle;
48
/*
 *  struct dspam_factor - One determining factor behind a classification
 *
 *  Each element describes a single factor that weighed in on the dominant
 *  calculation for a message.  libdspam returns an array of these to the
 *  calling application to explain its final classification decision.
 *
 *  token_name
 *    Name of the token this factor refers to.
 *
 *  value
 *    The factor's numeric value.
 *    NOTE(review): presumably the token's probability -- confirm in caller.
 */

struct dspam_factor
{
  char  *token_name;
  float  value;
};
61
/*
 *  struct _ds_spam_totals - Cumulative per-user statistics
 *
 *  These totals are loaded into the user's filter context by dspam_init()
 *  and represent the user's running message counts.  Every counter exists
 *  once for spam and once for innocent mail:
 *
 *  *_learned
 *    Messages that have been trained on.
 *
 *  *_misclassified
 *    Messages DSPAM got wrong and that were submitted for retraining.
 *
 *  *_corpusfed
 *    Messages supplied by the end-user for training.
 *
 *  *_classified
 *    Messages that were classified but not learned.  Only used with
 *    Train-on-Error mode.
 *
 *  NOTE: Do not reorder the members.  Some storage drivers (such as the
 *        Berkeley DB drivers) depend on this exact layout for
 *        backward-compatibility.
 */

struct _ds_spam_totals
{
  /* trained on */
  long spam_learned;
  long innocent_learned;

  /* misclassified and resubmitted */
  long spam_misclassified;
  long innocent_misclassified;

  /* supplied by the end-user */
  long spam_corpusfed;
  long innocent_corpusfed;

  /* classified only (Train-on-Error) */
  long spam_classified;
  long innocent_classified;
};
98
/*
 *  struct _ds_spam_stat - Per-token statistics
 *
 *  probability
 *    Probability calculated for the token by the active pvalue algorithm
 *    (chosen at configure-time).
 *
 *  spam_hits, innocent_hits
 *    How many times the token has been seen in each class of message.
 *    Under the Train-on-Error or Train-until-Mature training modes these
 *    counters are not necessarily updated for every message.
 *
 *  status
 *    TST_DISK   the value was loaded from the storage interface
 *    TST_DIRTY  the statistic was modified and not yet written to disk
 *
 *  offset
 *    NOTE(review): not described by the original documentation; presumably
 *    a position used by a storage backend -- confirm against the drivers.
 *
 *  ds_spam_stat_t is a POINTER to this structure.
 */

typedef struct _ds_spam_stat
{
  double        probability;
  long          spam_hits;
  long          innocent_hits;
  char          status;         /* TST_ */
  unsigned long offset;
} *ds_spam_stat_t;
125
/*
 *  struct _ds_spam_signature - A historical classification signature
 *
 *  The signature is a binary record of the original training instance.  It
 *  holds all of the metadata that went into the original decision about a
 *  message, which makes a 1:1 retraining possible when the message is later
 *  submitted for retraining (for example because it was misclassified).
 *
 *  The blob is a series of _ds_signature_token structures recording the
 *  tokens originally used and their frequency counts in the message.
 *  Signatures are temporary and are usually purged from disk after a short
 *  period of time.
 *
 *  data
 *    Pointer to the binary signature blob.
 *
 *  length
 *    Length of the blob.
 *    NOTE(review): presumably in bytes -- confirm against the writers.
 */

struct _ds_spam_signature
{
  void          *data;
  unsigned long  length;
};
144
/*
 *  struct _ds_signature_token - One entry of a classification signature
 *
 *  The binary blob of a _ds_spam_signature is made of these entries; each
 *  one represents a single data point from the original training instance.
 *
 *  token
 *    Checksum of the original token in the message.
 *
 *  frequency
 *    Frequency of the token in the original message (stored in a single
 *    unsigned char).
 */

struct _ds_signature_token
{
  unsigned long long token;
  unsigned char      frequency;
};
164
165/*
166 *  struct _ds_config - libdspam attributes configuration
167 *
168 *  Each  classification context may have an attributes  configuration
169 *  which  is read by various  components of libdspam.  This structure
170 *  contains an array of attributes and the size of the array.
171 */
172
173struct _ds_config
174{
175  config_t attributes;
176  long size;
177};
178
179/*
180 *  DSPAM_CTX - The DSPAM Classification Context
181 *
182 *  A classification context is attached directly to a filter instance
183 *  and supplies the entire context for the filter instance to operate
184 *  under.  This  includes  the  user  and group,  operational  flags,
185 *  training  mode, and  the message  being  operated  on. The  filter
186 *  instance also  sets specific output variables  within the  context
187 *  such  as the  result of a  classification,  confidence  level, and
188 *  etcetera.
189 *
190 *  username, group (input)
191 *    The current username and group that is being operated on.
192 *
193 *  totals (output)
194 *    The set of statistics loaded when dspam_init() is called.
195 *
196 *  signature (input, output)
197 *    The signature represents a DSPAM signature, and can be  supplied
198 *    as  an input  variable for  retraining  (e.g. in the  event of a
199 *    misclassification)  or  used as  an output  variable  to store a
200 *    signature  generated   by  the  filter  instance  during  normal
201 *    classification.
202 *
203 *  message (input)
204 *    The  message being operated on, post-actualization. This can  be
205 *    left NULL, and libdspam will automatically actualize the message
206 *
207 *  probability (output)
208 *    The probability of the resulting operation.  This is generally a
209 *    floating  point number  between 0 and  1, 1  being  the  highest
210 *    probability of high order classification.
211 *
212 *  result (output)
213 *    The  final result of the requested operation.  This is generally
214 *    either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED.
215 *
216 *  confidence (output)
217 *    The  confidence  that the  filter has  in  its  returned  result.
218 *    NOTE: Confidence is not always supported, and may be zero.
219 *
220 *  operating_mode (input)
221 *    Sets the operating mode of the filter instance.  This can be one
222 *    of the following:
223 *
224 *      DSM_PROCESS     Classify and learn the  supplied message using
225 *                      whatever training mode is specified
226 *
227 *      DSM_CLASSIFY    Classify the  supplied  message  only; do  not
228 *                      learn or update any counters.
229 *
230 *      DSM_TOOLS       Identifies that  the calling function is  from
231 *                      a utility, and no operation will be requested.
232 *
233 *  training_mode (input)
234 *    The training mode sets the type of training the filter  instance
235 *    should apply to the process. This can be one of:
236 *
237 *      DST_TEFT                Train-on-Everything
238 *                              Trains every single message  processed
239 *
240 *      DST_TOE                 Train-on-Error
241 *                              Trains only on a misclassification  or
242 *                              corpus-fed message.
243 *
244 *      DST_TUM                 Train-until-Mature
245 *                              Trains individual tokens based on  the
246 *                              maturity of the user's dictionary
247 *
248 *      DST_NOTRAIN             No Training
249 *                              Process the message but do not perform
250 *                              any training.
251 *  training_buffer (input)
252 *    Sets the amount  of training-loop buffering.  This  number is  a
253 *    range from 0-10  and changes  the amount of  token sedation used
254 *    during the training loop.  The higher the number, the more token
255 *    statistics are watered down  during initial  training to prevent
256 *    false  positives.  Setting  this  value to  zero results  in  no
257 *    sedation being performed.
258 *
259 *  flags (input)
260 *    Applies different fine-tuning behavior to the context:
261 *
262 *      DSF_NOISE               Apply Bayesian Noise Reduction logic
263 *      DSF_SIGNATURE           Signature is provided/requested
264 *      DSF_WHITELIST           Use automatic whitelisting logic
265 *      DSF_MERGED              Merge user/group data in memory
266 *      DSF_UNLEARN             Unlearn the message
267 *      DSF_BIAS                Assign processor bias to unknown tokens
268 *
269 *  tokenizer (input)
270 *    Specifies which tokenizer to use
271 *
272 *      DSZ_WORD                Use WORD (uniGram) tokenizer
273 *      DSZ_CHAIN               Use CHAIN (biGram) tokenizer
274 *      DSZ_SBPH                Use SBPH (Sparse Binary Polynomial Hashing) tokenizer
275 *      DSZ_OSB                 Use OSB (Orthogonal Sparse biGram) tokenizer
276 *
277 *  algorithms (input)
278 *    Optional API to override the default algorithms. This value is set
279 *    with the default compiled values whenever dspam_create() is called.
280 *
281 *      DSA_GRAHAM              Graham-Bayesian
282 *      DSA_BURTON              Burton-Bayesian
283 *      DSA_ROBINSON            Robinson's Geometric Mean Test
284 *      DSA_CHI_SQUARE          Fisher-Robinson's Chi-Square
285 *      DSA_NAIVE               Naive-Bayesian
286 *
287 *    P-Value Computations:
288 *
289 *      DSP_ROBINSON            Robinson's Technique
290 *      DSP_GRAHAM              Graham's Technique
291 *      DSP_MARKOV              Markov Weighted Technique
292 *
293 *  locked (output)
294 *    Identifies that the user's storage is presently locked
295 */
296
297typedef struct
298{
299  struct _ds_spam_totals        totals;
300  struct _ds_spam_signature *   signature;
301  struct _ds_message *          message;
302  struct _ds_config *           config;
303
304  char          *username;
305  char          *group;
306  char          *home;           /* DSPAM Home */
307  int           operating_mode;  /* DSM_ */
308  int           training_mode;   /* DST_ */
309  int           training_buffer; /* 0-10 */
310  int           wh_threshold;    /* Whitelisting Threshold (default 10) */
311  int           classification;  /* DSR_ */
312  int           source;          /* DSS_ */
313  int           learned;         /* Did we actually learn something? */
314  int           tokenizer;       /* DSZ_ */
315  u_int32_t     flags;
316  u_int32_t     algorithms;
317
318  int           result;
319  char          class[32];
320  float         probability;
321  float         confidence;
322
323  int           locked;
324  void *        storage;
325  time_t        _process_start;
326  int           _sig_provided;
327
328  struct nt *   factors;
329
330} DSPAM_CTX;
331
/*
 *  Processing Flags (bits for DSPAM_CTX.flags).  Each flag is a distinct
 *  bit; 0x01 and 0x40 are not assigned in this header.
 */

#define DSF_SIGNATURE           0x02    /* Signature is provided/requested */
#define DSF_BIAS                0x04    /* Assign processor bias to unknown tokens */
#define DSF_NOISE               0x08    /* Apply Bayesian Noise Reduction logic */
#define DSF_WHITELIST           0x10    /* Use automatic whitelisting logic */
#define DSF_MERGED              0x20    /* Merge user/group data in memory */
#define DSF_UNLEARN             0x80    /* Unlearn the message */
340
/*
 *  Tokenizers (values for DSPAM_CTX.tokenizer).  These are sequential
 *  values, not bit flags.
 */

#define DSZ_WORD                0x01    /* WORD (uniGram) tokenizer */
#define DSZ_CHAIN               0x02    /* CHAIN (biGram) tokenizer */
#define DSZ_SBPH                0x03    /* Sparse Binary Polynomial Hashing */
#define DSZ_OSB                 0x04    /* Orthogonal Sparse biGram */
347
/*
 *  Algorithms (bits for DSPAM_CTX.algorithms).  The DSA_ classification
 *  algorithms and the DSP_ p-value computations share one bit space.
 */

#define DSA_GRAHAM              0x01    /* Graham-Bayesian */
#define DSA_BURTON              0x02    /* Burton-Bayesian */
#define DSA_ROBINSON            0x04    /* Robinson's Geometric Mean Test */
#define DSA_CHI_SQUARE          0x08    /* Fisher-Robinson's Chi-Square */
#define DSP_ROBINSON            0x10    /* P-Value: Robinson's Technique */
#define DSP_GRAHAM              0x20    /* P-Value: Graham's Technique */
#define DSP_MARKOV              0x40    /* P-Value: Markov Weighted Technique */
#define DSA_NAIVE               0x80    /* Naive-Bayesian */
358
/*  Operating Modes (values for DSPAM_CTX.operating_mode)  */

#define DSM_PROCESS             0x00    /* Classify and learn the message */
#define DSM_TOOLS               0x01    /* Caller is a utility; no operation requested */
#define DSM_CLASSIFY            0x02    /* Classify only; no learning, no counter updates */
#define DSM_NONE                0xFF
365
/*  Training Modes (values for DSPAM_CTX.training_mode)  */

#define DST_TEFT                0x00    /* Train-on-Everything */
#define DST_TOE                 0x01    /* Train-on-Error */
#define DST_TUM                 0x02    /* Train-until-Mature */
#define DST_NOTRAIN             0xFE    /* No Training */
372
/*  Classification Results (DSR_ values, as used by DSPAM_CTX.result)  */

#define DSR_ISSPAM              0x01
#define DSR_ISINNOCENT          0x02
#define DSR_NONE                0xFF
378
/*  Classification Sources (DSS_ values, as used by DSPAM_CTX.source)  */

#define DSS_ERROR               0x00    /* Retraining an error */
#define DSS_CORPUS              0x01    /* Training a message from corpus */
#define DSS_INOCULATION         0x02    /* Message is an inoculation */
#define DSS_NONE                0xFF    /* Standard inbound processing */
385
/*  Statuses for the token-status bits (struct _ds_spam_stat.status)  */

#define TST_DISK                0x01    /* Value was loaded from the storage interface */
#define TST_DIRTY               0x02    /* Modified and not yet written to disk */
389
/*
 *  Token Types
 *  NOTE(review): BNR presumably stands for Bayesian Noise Reduction (see
 *  DSF_NOISE) -- confirm against the code that consumes these values.
 */

#define DTT_DEFAULT             0x00
#define DTT_BNR                 0x01
393
/*
 *  Sentinel value meaning "not calculated".  The negative literal is
 *  parenthesized so the macro always expands as a single operand, whatever
 *  expression it is dropped into.
 *  NOTE(review): presumably stored in a probability field before the value
 *  has been computed -- confirm at the use sites.
 */
#define DSP_UNCALCULATED        (-1)
395
/*
 *  NOTE(review): undocumented in the original; presumably the number of
 *  tokens considered by the Burton-Bayesian algorithm (DSA_BURTON) --
 *  confirm at the use sites.
 */
#define BURTON_WINDOW_SIZE      27
397
398#endif /* _LIBDSPAM_OBJECTS_H */
Note: See TracBrowser for help on using the repository browser.