source: npl/mailserver/dspam/dspam-3.10.2/src/bnr.c

Last change on this file was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100644
File size: 8.8 KB
Line 
1/* $Id: bnr.c,v 1.32 2011/06/28 00:13:48 sbajic Exp $ */
2
3/*
4 DSPAM
5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as
9 published by the Free Software Foundation, either version 3 of the
10 License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20*/
21
22/*
23 * bnr.c - bayesian noise reduction - contextual symmetry logic
24 *
25 * http://bnr.nuclearelephant.com
26 *
27 */
28
29#ifdef HAVE_CONFIG_H
30#include <auto-config.h>
31#endif
32
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <ctype.h>
37#include <math.h>
38#ifdef HAVE_UNISTD_H
39#include <unistd.h>
40#endif
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <errno.h>
44
45#include "bnr.h"
46
47/*
48 * bnr_init(): Create and initialize a new noise reduction context
49 * parameters:  type (int)      BNR_CHAR:  Token identifier as character arrays
50 *                              BNR_INDEX: Token identifiers as pointers
51 *              identifier (char)       An identifier to add to the pattern
52 *                                      name to identify the type of stream
53 *                             
54 * returns:     pointer to the new context
55 */
56
57BNR_CTX *bnr_init(int type, char identifier)
58{
59  BNR_CTX *BTX;
60
61  BTX = calloc(1, sizeof(BNR_CTX));
62  if (BTX == NULL) {
63    perror("memory allocation error: bnr_init() failed");
64    return NULL;
65  }
66
67  BTX->identifier  = identifier;
68  BTX->window_size = 3;
69  BTX->ex_radius   = 0.25;
70  BTX->in_radius   = 0.33;
71  BTX->stream     = bnr_list_create(type);
72  BTX->patterns   = bnr_hash_create(1543ul);
73  if (BTX->stream == NULL || BTX->patterns == NULL) {
74    perror("memory allocation error: bnr_init() failed");
75    bnr_list_destroy(BTX->stream);
76    bnr_hash_destroy(BTX->patterns);
77    free(BTX);
78    return NULL;
79  }
80
81  return BTX;
82}
83
84/*
85 * bnr_destroy(): Destroys a noise reduction context no longer being used
86 * parameters:  BTX (BNR_CTX *) The context to destroy
87 * returns:     0 on success
88 */
89
90int bnr_destroy(BNR_CTX *BTX) {
91  bnr_list_destroy(BTX->stream);
92  bnr_hash_destroy(BTX->patterns);
93  free(BTX);
94  return 0;
95}
96
97/*
98 * bnr_add(): Adds a token to the noise reduction stream. This function
99 *   should be called once for each token in the message body (in order).
100 *
101 * parameters:  BTX (BNR_CTX *) The noise reduction context to use
102 *              token (void *)  The token's name, or pointer if NT_INDEX
103 *              value (float)   The token's probability
104 * returns:     0 on success
105 */
106
107int bnr_add(BNR_CTX *BTX, void *token, float value) {
108
109  return (bnr_list_insert(BTX->stream, token, value) != NULL) ? 0 : EFAILURE;
110}
111
112/*
113 * bnr_instantiate(): Instantiates a series of patterns for the given stream.
114 *   This function should be called after all tokens are added to the stream.
115 *
116 * parameters:  BTX (BNR_CTX *)         The noise reduction context to use
117 * returns:     0 on success
118 */
119
120int bnr_instantiate(BNR_CTX *BTX) {
121  int BNR_SIZE = BTX->window_size;
122  float previous_bnr_probs[BNR_SIZE];
123  struct bnr_list_node *node_list;
124  struct bnr_list_c c_list;
125  char bnr_token[64];
126  int i;
127
128  for(i=0;i<BNR_SIZE;i++)
129    previous_bnr_probs[i] = 0.00000;
130
131  node_list = c_bnr_list_first(BTX->stream, &c_list);
132  while(node_list != NULL) {
133   
134    for(i=1;i<BNR_SIZE;i++) {
135      previous_bnr_probs[i-1] = previous_bnr_probs[i];
136    }
137
138    previous_bnr_probs[BNR_SIZE-1] = _bnr_round(node_list->value);
139    sprintf(bnr_token, "bnr.%c|", BTX->identifier);
140    for(i=0;i<BNR_SIZE;i++) {
141      char x[6];
142      snprintf(x, 6, "%01.2f_", previous_bnr_probs[i]);
143      strcat(bnr_token, x);
144    }
145
146#ifdef LIBBNR_VERBOSE_DEBUG
147    fprintf(stderr, "libbnr: instantiating pattern '%s'\n", bnr_token);
148#endif
149
150    bnr_hash_hit (BTX->patterns, bnr_token);
151    node_list = c_bnr_list_next(BTX->stream, &c_list);
152  }
153
154  return 0;
155}
156
157/*
158 * bnr_get_pattern(): Retrieves the next instantiated pattern.
159 *   This function should be called after a call to bnr_instantiate(). Each
160 *   call to bnr_get_pattern() will return the next instantiated pattern, which
161 *   should then be looked up by your classifier and assigned a value using
162 *   bnr_set_pattern().
163 *
164 * parameters:  BTX (BNR_CTX *) The noise reduction context to use
165 * returns:     The name of the next instantiated pattern in the context
166 */
167 
168char *bnr_get_pattern(BNR_CTX *BTX) {
169  struct bnr_hash_node *node;
170 
171  if (!BTX->pattern_iter) {
172    node = c_bnr_hash_first(BTX->patterns, &BTX->c_pattern);
173    BTX->pattern_iter = 1;
174  } else {
175    node = c_bnr_hash_next(BTX->patterns, &BTX->c_pattern);
176  }
177
178  if (node)
179    return node->name;
180
181  BTX->pattern_iter = 0;
182  return NULL;
183}
184
185/*
186 * bnr_set_pattern(): Sets the value of a pattern
187 *   This function should be called once for each pattern instantiated. The
188 *   name of the patterns can be retrieved using repeated calls to
189 *   bnr_get_pattern(). The value of the pattern should then be looked up by
190 *   the classifier and set in the context using this function.
191 *
192 * parameters:  BTX (BNR_CTX *)         The noise reduction context to use
193 *              name (const char *)     The name of the pattern to set
194 *              value (float)           The p-value of the pattern
195 * returns:     0 on success
196 */
197
198int bnr_set_pattern(BNR_CTX *BTX, const char *name, float value) {
199  return bnr_hash_set(BTX->patterns, name, value);
200}
201
202/*
203 * bnr_get_token() Retrieves the next token from the stream.
204 *   This function should be called after a call to bnr_finalize(). Each
205 *   call to bnr_get_token() will return the next token and set its elimination
206 *   status (by way of the passed-in variable).
207 * parameters:  BTX (BNR_CTX *)         The noise reduction context to use
208 * returns:     The name (or pointer) of the next non-eliminated token
209 */
210
211void *bnr_get_token(BNR_CTX *BTX, int *eliminated) {
212  struct bnr_list_node *node;
213
214  if (BTX->stream_iter == 0) {
215    BTX->stream_iter = 1;
216    node = c_bnr_list_first(BTX->stream, &BTX->c_stream);
217  } else {
218    node = c_bnr_list_next(BTX->stream, &BTX->c_stream);
219  }
220
221  if (node) {
222    if (node->eliminated)
223      *eliminated = 1;
224    else
225      *eliminated = 0;
226    return node->ptr;
227  }
228
229  BTX->stream_iter = 0;
230  return NULL;
231}
232
/*
 * _bnr_round(): [internal] Snap a value onto the 0.05 grid
 *   The value is scaled by 100 and truncated to an integer, then moved UP
 *   to the next multiple of 5 (values already on a multiple stay put), and
 *   scaled back. Note this is a round-up, not a round-to-nearest: 0.125
 *   becomes 0.15. Pattern names are built from the result, so the exact
 *   behavior must not change.
 *
 * parameters:  n (float)       Value to be rounded
 * returns:     Rounded value as a float
 */

float _bnr_round(float n) {
  int scaled = (n*100);
  int rem = scaled % 5;

  if (rem > 0)
    scaled += 5 - rem;  /* positive remainder: climb to the next multiple */
  else if (rem < 0)
    scaled -= rem;      /* negative remainder: climbing means dropping it */

  return (scaled/100.0);
}
245
246/*
247 * bnr_finalize() Finalizes the noise reduction context and performs dubbing
248 *   This function should be called after all calls to bnr_set_pattern() have
249 *   completed. This function performs the actual noise reduction process
250 *   after which calls to bnr_get_token() may be called.
251 *
252 * parameters:  BTX (BNR_CTX *) The noise reduction context to use
253 * returns:     0 on success
254 */
255
256int bnr_finalize(BNR_CTX *BTX) {
257  int BNR_SIZE = BTX->window_size;
258  struct bnr_list_node * previous_bnr_tokens[BNR_SIZE];
259  float previous_bnr_probs[BNR_SIZE];
260  struct bnr_list_node *node_list;
261  struct bnr_list_c c_list;
262  char bnr_token[64];
263  int i, interesting;
264
265  for(i=0;i<BNR_SIZE;i++) {
266    previous_bnr_probs[i] = 0.00000;
267    previous_bnr_tokens[i] = NULL;
268  }
269
270  node_list = c_bnr_list_first(BTX->stream, &c_list);
271  while(node_list != NULL) {
272    float pattern_value;
273
274    for(i=1;i<BNR_SIZE;i++) {
275      previous_bnr_probs[i-1] = previous_bnr_probs[i];
276      previous_bnr_tokens[i-1] = previous_bnr_tokens[i];
277    }
278
279    previous_bnr_probs[BNR_SIZE-1] = _bnr_round(node_list->value);
280    previous_bnr_tokens[BNR_SIZE-1] = node_list;
281
282    sprintf(bnr_token, "bnr.%c|", BTX->identifier);
283    for(i=0;i<BNR_SIZE;i++) {
284      char x[6];
285      snprintf(x, 6, "%01.2f_", previous_bnr_probs[i]);
286      strcat(bnr_token, x);
287    }
288
289    /* Identify interesting patterns */
290   
291    pattern_value = bnr_hash_value(BTX->patterns, bnr_token);
292    interesting = (fabs(0.5-pattern_value) > BTX->ex_radius);
293
294    if (interesting) {
295
296#ifdef LIBBNR_VERBOSE_DEBUG
297      fprintf(stderr, "Analyzing Pattern '%s' P-Value: %1.5f\n", bnr_token,
298        pattern_value);
299#endif
300
301      /* Eliminate inconsistent tokens */
302      for(i=0;i<BNR_SIZE;i++) {
303        if (previous_bnr_tokens[i]) {
304
305          /* If the token is inconsistent with the current pattern */
306          if (fabs(previous_bnr_tokens[i]->value - pattern_value) > BTX->in_radius)
307          {
308#ifdef LIBBNR_VERBOSE_DEBUG
309            fprintf(stderr, "\tEliminating '%s' P-Value: %1.5f\n",
310              (const char *) previous_bnr_tokens[i]->ptr,
311              previous_bnr_tokens[i]->value);
312#endif
313            BTX->eliminations++;
314            previous_bnr_tokens[i]->eliminated = 1;
315          }
316        }
317      }
318    }
319
320    node_list = c_bnr_list_next(BTX->stream, &c_list);
321  }
322
323  return 0;
324}
325
Note: See TracBrowser for help on using the repository browser.