Context Navigation

source: npl/mailserver/dspam/dspam-3.10.2/src/libdspam.c @ c5c522c

gcc484ntopperl-5.22

Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago
initial commit, transferred from cleaned syn3 svn tree
Property mode set to `100644`
File size: 72.9 KB

Line
1	/* $Id: libdspam.c,v 1.205 2011/07/13 00:51:46 sbajic Exp $ */
2
3	/*
4	DSPAM
5	COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7	This program is free software: you can redistribute it and/or modify
8	it under the terms of the GNU Affero General Public License as
9	published by the Free Software Foundation, either version 3 of the
10	License, or (at your option) any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU Affero General Public License for more details.
16
17	You should have received a copy of the GNU Affero General Public License
18	along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20	*/
21
22	/*
23	* libdspam.c - DSPAM core analytical engine
24	*
25	* DESCRIPTION
26	* libdspam is at the core of the decision making process and is called
27	* by the agent to perform all tasks related to message classification.
28	* The libdspam API functions are documented in libdspam(1).
29	*/
30
31	#ifndef STATIC_DRIVER
32	void *_drv_handle;
33	#endif
34
35	#ifdef HAVE_CONFIG_H
36	#include <auto-config.h>
37	#endif
38
39	#include <stdio.h>
40	#include <stdlib.h>
41	#include <math.h>
42	#include <ctype.h>
43	#include <errno.h>
44	#include <string.h>
45	#ifdef HAVE_UNISTD_H
46	#include <unistd.h>
47	#endif
48	#include <sys/types.h>
49	#include <sys/stat.h>
50	#include <fcntl.h>
51	#include <dlfcn.h>
52
53	#ifdef TIME_WITH_SYS_TIME
54	# include <sys/time.h>
55	# include <time.h>
56	#else
57	# ifdef HAVE_SYS_TIME_H
58	# include <sys/time.h>
59	# else
60	# include <time.h>
61	# endif
62	#endif
63
64	#include "config.h"
65	#include "libdspam_objects.h"
66	#include "libdspam.h"
67	#include "nodetree.h"
68	#include "config.h"
69	#include "base64.h"
70	#include "bnr.h"
71	#include "util.h"
72	#include "storage_driver.h"
73	#include "buffer.h"
74	#include "heap.h"
75	#include "error.h"
76	#include "decode.h"
77	#include "language.h"
78
79	#define CHI_S 0.1 /* Chi-Sq Strength */
80	#define CHI_X 0.5000 /* Chi-Sq Assumed Probability */
81
82	#define C1 16 /* Markov C1 */
83	#define C2 1 /* Markov C2 */
84
85	#ifdef DEBUG
86	int DO_DEBUG = 0;
87	#endif
88
89	/*
90	* dspam_init()
91	*
92	* DESCRIPTION
93	* The dspam_init() function creates and initializes a new classification
94	* context and attaches the context to whatever backend storage facility
95	* was configured. The user and group arguments provided are used to read
96	* and write information stored for the user and group specified. The home
97	* argument is used to configure libdspam's storage around the base direc-
98	* tory specified. The mode specifies the operating mode to initialize the
99	* classification context with and may be one of:
100	*
101	* DSM_PROCESS Process the message and return a result
102	* DSM_CLASSIFY Classify message only, no learning
103	* DSM_TOOLS No processing, attach to storage only
104	*
105	* The flags provided further tune the classification context for a spe-
106	* cific function. Multiple flags may be OR'd together.
107	*
108	* DSF_SIGNATURE A binary signature is requested/provided
109	* DSF_NOISE Apply Bayesian Noise Reduction logic
110	* DSF_WHITELIST Use automatic whitelisting logic
111	* DSF_MERGED Merge group metadata with user's in memory
112	*
113	* RETURN VALUES
114	* Upon successful completion, dspam_init() will return a pointer to a new
115	* classification context structure containing a copy of the configuration
116	* passed into dspam_init(), a connected storage driver handle, and a set
117	* of preliminary user control data read from storage.
118	*/
119
120	DSPAM_CTX * dspam_init (
121	const char *username,
122	const char *group,
123	const char *home,
124	int operating_mode,
125	u_int32_t flags)
126	{
127	DSPAM_CTX *CTX = dspam_create(username, group, home, operating_mode, flags);
128
129	if (CTX == NULL)
130	return NULL;
131
132	if (!dspam_attach(CTX, NULL))
133	return CTX;
134
135	dspam_destroy(CTX);
136
137	return NULL;
138	}
139
140	/* dspam_create()
141	*
142	* DESCRIPTION
143	* The dspam_create() function performs in exactly the same manner as the
144	* dspam_init() function, but does not attach to storage. Instead, the
145	* caller must also call dspam_attach() after setting any storage- spe-
146	* cific attributes using dspam_addattribute(). This is useful for cases
147	* where the implementor would prefer to configure storage internally
148	* rather than having libdspam read a configuration from a file.
149	*
150	* RETURN VALUES
151	* Upon successful completion, dspam_create() will return a pointer to a new
152	* classification context structure containing a copy of the configuration
153	* passed into dspam_create(). At this point, dspam_attach() must be called
154	* for further processing.
155	*/
156
157	DSPAM_CTX * dspam_create (
158	const char *username,
159	const char *group,
160	const char *home,
161	int operating_mode,
162	u_int32_t flags)
163	{
164	DSPAM_CTX *CTX;
165
166	CTX = calloc (1, sizeof (DSPAM_CTX));
167	if (CTX == NULL) {
168	LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context");
169	return NULL;
170	}
171
172	CTX->config = calloc(1, sizeof(struct _ds_config));
173	if (CTX->config == NULL) {
174	LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context configuration");
175	LOG(LOG_CRIT, ERR_MEM_ALLOC);
176	goto bail;
177	}
178
179	CTX->config->size = 128;
180	CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size));
181	if (CTX->config->attributes == NULL) {
182	LOG(LOG_WARNING, "dspam_create: unable to allocate space for classification context attributes");
183	LOG(LOG_CRIT, ERR_MEM_ALLOC);
184	goto bail;
185	}
186
187	if (home != NULL && home[0] != 0)
188	CTX->home = strdup (home);
189	else {
190	#ifdef DSPAM_HOME
191	CTX->home = strdup(DSPAM_HOME);
192	#else
193	CTX->home = NULL;
194	#endif
195	}
196
197	if (username != NULL && username[0] != 0)
198	CTX->username = strdup (username);
199	else
200	CTX->username = NULL;
201
202	if (group != NULL && group[0] != 0)
203	CTX->group = strdup (group);
204	else
205	CTX->group = NULL;
206
207	CTX->probability = DSP_UNCALCULATED;
208	CTX->operating_mode = operating_mode;
209	CTX->flags = flags;
210	CTX->message = NULL;
211	CTX->confidence = 0;
212	CTX->training_mode = DST_TEFT;
213	CTX->wh_threshold = 10;
214	CTX->training_buffer = 0;
215	CTX->classification = DSR_NONE;
216	CTX->source = DSS_NONE;
217	CTX->_sig_provided = 0;
218	CTX->factors = NULL;
219	CTX->algorithms = 0;
220	CTX->tokenizer = DSZ_WORD;
221
222	return CTX;
223
224	bail:
225	if (CTX != NULL) {
226	if (CTX->config != NULL) {
227	if (CTX->config->attributes != NULL)
228	_ds_destroy_config(CTX->config->attributes);
229	free(CTX->config);
230	}
231	if (CTX->username != NULL)
232	free(CTX->username);
233	if (CTX->group != NULL)
234	free(CTX->group);
235	if (CTX->home != NULL)
236	free(CTX->home);
237	free(CTX);
238	}
239	return NULL;
240	}
241
242	/*
243	* dspam_clearattributes()
244	*
245	* DESCRIPTION
246	* The dspam_clearattributes() function is called to clear any attributes
247	* previously set using dspam_addattribute() within the classification
248	* context. It is necessary to call this function prior to replacing any
249	* attributes already written.
250	*
251	* RETURN VALUES
252	* returns 0 on success, standard errors on failure
253	*
254	*/
255
256	int dspam_clearattributes (DSPAM_CTX * CTX) {
257
258	if (CTX->config) {
259	_ds_destroy_config(CTX->config->attributes);
260	free(CTX->config);
261	} else {
262	return EFAILURE;
263	}
264
265	CTX->config = calloc(1, sizeof(struct _ds_config));
266	if (CTX->config == NULL)
267	goto bail;
268	CTX->config->size = 128;
269	CTX->config->attributes = calloc(1, sizeof(attribute_t)*(CTX->config->size));
270	if (CTX->config->attributes == NULL)
271	goto bail;
272
273	return 0;
274
275	bail:
276	if (CTX->config != NULL) {
277	free(CTX->config);
278	CTX->config = NULL;
279	}
280	LOG(LOG_CRIT, ERR_MEM_ALLOC);
281	return EUNKNOWN;
282	}
283
284	/*
285	* dspam_addattribute()
286	*
287	* DESCRIPTION
288	* The dspam_addattribute() function is called to set attributes within
289	* the classification context. Some storage drivers support the use of
290	* passing specific attributes such as server connect information. The
291	* driver-independent attributes supported by DSPAM include:
292	*
293	* IgnoreHeader Specify a specific header to ignore
294	* LocalMX Specify a local mail exchanger to assist in
295	* correct results from dspam_getsource().
296	*
297	* Only driver-dependent attributes need be set prior to a call to
298	* dspam_attach(). Driver-independent attributes may be set both before
299	* and after storage has been attached.
300	*
301	* RETURN VALUES
302	* returns 0 on success, standard errors on failure
303	*/
304	int dspam_addattribute (DSPAM_CTX * CTX, const char key, const char value) {
305	int i, j = 0;
306
307	if (_ds_find_attribute(CTX->config->attributes, key))
308	return _ds_add_attribute(CTX->config->attributes, key, value);
309
310	for(i=0;CTX->config->attributes[i];i++)
311	j++;
312
313	if (j >= CTX->config->size) {
314	config_t ptr;
315	CTX->config->size *= 2;
316	ptr = realloc(CTX->config->attributes,
317	1+(sizeof(attribute_t)*CTX->config->size));
318	if (ptr) {
319	CTX->config->attributes = ptr;
320	} else {
321	LOG(LOG_CRIT, ERR_MEM_ALLOC);
322	return EFAILURE;
323	}
324	}
325
326	return _ds_add_attribute(CTX->config->attributes, key, value);
327	}
328
329	/*
330	* dspam_attach()
331	*
332	* DESCRIPTION
333	* The dspam_attach() function attaches the storage interface to the clas-
334	* sification context and alternatively established an initial connection
335	* with storage if dbh is NULL. Some storage drivers support only a NULL
336	* value for dbh, while others (such as mysql_drv, pgsql_drv, and
337	* sqlite_drv) allow an open database handle to be attached. This function
338	* should only be called after an initial call to dspam_create() and
339	* should never be called if using dspam_init(), as storage is automati-
340	* cally attached by a call to dspam_init().
341	*
342	* RETURN VALUES
343	* returns 0 on success, standard errors on failure
344	*/
345
346	int dspam_attach (DSPAM_CTX CTX, void dbh) {
347	if (!_ds_init_storage (CTX, dbh))
348	return 0;
349
350	return EFAILURE;
351	}
352
353	/*
354	* dspam_detach()
355	*
356	* DESCRIPTION
357	* The dspam_detach() function can be called when a detachment from stor-
358	* age is desired, but the context is still needed. The storage driver is
359	* closed, leaving the classification context in place. Once the context
360	* is no longer needed, another call to dspam_destroy() should be made. If
361	* you are closing storage and destroying the context at the same time, it
362	* is not necessary to call this function. Instead you may call
363	* dspam_destroy() directly.
364	*
365	* RETURN VALUES
366	* returns 0 on success, standard errors on failure
367	*/
368
369	int
370	dspam_detach (DSPAM_CTX * CTX)
371	{
372	if (CTX->storage != NULL) {
373
374	/* Sanity check totals before our shutdown call writes them */
375
376	if (CTX->totals.spam_learned < 0)
377	CTX->totals.spam_learned = 0;
378	if (CTX->totals.innocent_learned < 0)
379	CTX->totals.innocent_learned = 0;
380	if (CTX->totals.spam_misclassified < 0)
381	CTX->totals.spam_misclassified = 0;
382	if (CTX->totals.innocent_misclassified < 0)
383	CTX->totals.innocent_misclassified = 0;
384	if (CTX->totals.spam_classified < 0)
385	CTX->totals.spam_classified = 0;
386	if (CTX->totals.innocent_classified < 0)
387	CTX->totals.innocent_classified = 0;
388
389	_ds_shutdown_storage (CTX);
390	free(CTX->storage);
391	CTX->storage = NULL;
392	}
393
394	return 0;
395	}
396
397	/*
398	* dspam_destroy()
399	*
400	* The dspam_destroy() function should be called when the context is no
401	* longer needed. If a connection was established to storage internally,
402	* the connection is closed and all data is flushed and written. If a han-
403	* dle was attached, the handle will remain open.
404	*/
405
406	void
407	dspam_destroy (DSPAM_CTX * CTX)
408	{
409	if (CTX->storage != NULL)
410	dspam_detach(CTX);
411
412	_ds_factor_destroy(CTX->factors);
413	if (CTX->config && CTX->config->attributes)
414	_ds_destroy_config (CTX->config->attributes);
415
416	free (CTX->config);
417	free (CTX->username);
418	free (CTX->group);
419	free (CTX->home);
420
421	if (! CTX->_sig_provided && CTX->signature != NULL) {
422	if (CTX->signature->data != NULL)
423	free (CTX->signature->data);
424	free (CTX->signature);
425	}
426
427	if (CTX->message)
428	_ds_destroy_message(CTX->message);
429	free (CTX);
430	return;
431	}
432
433	/*
434	* dspam_process()
435	*
436	* DESCRIPTION
437	* The dspam_process() function performs analysis of the message passed
438	* into it and will return zero on successful completion. If successful,
439	* CTX->result will be set to one of three classification results:
440	*
441	* DSR_ISSPAM Message was classified as spam
442	* DSR_ISINNOCENT Message was classified as nonspam
443	*
444	* RETURN VALUES
445	* returns 0 on success
446	*
447	* EINVAL An invalid call or invalid parameter used.
448	* EUNKNOWN Unexpected error, such as malloc() failure
449	* EFILE Error opening or writing to a file or file handle
450	* ELOCK Locking failure
451	* EFAILURE The operation itself has failed
452	*/
453
454	int
455	dspam_process (DSPAM_CTX * CTX, const char *message)
456	{
457	#ifdef DEBUG
458	struct timeval tp1, tp2;
459	struct timezone tzp;
460	#endif
461	buffer header, body;
462	int spam_result = 0, is_toe = 0, is_undertrain = 0, retcode = 0;
463
464	#ifdef DEBUG
465	gettimeofday(&tp1, &tzp);
466	#endif
467
468	if (CTX->signature != NULL)
469	CTX->_sig_provided = 1;
470
471	/* Sanity check context behavior */
472
473	if (CTX->operating_mode == DSM_CLASSIFY && CTX->classification != DSR_NONE)
474	{
475	LOG(LOG_WARNING, "DSM_CLASSIFY can't be used with a classification");
476	return EINVAL;
477	}
478
479	if (CTX->algorithms == 0)
480	{
481	LOG(LOG_WARNING, "No algorithms configured. Use CTX->algorithms and DSA_");
482	return EINVAL;
483	}
484
485	if (CTX->classification != DSR_NONE && CTX->source == DSS_NONE)
486	{
487	LOG(LOG_WARNING, "A classification requires a source be specified");
488	return EINVAL;
489	}
490
491	if (CTX->classification == DSR_NONE && CTX->source != DSS_NONE)
492	{
493	LOG(LOG_WARNING, "A source requires a classification be specified");
494	return EINVAL;
495	}
496
497	/* Set TOE mode pretrain option if we haven't seen many messages yet */
498	if (CTX->training_mode == DST_TOE
499	&& (CTX->totals.innocent_learned <= 100 \|\| CTX->totals.spam_learned <= 100)
500	&& (!(CTX->algorithms & DSP_MARKOV)))
501	{
502	is_undertrain = 1;
503	CTX->training_mode = DST_TEFT;
504	}
505
506	/* Classify only for TOE / NOTRAIN mode setting if data is mature enough */
507	if ( CTX->operating_mode == DSM_PROCESS
508	&& CTX->classification == DSR_NONE
509	&& (CTX->training_mode == DST_TOE \|\| CTX->training_mode == DST_NOTRAIN))
510	{
511	CTX->operating_mode = DSM_CLASSIFY;
512	is_toe = 1;
513	}
514
515	/* A signature has been presented for training; process it */
516	/* Non-SPBH Signature */
517	if (CTX->operating_mode == DSM_PROCESS
518	&& CTX->classification != DSR_NONE
519	&& CTX->flags & DSF_SIGNATURE
520	&& (CTX->tokenizer != DSZ_SBPH))
521	{
522	retcode = _ds_process_signature (CTX);
523	goto restore_mode;
524	}
525
526	header = buffer_create (NULL);
527	body = buffer_create (NULL);
528	if (header == NULL \|\| body == NULL)
529	{
530	LOG (LOG_CRIT, ERR_MEM_ALLOC);
531	buffer_destroy (header);
532	buffer_destroy (body);
533	retcode = EUNKNOWN;
534	goto restore_mode;
535	}
536
537	/* Parse the message if it hasn't already been by the client app */
538	if (!CTX->message && message)
539	CTX->message = _ds_actualize_message (message);
540
541	/* Analyze and filter (unless it's a signature based classification) */
542	if (! (CTX->flags & DSF_SIGNATURE
543	&& CTX->operating_mode == DSM_CLASSIFY
544	&& CTX->signature != NULL))
545	{
546	_ds_degenerate_message(CTX, header, body);
547	}
548
549	/* Perform statistical operations and get a classification result */
550
551	/* Initialize */
552	CTX->result = DSR_NONE;
553
554	/* If SBPH reclassification, recall and operate on saved SBPH text */
555
556	if ( CTX->tokenizer == DSZ_SBPH
557	&& CTX->operating_mode != DSM_CLASSIFY
558	&& CTX->classification != DSR_NONE
559	&& CTX->flags & DSF_SIGNATURE)
560	{
561	char y, h, *b;
562	char *ptrptr = NULL;
563
564	y = strdup((const char *) CTX->signature->data);
565	h = strtok_r(y, "\001", &ptrptr);
566	b = strtok_r(NULL, "\001", &ptrptr);
567	spam_result = _ds_operate (CTX, h, b);
568	free(y);
569
570	/* Otherwise, operate on the input message */
571
572	} else {
573	spam_result = _ds_operate (CTX, header->data, body->data);
574	}
575
576	/* Clean up */
577	buffer_destroy (header);
578	buffer_destroy (body);
579
580	/* _ds_operate() was unable to process message. Restore operating and training mode. */
581	if (spam_result != DSR_ISSPAM && spam_result != DSR_ISINNOCENT) {
582	LOG(LOG_WARNING, "received invalid result (!DSR_ISSPAM && !DSR_ISINNOCENT)"
583	": %d", spam_result);
584	retcode = EFAILURE;
585	goto restore_mode;
586	}
587
588	/* Force decision if a classification was specified */
589	if (CTX->classification != DSR_NONE) {
590	if (CTX->classification == DSR_ISINNOCENT)
591	spam_result = DSR_ISINNOCENT;
592	else if (CTX->classification == DSR_ISSPAM)
593	spam_result = DSR_ISSPAM;
594	}
595
596	/* Apply results to context */
597	CTX->result = spam_result;
598	if (CTX->class[0] == 0) {
599	if (spam_result == DSR_ISSPAM)
600	strcpy(CTX->class, LANG_CLASS_SPAM);
601	else if (spam_result == DSR_ISINNOCENT)
602	strcpy(CTX->class, LANG_CLASS_INNOCENT);
603	}
604
605	/* Restore operating mode and/or training mode */
606	restore_mode:
607
608	if (is_toe)
609	CTX->operating_mode = DSM_PROCESS;
610	if (is_undertrain)
611	CTX->training_mode = DST_TOE;
612
613	#ifdef DEBUG
614	if (DO_DEBUG) {
615	if (CTX->source == DSS_NONE) {
616	gettimeofday(&tp2, &tzp);
617	LOGDEBUG("total processing time: %01.5fs",
618	(double) (tp2.tv_sec + (tp2.tv_usec / 1000000.0)) -
619	(double) (tp1.tv_sec + (tp1.tv_usec / 1000000.0)));
620	}
621	}
622	#endif
623
624	return retcode;
625	}
626
627	/*
628	* dspam_getsource()
629	*
630	* DESCRIPTION
631	*
632	* The dspam_getsource() function extracts the source sender from the mes-
633	* sage passed in during a call to dspam_process() and writes not more
634	* than size bytes to buf.
635	*
636	* RETURN VALUES
637	* returns 0 on success, standard errors on failure
638	*/
639
640	int
641	dspam_getsource (
642	DSPAM_CTX * CTX,
643	char *buf,
644	size_t size)
645	{
646	ds_message_part_t current_block;
647	ds_header_t current_heading = NULL;
648	struct nt_node *node_nt;
649	struct nt_c c;
650	char qmailmode = 0;
651
652	if (CTX->message == NULL)
653	return EINVAL;
654
655	node_nt = c_nt_first (CTX->message->components, &c);
656	if (node_nt == NULL)
657	return EINVAL;
658
659	current_block = (ds_message_part_t) node_nt->ptr;
660
661	node_nt = c_nt_first (current_block->headers, &c);
662	while (node_nt != NULL)
663	{
664	current_heading = (ds_header_t) node_nt->ptr;
665	if (!strcmp (current_heading->heading, "Received"))
666	{
667	char data, ptr, *tok;
668
669	// detect and skip "Received: (qmail..." lines
670	if (!strncmp(current_heading->data, "(qmail", 6))
671	{
672	qmailmode = 1;
673	node_nt = c_nt_next (current_block->headers, &c);
674	continue;
675	}
676
677	data = strdup (current_heading->data);
678	ptr = strstr (data, "from");
679
680	if (ptr != NULL)
681	{
682	if (strchr(data, '[')) // found a non-qmail header
683	{
684	qmailmode = 0;
685	}
686
687	// qmail puts the sending IP inside the last "()" pair of the line
688	if (qmailmode)
689	{
690	tok = strrchr(data, ')');
691
692	if (tok != NULL)
693	{
694	*tok = 0;
695	tok = strrchr(data, '(');
696	if (tok != NULL)
697	tok++;
698	}
699	}
700	else
701	{
702	char *ptrptr = NULL;
703	tok = strtok_r (ptr, "[", &ptrptr);
704
705	if (tok != NULL)
706	{
707	tok = strtok_r (NULL, "]", &ptrptr);
708	}
709	}
710	if (tok != NULL)
711	{
712	int whitelisted = 0;
713	if (!strncmp (tok, "127.",4) \|\| // ignore localhost
714	!strncmp (tok, "10.", 3) \|\| // ignore RFC 1918 private addresses
715	!strncmp (tok, "172.16.", 7) \|\|
716	!strncmp (tok, "192.168.", 8) \|\|
717	!strncmp (tok, "169.254.", 8)) // ignore local-link
718	whitelisted = 1;
719
720	if (_ds_match_attribute(CTX->config->attributes, "LocalMX", tok))
721	whitelisted = 1;
722
723	if (!whitelisted)
724	{
725	strlcpy (buf, tok, size);
726	free (data);
727	return 0;
728	}
729	}
730	}
731	free (data);
732	}
733	node_nt = c_nt_next (current_block->headers, &c);
734	}
735
736	return EFAILURE;
737	}
738
739	/*
740	* _ds_operate() - operate on the message
741	*
742	* DESCRIPTION
743	* calculate the statistical probability the email is spam
744	* update tokens in dictionary according to result/mode
745	*
746	* INPUT ARGUMENTS
747	* DSPAM_CTX *CTX pointer to context
748	* char *header pointer to message header
749	* char *body pointer to message body
750	*
751	* RETURN VALUES
752	* standard errors on failure
753	*
754	* DSR_ISSPAM message is spam
755	* DSR_ISINNOCENT message is innocent
756	*/
757
758	int
759	_ds_operate (DSPAM_CTX * CTX, char headers, char body)
760	{
761	int errcode = 0;
762
763	/* Create our diction (lexical data in message) and patterns */
764
765	ds_diction_t diction = ds_diction_create(24593ul);
766	ds_diction_t bnr_patterns = NULL;
767	ds_term_t ds_term;
768	ds_cursor_t ds_c;
769
770	ds_heap_t heap_sort = NULL; /* Heap sort for top N tokens */
771
772	#ifdef LIBBNR_DEBUG
773	ds_heap_t heap_nobnr = NULL;
774	#endif
775
776	unsigned long long whitelist_token = 0;
777	int do_whitelist = 0;
778	int result;
779	unsigned int heap_sort_items = 0;
780
781	if (CTX->algorithms & DSA_BURTON)
782	heap_sort = ds_heap_create(BURTON_WINDOW_SIZE, HP_DELTA);
783	else if (CTX->algorithms & DSA_ROBINSON)
784	heap_sort = ds_heap_create(25, HP_DELTA);
785	else
786	heap_sort = ds_heap_create(15, HP_DELTA);
787
788	/* Allocate SBPH signature (stored as message text) */
789
790	if ( CTX->tokenizer == DSZ_SBPH
791	&& CTX->flags & DSF_SIGNATURE
792	&& ( ( CTX->operating_mode != DSM_CLASSIFY
793	&& CTX->classification == DSR_NONE)
794	\|\| ! (CTX->_sig_provided))
795	&& CTX->source != DSS_CORPUS)
796	{
797	if (CTX->signature) {
798	if (CTX->signature->data)
799	free(CTX->signature->data);
800	free(CTX->signature);
801	CTX->signature = NULL;
802	}
803	CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
804	if (CTX->signature == NULL)
805	{
806	LOG (LOG_CRIT, "memory allocation error");
807	errcode = EUNKNOWN;
808	goto bail;
809	}
810
811	CTX->signature->length = strlen(headers)+strlen(body)+2;
812	CTX->signature->data = malloc(CTX->signature->length);
813
814	if (CTX->signature->data == NULL)
815	{
816	LOG (LOG_CRIT, "memory allocation error");
817	free (CTX->signature);
818	CTX->signature = NULL;
819	errcode = EUNKNOWN;
820	goto bail;
821	}
822
823	strcpy(CTX->signature->data, headers);
824	strcat(CTX->signature->data, "\001");
825	strcat(CTX->signature->data, body);
826	}
827
828	if (!diction)
829	{
830	LOG (LOG_CRIT, ERR_MEM_ALLOC);
831	errcode = EUNKNOWN;
832	goto bail;
833	}
834
835	#ifdef LIBBNR_DEBUG
836	heap_nobnr = ds_heap_create (heap_sort->size, HP_DELTA);
837	if (heap_nobnr == NULL) {
838	LOG (LOG_CRIT, ERR_MEM_ALLOC);
839	errcode = EUNKNOWN;
840	goto bail;
841	}
842	#endif
843
844	CTX->result =
845	(CTX->classification == DSR_ISSPAM) ? DSR_ISSPAM : DSR_ISINNOCENT;
846
847	/* If we are classifying based on a signature, preprogram the tree */
848
849	if (CTX->flags & DSF_SIGNATURE &&
850	CTX->operating_mode == DSM_CLASSIFY &&
851	CTX->_sig_provided)
852	{
853	int num_tokens =
854	CTX->signature->length / sizeof (struct _ds_signature_token);
855	struct _ds_signature_token t;
856
857	int i;
858	for (i = 0; i < num_tokens; i++)
859	{
860	char x[128];
861	memcpy (&t,
862	(char *) CTX->signature->data +
863	(i * sizeof (struct _ds_signature_token)),
864	sizeof (struct _ds_signature_token));
865	snprintf (x, sizeof (x), "E: %" LLU_FMT_SPEC, t.token);
866	ds_term = ds_diction_touch(diction, t.token, x, 0);
867	if (ds_term)
868	ds_term->frequency = t.frequency;
869	}
870	}
871
872	/* Otherwise, tokenize the message and propagate the tree */
873
874	else
875	{
876	if (_ds_tokenize(CTX, headers, body, diction)) {
877	LOG(LOG_CRIT, "tokenizer failed");
878	}
879	whitelist_token = diction->whitelist_token;
880	}
881
882
883	/* Load all token statistics */
884	if (_ds_getall_spamrecords (CTX, diction))
885	{
886	LOGDEBUG ("_ds_getall_spamrecords() failed");
887	errcode = EUNKNOWN;
888	goto bail;
889	}
890
891	/* Apply Bayesian Noise Reduction */
892	if (CTX->flags & DSF_NOISE)
893	{
894	ds_diction_t p = _ds_apply_bnr(CTX, diction);
895	if (p)
896	ds_diction_destroy(p);
897	}
898
899	if (CTX->flags & DSF_WHITELIST)
900	{
901	LOGDEBUG("Whitelist threshold: %d", CTX->wh_threshold);
902	}
903
904	/* Create a heap sort based on the token's delta from .5 */
905	ds_c = ds_diction_cursor(diction);
906	ds_term = ds_diction_next(ds_c);
907	while(ds_term)
908	{
909
910	if (ds_term->key == CONTROL_TOKEN) {
911	ds_term = ds_diction_next(ds_c);
912	continue;
913	}
914
915	if (ds_term->s.probability == 0.00000 \|\| CTX->classification != DSR_NONE)
916	_ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL);
917
918	if (CTX->flags & DSF_WHITELIST) {
919	if (ds_term->key == whitelist_token &&
920	ds_term->s.spam_hits <= (ds_term->s.innocent_hits / 15) &&
921	ds_term->s.innocent_hits > CTX->wh_threshold &&
922	CTX->classification == DSR_NONE)
923	{
924	do_whitelist = 1;
925	}
926	}
927
928	if (ds_term->frequency > 0 && ds_term->type == 'D')
929	{
930	ds_heap_insert (heap_sort, ds_term->s.probability, ds_term->key,
931	ds_term->frequency, _ds_compute_complexity(ds_term->name));
932	}
933
934	#ifdef LIBBNR_DEBUG
935	if (ds_term->type == 'D')
936	{
937	ds_heap_insert (heap_nobnr, ds_term->s.probability, ds_term->key,
938	ds_term->frequency, _ds_compute_complexity(ds_term->name));
939	}
940	#endif
941
942	#ifdef VERBOSE
943	LOGDEBUG ("Token: %s [%f] SH %ld IH %ld", ds_term->name, ds_term->s.probability, ds_term->s.spam_hits, ds_term->s.innocent_hits);
944	#endif
945
946	ds_term = ds_diction_next(ds_c);
947	}
948	ds_diction_close(ds_c);
949
950	/* Keep track of items in heap_sort. We need that info later on when freeing the signature */
951	heap_sort_items = heap_sort->items;
952
953	/* Take the 15 most interesting tokens and generate a score */
954
955	if (heap_sort->items == 0)
956	{
957	LOGDEBUG ("no tokens found in message");
958	errcode = EINVAL;
959	goto bail;
960	}
961
962	/* Initialize Non-SBPH signature, if requested */
963
964	if ( CTX->tokenizer != DSZ_SBPH
965	&& CTX->flags & DSF_SIGNATURE
966	&& (CTX->operating_mode != DSM_CLASSIFY \|\| ! CTX->_sig_provided))
967	{
968	if (CTX->signature) {
969	if (CTX->signature->data)
970	free(CTX->signature->data);
971	free(CTX->signature);
972	CTX->signature = NULL;
973	}
974	CTX->signature = calloc (1, sizeof (struct _ds_spam_signature));
975	if (CTX->signature == NULL)
976	{
977	LOG (LOG_CRIT, "memory allocation error");
978	errcode = EUNKNOWN;
979	goto bail;
980	}
981
982	CTX->signature->length =
983	sizeof (struct _ds_signature_token) * diction->items;
984	CTX->signature->data = malloc (CTX->signature->length);
985	if (CTX->signature->data == NULL)
986	{
987	LOG (LOG_CRIT, "memory allocation error");
988	free (CTX->signature);
989	CTX->signature = NULL;
990	errcode = EUNKNOWN;
991	goto bail;
992	}
993	}
994
995	#ifdef LIBBNR_DEBUG
996	{
997	int x = CTX->result;
998	int nobnr_result = 0;
999
1000	if (CTX->flags & DSF_NOISE) {
1001	nobnr_result = _ds_calc_result(CTX, heap_nobnr, diction);
1002
1003	if (CTX->factors) {
1004	_ds_factor_destroy(CTX->factors);
1005	CTX->factors = NULL;
1006	}
1007	CTX->result = x;
1008	CTX->probability = DSP_UNCALCULATED;
1009	}
1010	#endif
1011
1012	result = _ds_calc_result(CTX, heap_sort, diction);
1013
1014	#ifdef LIBBNR_DEBUG
1015	if (CTX->flags & DSF_NOISE) {
1016	if (nobnr_result == result) {
1017	LOGDEBUG("BNR Decision Concurs");
1018	} else {
1019	LOGDEBUG("BNR Decision Conflicts: %d (BNR) / %d (No BNR)", result, nobnr_result);
1020	}
1021	}
1022	}
1023	#endif
1024
1025	if (CTX->flags & DSF_WHITELIST && do_whitelist) {
1026	LOGDEBUG("auto-whitelisting this message");
1027	CTX->result = DSR_ISINNOCENT;
1028	strcpy(CTX->class, LANG_CLASS_WHITELISTED);
1029	}
1030
1031	/* Update Totals */
1032
1033	/* SPAM */
1034	if (CTX->result == DSR_ISSPAM && CTX->operating_mode != DSM_CLASSIFY)
1035	{
1036	if (!(CTX->flags & DSF_UNLEARN)) {
1037	CTX->totals.spam_learned++;
1038	CTX->learned = 1;
1039	}
1040
1041	if (CTX->classification == DSR_ISSPAM)
1042	{
1043	if (CTX->flags & DSF_UNLEARN) {
1044	CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1045	} else if (CTX->source == DSS_CORPUS \|\| CTX->source == DSS_INOCULATION) {
1046	CTX->totals.spam_corpusfed++;
1047	}
1048	else if (SPAM_MISS(CTX))
1049	{
1050	CTX->totals.spam_misclassified++;
1051	if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1052	{
1053	CTX->totals.innocent_learned -=
1054	(CTX->totals.innocent_learned > 0) ? 1 : 0;
1055	}
1056	}
1057	}
1058
1059	/* INNOCENT */
1060	}
1061	else if ((CTX->result == DSR_ISINNOCENT) &&
1062	CTX->operating_mode != DSM_CLASSIFY)
1063	{
1064	if (!(CTX->flags & DSF_UNLEARN)) {
1065	CTX->totals.innocent_learned++;
1066	CTX->learned = 1;
1067	}
1068
1069	if (CTX->source == DSS_CORPUS \|\| CTX->source == DSS_INOCULATION)
1070	{
1071	CTX->totals.innocent_corpusfed++;
1072	}
1073	else if (FALSE_POSITIVE(CTX))
1074	{
1075	if (CTX->flags & DSF_UNLEARN) {
1076	CTX->totals.innocent_learned -= (CTX->totals.innocent_learned >0) ? 1:0;
1077	} else {
1078	CTX->totals.innocent_misclassified++;
1079	if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1080	{
1081	CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1082	}
1083	}
1084	}
1085	}
1086
1087	/* TOE mode increments 'classified' totals */
1088	if (CTX->training_mode == DST_TOE && CTX->operating_mode == DSM_CLASSIFY) {
1089	if (CTX->result == DSR_ISSPAM)
1090	CTX->totals.spam_classified++;
1091	else if (CTX->result == DSR_ISINNOCENT)
1092	CTX->totals.innocent_classified++;
1093	}
1094
1095	_ds_increment_tokens(CTX, diction);
1096
1097	/* Store all tokens */
1098	if (CTX->training_mode != DST_NOTRAIN) {
1099	if (_ds_setall_spamrecords (CTX, diction))
1100	{
1101	LOGDEBUG ("_ds_setall_spamrecords() failed");
1102	errcode = EUNKNOWN;
1103	goto bail;
1104	}
1105	}
1106
1107	ds_diction_destroy (diction);
1108	ds_heap_destroy (heap_sort);
1109	#ifdef LIBBNR_DEBUG
1110	ds_heap_destroy (heap_nobnr);
1111	#endif
1112
1113	/* One final sanity check */
1114
1115	if (CTX->classification == DSR_ISINNOCENT)
1116	{
1117	CTX->probability = 0.0;
1118	CTX->result = DSR_ISINNOCENT;
1119	}
1120	else if (CTX->classification == DSR_ISSPAM)
1121	{
1122	CTX->probability = 1.0;
1123	CTX->result = DSR_ISSPAM;
1124	}
1125
1126	return CTX->result;
1127
1128	bail:
1129	LOG(LOG_ERR, "bailing on error %d", errcode);
1130	ds_heap_destroy (heap_sort);
1131	#ifdef LIBBNR_DEBUG
1132	ds_heap_destroy (heap_nobnr);
1133	#endif
1134	ds_diction_destroy(diction);
1135	ds_diction_destroy(bnr_patterns);
1136	if (CTX->signature != NULL) {
1137	if (CTX->signature->data != NULL) {
1138	free(CTX->signature->data);
1139	CTX->signature->data = NULL;
1140	}
1141	if (CTX->signature != NULL && heap_sort_items > 0)
1142	free (CTX->signature);
1143	CTX->signature = NULL;
1144	}
1145	return errcode;
1146	}
1147
1148	/*
1149	* _ds_process_signature()
1150	*
1151	* DESCRIPTION
1152	* process an erroneously classified message processing based on signature
1153	*
1154	* INPUT ARGUMENTS
1155	* parameters: DSPAM_CTX *CTX Pointer to context containing signature
1156	*/
1157
1158	int
1159	_ds_process_signature (DSPAM_CTX * CTX)
1160	{
1161	struct _ds_signature_token t;
1162	int num_tokens, i;
1163	ds_diction_t diction = ds_diction_create(24593ul);
1164	ds_term_t ds_term;
1165	ds_cursor_t ds_c;
1166	int occurrence = _ds_match_attribute(CTX->config->attributes,
1167	"ProcessorWordFrequency", "occurrence");
1168
1169	if (diction == NULL) {
1170	LOG (LOG_CRIT, ERR_MEM_ALLOC);
1171	return EUNKNOWN;
1172	}
1173
1174	if (CTX->signature == NULL) {
1175	LOG(LOG_WARNING, "DSF_SIGNATURE specified, but no signature provided.");
1176	ds_diction_destroy(diction);
1177	return EINVAL;
1178	}
1179
1180	LOGDEBUG ("processing signature. length: %ld", CTX->signature->length);
1181
1182	CTX->result = DSR_NONE;
1183
1184	if (!(CTX->flags & DSF_UNLEARN))
1185	CTX->learned = 1;
1186
1187	/* INNOCENT */
1188	if (CTX->classification == DSR_ISINNOCENT &&
1189	CTX->operating_mode != DSM_CLASSIFY)
1190	{
1191	if (CTX->flags & DSF_UNLEARN) {
1192	CTX->totals.innocent_learned -= (CTX->totals.innocent_learned) > 0 ? 1:0;
1193	} else {
1194	if (CTX->source == DSS_ERROR) {
1195	CTX->totals.innocent_misclassified++;
1196	if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1197	{
1198	CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1:0;
1199	}
1200	} else {
1201	CTX->totals.innocent_corpusfed++;
1202	}
1203
1204	CTX->totals.innocent_learned++;
1205	}
1206	}
1207
1208	/* SPAM */
1209	else if (CTX->classification == DSR_ISSPAM &&
1210	CTX->operating_mode != DSM_CLASSIFY)
1211	{
1212	if (CTX->flags & DSF_UNLEARN) {
1213	CTX->totals.spam_learned -= (CTX->totals.spam_learned > 0) ? 1 : 0;
1214	} else {
1215	if (CTX->source == DSS_ERROR) {
1216	CTX->totals.spam_misclassified++;
1217	if (CTX->training_mode != DST_TOE && CTX->training_mode != DST_NOTRAIN)
1218	{
1219	CTX->totals.innocent_learned -= (CTX->totals.innocent_learned > 0) ? 1:0;
1220	}
1221	} else {
1222	CTX->totals.spam_corpusfed++;
1223	}
1224	CTX->totals.spam_learned++;
1225	}
1226	}
1227
1228	num_tokens = CTX->signature->length / sizeof (struct _ds_signature_token);
1229
1230	if (CTX->class[0] == 0) {
1231	if (CTX->classification == DSR_ISSPAM)
1232	strcpy(CTX->class, LANG_CLASS_SPAM);
1233	else if (CTX->classification == DSR_ISINNOCENT)
1234	strcpy(CTX->class, LANG_CLASS_INNOCENT);
1235	}
1236
1237	/* Don't retrain if no tokens where loaded from the signature */
1238	if (num_tokens == 0)
1239	{
1240	LOG (LOG_WARNING, "Skipping retraining for signature with %d tokens", num_tokens);
1241	LOGDEBUG ("Skipping retraining for signature with %d tokens", num_tokens);
1242	} else {
1243	LOGDEBUG ("Reversing %d tokens", num_tokens);
1244	for (i = 0; i < num_tokens; i++)
1245	{
1246	memcpy (&t,
1247	(char *) CTX->signature->data +
1248	(i * sizeof (struct _ds_signature_token)),
1249	sizeof (struct _ds_signature_token));
1250	ds_term = ds_diction_touch (diction, t.token, "-", 0);
1251	if (ds_term)
1252	{
1253	ds_term->frequency = t.frequency;
1254	}
1255	}
1256
1257	if (_ds_getall_spamrecords (CTX, diction)) {
1258	ds_diction_destroy(diction);
1259	return EUNKNOWN;
1260	}
1261
1262	ds_c = ds_diction_cursor(diction);
1263	ds_term = ds_diction_next(ds_c);
1264	while(ds_term)
1265	{
1266	/* INNOCENT */
1267	if (CTX->classification == DSR_ISINNOCENT)
1268	{
1269	if (CTX->flags & DSF_UNLEARN)
1270	{
1271	if (occurrence)
1272	{
1273	ds_term->s.innocent_hits -= ds_term->frequency;
1274	if (ds_term->s.innocent_hits < 0)
1275	ds_term->s.innocent_hits = 0;
1276	} else {
1277	ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
1278	}
1279	} else {
1280	if (CTX->source == DSS_ERROR &&
1281	CTX->training_mode != DST_NOTRAIN &&
1282	CTX->training_mode != DST_TOE)
1283	{
1284	if (occurrence)
1285	{
1286	ds_term->s.spam_hits -= ds_term->frequency;
1287	if (ds_term->s.spam_hits < 0)
1288	ds_term->s.spam_hits = 0;
1289	} else {
1290	ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
1291	}
1292	}
1293
1294	if (CTX->source == DSS_INOCULATION)
1295	{
1296	if (ds_term->s.spam_hits < 2 && ds_term->s.innocent_hits < 5)
1297	{
1298	ds_term->s.innocent_hits += 5;
1299	}
1300	else
1301	{
1302	ds_term->s.innocent_hits += 2;
1303	}
1304	} else /* ERROR or CORPUS */
1305	{
1306	if (occurrence)
1307	{
1308	ds_term->s.innocent_hits += ds_term->frequency;
1309	} else {
1310	ds_term->s.innocent_hits++;
1311	}
1312	}
1313	}
1314	}
1315
1316	/* SPAM */
1317	else if (CTX->classification == DSR_ISSPAM)
1318	{
1319	if (CTX->flags & DSF_UNLEARN)
1320	{
1321	if (occurrence)
1322	{
1323	ds_term->s.spam_hits -= ds_term->frequency;
1324	if (ds_term->s.spam_hits < 0)
1325	ds_term->s.spam_hits = 0;
1326	} else {
1327	ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
1328	}
1329	} else {
1330	if (CTX->source == DSS_ERROR &&
1331	CTX->training_mode != DST_NOTRAIN &&
1332	CTX->training_mode != DST_TOE)
1333	{
1334	if (occurrence)
1335	{
1336	ds_term->s.innocent_hits -= ds_term->frequency;
1337	if (ds_term->s.innocent_hits < 0)
1338	ds_term->s.innocent_hits = 0;
1339	} else {
1340	ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
1341	}
1342	}
1343
1344	if (CTX->source == DSS_INOCULATION)
1345	{
1346	if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5)
1347	{
1348	ds_term->s.spam_hits += 5;
1349	}
1350	else
1351	{
1352	ds_term->s.spam_hits += 2;
1353	}
1354	} else /* ERROR or CORPUS */
1355	{
1356	if (occurrence)
1357	{
1358	ds_term->s.spam_hits += ds_term->frequency;
1359	} else {
1360	ds_term->s.spam_hits++;
1361	}
1362	}
1363	}
1364	}
1365
1366	ds_term->s.status \|= TST_DIRTY;
1367	ds_term = ds_diction_next(ds_c);
1368	}
1369	ds_diction_close(ds_c);
1370
1371	if (CTX->training_mode != DST_NOTRAIN) {
1372	if (_ds_setall_spamrecords (CTX, diction)) {
1373	ds_diction_destroy(diction);
1374	return EUNKNOWN;
1375	}
1376	}
1377	}
1378
1379	if (CTX->classification == DSR_ISSPAM)
1380	{
1381	CTX->probability = 1.0;
1382	CTX->result = DSR_ISSPAM;
1383	LOGDEBUG ("Message classification/result: SPAM");
1384	}
1385	else
1386	{
1387	CTX->probability = 0.0;
1388	CTX->result = DSR_ISINNOCENT;
1389	LOGDEBUG ("Message classification/result: INNOCENT");
1390	}
1391
1392	ds_diction_destroy(diction);
1393	return 0;
1394	}
1395
1396	/*
1397	* _ds_calc_stat() - Calculate the probability of a token
1398	*
1399	* DESCRIPTION
1400	*
1401	* Calculates the probability of an individual token based on the
1402	* pvalue algorithm chosen. The resulting value largely depends on
1403	* the total amount of ham/spam in the user's corpus. The result
1404	* is written to s.
1405	*
1406	* INPUT ARGUMENTS
1407	* CTX DSPAM context
1408	* term ds_term_t
1409	* token_type DTT_ value specifying token type
1410	* bnr_tot BNR totals structure
1411	*/
1412
1413	int
1414	_ds_calc_stat (
1415	DSPAM_CTX * CTX,
1416	ds_term_t term,
1417	struct _ds_spam_stat *s,
1418	int token_type,
1419	struct _ds_spam_stat *bnr_tot)
1420	{
1421	int min_hits, sed_hits = 0;
1422	unsigned long ti, ts;
1423
1424	if (token_type == DTT_BNR) {
1425	min_hits = 25; /* Bayesian Noise Reduction patterns */
1426
1427	} else {
1428	min_hits = 5; /* "Standard" token threshold */
1429	}
1430
1431	/* Statistical Sedation: Adjust hapaxial threshold to compensate for a
1432	* spam corpus imbalance
1433	*/
1434
1435	ti = CTX->totals.innocent_learned + CTX->totals.innocent_classified;
1436	ts = CTX->totals.spam_learned + CTX->totals.spam_classified;
1437	if (CTX->training_buffer>0) {
1438	if (ti < 1000 && ti < ts)
1439	{
1440	sed_hits = min_hits+(CTX->training_buffer/2)+
1441	(CTX->training_buffer*((ts-ti)/200));
1442	}
1443
1444	if (ti < 2500 && ti >=1000 && ts > ti)
1445	{
1446	float spams = (ts * 1.0 / (ts * 1.0 + ti * 1.0)) * 100;
1447	sed_hits = min_hits+(CTX->training_buffer/2)+
1448	(CTX->training_buffer*(spams/20));
1449	}
1450	} else if (! CTX->training_buffer) {
1451	min_hits = 5;
1452	}
1453
1454	if (token_type != DTT_DEFAULT \|\| sed_hits > min_hits)
1455	min_hits = sed_hits;
1456
1457	/* TUM mode training only records up to 20 hits so we need to make sure we
1458	* don't require more than that.
1459	*/
1460
1461	if (CTX->training_mode == DST_TUM && min_hits > 20)
1462	min_hits = 20;
1463
1464	if (CTX->classification == DSR_ISSPAM)
1465	s->probability = .7;
1466	else
1467	s->probability = (CTX->algorithms & DSP_MARKOV) ? .5 : .4;
1468
1469	/* Markovian Weighting */
1470
1471	if (CTX->algorithms & DSP_MARKOV) {
1472	unsigned int weight;
1473	long num, den;
1474
1475	/* some utilities don't provide the token name, and so we can't compute
1476	* a probability. just return something neutral.
1477	*/
1478	if (term == NULL) {
1479	s->probability = .5;
1480	return 0;
1481	}
1482
1483	/* return neutral probability for BNR patterns */
1484	if (token_type == DTT_BNR \|\| term->type == 'B' \|\| !strncmp(term->name, "bnr.", 4)) {
1485	s->probability = .5;
1486	return 0;
1487	}
1488
1489	/* return neutral probability for frequency tokens */
1490	if (!strncmp(term->name, "E: ", 3)) {
1491	s->probability = .5;
1492	return 0;
1493	}
1494
1495	/* return neutral probability for "From" tokens (used for when whitelisting) */
1496	if (!strncmp(term->name, "From*", 5)) {
1497	s->probability = .5;
1498	return 0;
1499	}
1500
1501	/* return neutral probability for control tokens */
1502	if (!strncmp(term->name, "$$CONTROL$$", 11)) {
1503	s->probability = .5;
1504	return 0;
1505	}
1506
1507	weight = _ds_compute_weight(term->name);
1508
1509	if (CTX->flags & DSF_BIAS) {
1510	num = weight * (s->spam_hits - (s->innocent_hits*2));
1511	den = C1 * (s->spam_hits + (s->innocent_hits2) + C2) 256;
1512	s->probability = 0.49 + ((double) num / (double) den);
1513	} else {
1514	num = (s->spam_hits - s->innocent_hits) * weight;
1515	den = C1 * (s->spam_hits + s->innocent_hits + C2) * 256;
1516	s->probability = 0.5 + ((double) num / (double) den);
1517	}
1518
1519	/* Graham and Robinson Start Here */
1520
1521	} else {
1522	int ih = 1;
1523	if (CTX->flags & DSF_BIAS)
1524	ih = 2;
1525
1526	if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1527	{
1528	if (token_type == DTT_BNR) {
1529	s->probability =
1530	(s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) /
1531	((s->spam_hits * 1.0 / bnr_tot->spam_hits * 1.0) +
1532	(s->innocent_hits * 1.0 / bnr_tot->innocent_hits * 1.0));
1533	} else {
1534	s->probability =
1535	(s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) /
1536	((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) +
1537	(s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0));
1538	}
1539	}
1540
1541	if (s->spam_hits == 0 && s->innocent_hits > 0) {
1542	s->probability = 0.01;
1543	if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1544	{
1545	if ((1.0 / CTX->totals.spam_learned * 1.0) /
1546	((1.0 / CTX->totals.spam_learned * 1.0) +
1547	(s->innocent_hits * ih * 1.0 / CTX->totals.innocent_learned * 1.0))
1548	< 0.01)
1549	{
1550	s->probability = (1.0 / CTX->totals.spam_learned * 1.0) /
1551	((1.0 / CTX->totals.spam_learned * 1.0) +
1552	(s->innocent_hits * ih 1.0 / CTX->totals.innocent_learned 1.0));
1553	}
1554	}
1555	}
1556	else if (s->spam_hits > 0 && s->innocent_hits == 0) {
1557	s->probability = 0.99;
1558	if (CTX->totals.spam_learned > 0 && CTX->totals.innocent_learned > 0)
1559	{
1560	if ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) /
1561	((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0) +
1562	(ih * 1.0 / CTX->totals.innocent_learned * 1.0))
1563	> 0.99)
1564	{
1565	s->probability = (s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0)
1566	/ ((s->spam_hits * 1.0 / CTX->totals.spam_learned * 1.0)
1567	+ (ih * 1.0 / CTX->totals.innocent_learned * 1.0));
1568	}
1569	}
1570	}
1571
1572	if ( (CTX->flags & DSF_BIAS &&
1573	(s->spam_hits + (2 * s->innocent_hits) < min_hits))
1574	\|\| (!(CTX->flags & DSF_BIAS) &&
1575	(s->spam_hits + s->innocent_hits < min_hits)))
1576	{
1577	s->probability = (CTX->algorithms & DSP_MARKOV) ? .5000 : .4;
1578	}
1579	}
1580
1581	if (s->probability < 0.0001)
1582	s->probability = 0.0001;
1583
1584	if (s->probability > 0.9999)
1585	s->probability = 0.9999;
1586
1587	/* Finish off Robinson */
1588
1589	if (token_type != DTT_BNR && CTX->algorithms & DSP_ROBINSON)
1590	{
1591	unsigned long n = s->spam_hits + s->innocent_hits;
1592	double fw = ((CHI_S * CHI_X) + (n * s->probability))/(CHI_S + n);
1593	s->probability = fw;
1594	}
1595
1596	return 0;
1597	}
1598
1599	/*
1600	* _ds_calc_result()
1601	*
1602	* DESCRIPTION
1603	* Perform statistical combination of the token index
1604	*
1605	* Passed in an index of tokens, this function is responsible for choosing
1606	* and combining the most relevant characteristics (based on the algorithms
1607	* configured) and calculating libdspam's decision about the provided
1608	* message sample.
1609	*/
1610
1611	int
1612	_ds_calc_result(DSPAM_CTX *CTX, ds_heap_t heap_sort, ds_diction_t diction)
1613	{
1614	struct _ds_spam_stat stat;
1615	ds_heap_element_t node_heap;
1616	ds_heap_element_t heap_list[heap_sort->items];
1617
1618	/* Naive-Bayesian */
1619	float nbay_top = 0.0;
1620	float nbay_bot = 0.0;
1621	float nbay_result = -1;
1622	long nbay_used = 0; /* Total tokens used in naive bayes */
1623	struct nt *factor_nbayes = nt_create(NT_PTR);
1624
1625	/* Graham-Bayesian */
1626	float bay_top = 0.0;
1627	float bay_bot = 0.0;
1628	float bay_result = -1;
1629	long bay_used = 0; /* Total tokens used in bayes */
1630	struct nt *factor_bayes = nt_create(NT_PTR);
1631
1632	/* Burton-Bayesian */
1633	double abay_top = 0.0;
1634	double abay_bot = 0.0;
1635	double abay_result = -1;
1636	long abay_used = 0; /* Total tokens used in altbayes */
1637	struct nt *factor_altbayes = nt_create(NT_PTR);
1638
1639	/* Robinson's Geometric Mean, used to calculate confidence */
1640	float rob_top = 0.0; /* Robinson's Geometric Mean */
1641	float rob_bot = 0.0;
1642	float rob_result = -1;
1643	double p = 0.0, q = 0.0, s = 0.0; /* Robinson PQS Calculations */
1644	long rob_used = 0; /* Total tokens used in Robinson's GM */
1645	struct nt *factor_rob = nt_create(NT_PTR);
1646
1647	/* Fisher-Robinson's Chi-Square */
1648	float chi_result = -1;
1649	long chi_used = 0, chi_sx = 0, chi_hx = 0;
1650	double chi_s = 1.0, chi_h = 1.0;
1651	struct nt *factor_chi = nt_create(NT_PTR);
1652	unsigned int i;
1653
1654	/* Invert the heap */
1655	node_heap = heap_sort->root;
1656	for(i=0;i<heap_sort->items;i++) {
1657	heap_list[(heap_sort->items-i)-1] = node_heap;
1658	node_heap = node_heap->next;
1659	}
1660
1661	/* BEGIN Combine Token Values */
1662	for(i=0;i<heap_sort->items;i++)
1663	{
1664	char *token_name;
1665	ds_term_t ds_term;
1666
1667	node_heap = heap_list[i];
1668	ds_term = ds_diction_find(diction, node_heap->token);
1669
1670	if (!ds_term)
1671	continue;
1672
1673	/* Skip BNR patterns */
1674	if (ds_term->type == 'B')
1675	continue;
1676
1677	token_name = ds_term->name;
1678
1679	if (ds_diction_getstat(diction, node_heap->token, &stat) \|\| !token_name)
1680	continue;
1681
1682	/* Set the probability if we've provided a classification */
1683	if (CTX->classification == DSR_ISSPAM)
1684	stat.probability = 1.00;
1685	else if (CTX->classification == DSR_ISINNOCENT)
1686	stat.probability = 0.00;
1687
1688	/* Graham-Bayesian */
1689	if (CTX->algorithms & DSA_GRAHAM && bay_used < 15)
1690	{
1691	LOGDEBUG ("[graham] [%2.6f] %s (%dfrq, %lds, %ldi)",
1692	stat.probability, token_name, ds_term->frequency,
1693	stat.spam_hits, stat.innocent_hits);
1694
1695	_ds_factor(factor_bayes, token_name, stat.probability);
1696
1697	if (bay_used == 0)
1698	{
1699	bay_top = stat.probability;
1700	bay_bot = 1 - stat.probability;
1701	}
1702	else
1703	{
1704	bay_top *= stat.probability;
1705	bay_bot *= (1 - stat.probability);
1706	}
1707
1708	bay_used++;
1709	}
1710
1711	/* Burton Bayesian */
1712	if (CTX->algorithms & DSA_BURTON && abay_used < BURTON_WINDOW_SIZE)
1713	{
1714	LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)",
1715	stat.probability, token_name, ds_term->frequency,
1716	stat.spam_hits, stat.innocent_hits);
1717
1718	_ds_factor(factor_altbayes, token_name, stat.probability);
1719
1720	if (abay_used == 0)
1721	{
1722	abay_top = stat.probability;
1723	abay_bot = (1 - stat.probability);
1724	}
1725	else
1726	{
1727	abay_top *= stat.probability;
1728	abay_bot *= (1 - stat.probability);
1729	}
1730
1731	abay_used++;
1732
1733	if (abay_used < BURTON_WINDOW_SIZE && ds_term->frequency > 1 )
1734	{
1735	LOGDEBUG ("[burton] [%2.6f] %s (%dfrq, %lds, %ldi)",
1736	stat.probability, token_name, ds_term->frequency,
1737	stat.spam_hits, stat.innocent_hits);
1738
1739	_ds_factor(factor_altbayes, token_name, stat.probability);
1740
1741	abay_used++;
1742	abay_top *= stat.probability;
1743	abay_bot *= (1 - stat.probability);
1744	}
1745
1746	}
1747
1748	/* Robinson's Geometric Mean Definitions */
1749
1750	//#define ROB_S 0.010 /* Sensitivity */
1751	//#define ROB_X 0.415 /* Value to use when N = 0 */
1752	//#define ROB_CUTOFF 0.54
1753
1754
1755	#define ROB_S 0.010 /* Sensitivity */
1756	#define ROB_X 0.500 /* Value to use when N = 0 */
1757	#define ROB_CUTOFF 0.50
1758
1759
1760	if (rob_used < 25)
1761	{
1762	float probability;
1763	long n = (heap_sort->items > 25) ? 25 : heap_sort->items;
1764
1765	probability = ((ROB_S * ROB_X) + (n * stat.probability)) / (ROB_S + n);
1766
1767	#ifdef ROBINSON
1768	#ifndef VERBOSE
1769	if (CTX->operating_mode != DSM_CLASSIFY)
1770	{
1771	#endif
1772	LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)",
1773	stat.probability, token_name, ds_term->frequency,
1774	stat.spam_hits, stat.innocent_hits);
1775	#ifndef VERBOSE
1776	}
1777	#endif
1778	#endif
1779
1780	_ds_factor(factor_rob, token_name, stat.probability);
1781
1782	if (probability < 0.3 \|\| probability > 0.7)
1783	{
1784
1785	if (rob_used == 0)
1786	{
1787	rob_top = probability;
1788	rob_bot = (1 - probability);
1789	}
1790	else
1791	{
1792	rob_top *= probability;
1793	rob_bot *= (1 - probability);
1794	}
1795
1796	rob_used++;
1797
1798	if (rob_used < 25 && ds_term->frequency > 1)
1799	{
1800	#ifdef ROBINSON
1801	#ifndef VERBOSE
1802	if (CTX->operating_mode != DSM_CLASSIFY)
1803	{
1804	#endif
1805	LOGDEBUG ("[rob] [%2.6f] %s (%dfrq, %lds, %ldi)",
1806	stat.probability, token_name, ds_term->frequency,
1807	stat.spam_hits, stat.innocent_hits);
1808
1809	#ifndef VERBOSE
1810	}
1811	#endif
1812	#endif
1813
1814	_ds_factor(factor_rob, token_name, stat.probability);
1815
1816	rob_used++;
1817	rob_top *= probability;
1818	rob_bot *= (1 - probability);
1819	}
1820	}
1821	}
1822	}
1823
1824	/* END Combine Token Values */
1825
1826	/* Fisher-Robinson's Inverse Chi-Square */
1827	#define CHI_CUTOFF 0.5010 /* Ham/Spam Cutoff */
1828	#define CHI_EXCR 0.4500 /* Exclusionary Radius */
1829	#define LN2 0.69314718055994530942 /* log e2 */
1830
1831	if (CTX->algorithms & DSA_CHI_SQUARE \|\| CTX->algorithms & DSA_NAIVE)
1832	{
1833	ds_term_t ds_term;
1834	ds_cursor_t ds_c;
1835	double fw;
1836	int n, exp;
1837
1838	ds_c = ds_diction_cursor(diction);
1839	ds_term = ds_diction_next(ds_c);
1840	while(ds_term) {
1841
1842	if (ds_term->key == CONTROL_TOKEN) {
1843	ds_term = ds_diction_next(ds_c);
1844	continue;
1845	}
1846
1847	/* Naive-Bayesian */
1848	if (CTX->algorithms & DSA_NAIVE)
1849	{
1850	LOGDEBUG ("[naive] [%2.6f] %s (%dfrq, %lds, %ldi)",
1851	ds_term->s.probability, ds_term->name, ds_term->frequency,
1852	ds_term->s.spam_hits, ds_term->s.innocent_hits);
1853
1854	_ds_factor(factor_nbayes, ds_term->name, stat.probability);
1855
1856	if (nbay_used == 0)
1857	{
1858	nbay_top = stat.probability;
1859	nbay_bot = 1 - stat.probability;
1860	}
1861	else
1862	{
1863	nbay_top *= stat.probability;
1864	nbay_bot *= (1 - stat.probability);
1865	}
1866
1867	nbay_used++;
1868	}
1869
1870	if (CTX->algorithms & DSA_CHI_SQUARE) {
1871
1872	/* Skip BNR Tokens */
1873	if (ds_term->type == 'B')
1874	goto CHI_NEXT;
1875
1876	/* Convert the p-value */
1877
1878	if (CTX->algorithms & DSP_ROBINSON) {
1879	fw = ds_term->s.probability;
1880	} else {
1881	n = ds_term->s.spam_hits + ds_term->s.innocent_hits;
1882	fw = ((CHI_S * CHI_X) + (n * ds_term->s.probability))/(CHI_S + n);
1883	}
1884
1885	if (fabs(0.5-fw)>CHI_EXCR) {
1886	int iter = 1;
1887
1888	while(iter>0) {
1889	iter --;
1890
1891	#ifndef VERBOSE
1892	if (CTX->operating_mode != DSM_CLASSIFY)
1893	{
1894	#endif
1895	LOGDEBUG ("[chi-sq] [%2.6f] %s (%dfrq, %lds, %ldi)",
1896	fw, ds_term->name, ds_term->frequency,
1897	ds_term->s.spam_hits, ds_term->s.innocent_hits);
1898	#ifndef VERBOSE
1899	}
1900	#endif
1901
1902	_ds_factor(factor_chi, ds_term->name, ds_term->s.probability);
1903
1904	chi_used++;
1905	chi_s *= (1.0 - fw);
1906	chi_h *= fw;
1907	if (chi_s < 1e-200) {
1908	chi_s = frexp(chi_s, &exp);
1909	chi_sx += exp;
1910	}
1911	if (chi_h < 1e-200) {
1912	chi_h = frexp(chi_h, &exp);
1913	chi_hx += exp;
1914	}
1915	}
1916	}
1917	}
1918
1919	CHI_NEXT:
1920	ds_term = ds_diction_next(ds_c);
1921	}
1922	ds_diction_close(ds_c);
1923	}
1924
1925	/* BEGIN Calculate Individual Probabilities */
1926
1927	if (CTX->algorithms & DSA_NAIVE) {
1928	nbay_result = (nbay_top) / (nbay_top + nbay_bot);
1929	LOGDEBUG ("Naive-Bayesian Probability: %f Samples: %ld", nbay_result,
1930	nbay_used);
1931	}
1932
1933	if (CTX->algorithms & DSA_GRAHAM) {
1934	bay_result = (bay_top) / (bay_top + bay_bot);
1935	LOGDEBUG ("Graham-Bayesian Probability: %f Samples: %ld", bay_result,
1936	bay_used);
1937	}
1938
1939	if (CTX->algorithms & DSA_BURTON) {
1940	abay_result = (abay_top) / (abay_top + abay_bot);
1941	LOGDEBUG ("Burton-Bayesian Probability: %f Samples: %ld", abay_result,
1942	abay_used);
1943	}
1944
1945	/* Robinson's */
1946	if (rob_used == 0)
1947	{
1948	p = q = s = 0;
1949	}
1950	else
1951	{
1952	p = 1.0 - pow (rob_bot, 1.0 / rob_used);
1953	q = 1.0 - pow (rob_top, 1.0 / rob_used);
1954	s = (p - q) / (p + q);
1955	s = (s + 1.0) / 2.0;
1956	}
1957
1958	rob_result = s;
1959
1960	if (CTX->algorithms & DSA_ROBINSON) {
1961	LOGDEBUG("Robinson's Geometric Confidence: %f (Spamminess: %f, "
1962	"Non-Spamminess: %f, Samples: %ld)", rob_result, p, q, rob_used);
1963	}
1964
1965	if (CTX->algorithms & DSA_CHI_SQUARE) {
1966	chi_s = log(chi_s) + chi_sx * LN2;
1967	chi_h = log(chi_h) + chi_hx * LN2;
1968
1969	if (chi_used) {
1970	chi_s = 1.0 - chi2Q(-2.0 * chi_s, 2 * chi_used);
1971	chi_h = 1.0 - chi2Q(-2.0 * chi_h, 2 * chi_used);
1972
1973	chi_result = ((chi_s-chi_h)+1.0) / 2.0;
1974	} else {
1975	chi_result = (float)(CHI_CUTOFF-0.1);
1976	}
1977
1978	LOGDEBUG("Chi-Square Confidence: %f", chi_result);
1979	}
1980
1981	/* END Calculate Individual Probabilities */
1982
1983	/* BEGIN Determine Result */
1984
1985	if (CTX->classification == DSR_ISSPAM) {
1986	CTX->result = DSR_ISSPAM;
1987	CTX->probability = 1.0;
1988	} else if (CTX->classification == DSR_ISINNOCENT) {
1989	CTX->result = DSR_ISINNOCENT;
1990	CTX->probability = 0.0;
1991	} else {
1992	struct nt *factor = NULL;
1993
1994	if (CTX->algorithms & DSA_NAIVE) {
1995	factor = factor_nbayes;
1996	if (((CTX->algorithms & DSP_MARKOV) && nbay_result > 0.5000) \|\|
1997	(!(CTX->algorithms & DSP_MARKOV) && nbay_result >= 0.9))
1998	{
1999	CTX->result = DSR_ISSPAM;
2000	CTX->probability = nbay_result;
2001	CTX->factors = factor;
2002	LOGDEBUG("using Naive-Bayes factors");
2003	}
2004	}
2005
2006	if (CTX->algorithms & DSA_GRAHAM) {
2007	factor = factor_bayes;
2008	if (((CTX->algorithms & DSP_MARKOV) && bay_result > 0.5000) \|\|
2009	(!(CTX->algorithms & DSP_MARKOV) && bay_result >= 0.9))
2010	{
2011	CTX->result = DSR_ISSPAM;
2012	CTX->probability = bay_result;
2013	CTX->factors = factor;
2014	LOGDEBUG("using Graham factors");
2015	}
2016	}
2017
2018	if (CTX->algorithms & DSA_BURTON) {
2019	factor = factor_altbayes;
2020	if (((CTX->algorithms & DSP_MARKOV) && abay_result > 0.5000) \|\|
2021	(!(CTX->algorithms & DSP_MARKOV) && abay_result >= 0.9))
2022	{
2023	CTX->result = DSR_ISSPAM;
2024	CTX->probability = abay_result;
2025	if (!CTX->factors) {
2026	CTX->factors = factor;
2027	LOGDEBUG("using Burton factors");
2028	}
2029	}
2030	}
2031
2032	if (CTX->algorithms & DSA_ROBINSON) {
2033	factor = factor_rob;
2034	if (((CTX->algorithms & DSP_MARKOV) && rob_result > 0.5000) \|\|
2035	(!(CTX->algorithms & DSP_MARKOV) && rob_result >= ROB_CUTOFF))
2036	{
2037	CTX->result = DSR_ISSPAM;
2038	if (CTX->probability < 0)
2039	CTX->probability = rob_result;
2040	if (!CTX->factors) {
2041	CTX->factors = factor;
2042	LOGDEBUG("using Robinson-Geom factors");
2043	}
2044	}
2045	}
2046
2047	if (CTX->algorithms & DSA_CHI_SQUARE) {
2048	factor = factor_chi;
2049	if (((CTX->algorithms & DSP_MARKOV) && chi_result > 0.5000) \|\|
2050	(!(CTX->algorithms & DSP_MARKOV) && chi_result >= CHI_CUTOFF))
2051	{
2052	CTX->result = DSR_ISSPAM;
2053	if (CTX->probability < 0)
2054	CTX->probability = chi_result;
2055	if (!CTX->factors) {
2056	CTX->factors = factor;
2057	LOGDEBUG("using Chi-Square factors");
2058	}
2059	}
2060	}
2061
2062	if (!CTX->factors) {
2063	CTX->factors = factor;
2064	LOGDEBUG("no factors specified; using default");
2065	}
2066	}
2067
2068	if (CTX->factors != factor_nbayes)
2069	_ds_factor_destroy(factor_nbayes);
2070	if (CTX->factors != factor_bayes)
2071	_ds_factor_destroy(factor_bayes);
2072	if (CTX->factors != factor_altbayes)
2073	_ds_factor_destroy(factor_altbayes);
2074	if (CTX->factors != factor_rob)
2075	_ds_factor_destroy(factor_rob);
2076	if (CTX->factors != factor_chi)
2077	_ds_factor_destroy(factor_chi);
2078
2079	/* If somehow we haven't yet assigned a probability, assign one */
2080	if (CTX->probability == DSP_UNCALCULATED)
2081	{
2082	if (CTX->algorithms & DSA_GRAHAM)
2083	CTX->probability = bay_result;
2084
2085	if (CTX->algorithms & DSA_NAIVE)
2086	CTX->probability = nbay_result;
2087
2088	if (CTX->probability < 0 && CTX->algorithms & DSA_BURTON)
2089	CTX->probability = abay_result;
2090
2091	if (CTX->probability < 0 && CTX->algorithms & DSA_ROBINSON)
2092	CTX->probability = rob_result;
2093
2094	if (CTX->probability < 0 && CTX->algorithms & DSA_CHI_SQUARE)
2095	CTX->probability = chi_result;
2096	}
2097
2098	#ifdef VERBOSE
2099	if (DO_DEBUG && (!(CTX->algorithms & DSP_MARKOV))) {
2100	if (abay_result >= 0.9 && bay_result < 0.9)
2101	{
2102	LOGDEBUG ("CATCH: Burton Bayesian");
2103	}
2104	else if (abay_result < 0.9 && bay_result >= 0.9)
2105	{
2106	LOGDEBUG ("MISS: Burton Bayesian");
2107	}
2108
2109	if (rob_result >= ROB_CUTOFF && bay_result < 0.9)
2110	{
2111	LOGDEBUG ("CATCH: Robinson's");
2112	}
2113	else if (rob_result < ROB_CUTOFF && bay_result >= 0.9)
2114	{
2115	LOGDEBUG ("MISS: Robinson's");
2116	}
2117
2118	if (chi_result >= CHI_CUTOFF && bay_result < 0.9)
2119	{
2120	LOGDEBUG("CATCH: Chi-Square");
2121	}
2122	else if (chi_result < CHI_CUTOFF && bay_result >= 0.9)
2123	{
2124	LOGDEBUG("MISS: Chi-Square");
2125	}
2126	}
2127	#endif
2128
2129	/* Calculate Confidence */
2130
2131	if (CTX->algorithms & DSP_MARKOV) {
2132	if (CTX->result == DSR_ISSPAM)
2133	{
2134	CTX->confidence = CTX->probability;
2135	}
2136	else
2137	{
2138	CTX->confidence = 1.0 - CTX->probability;
2139	}
2140	} else {
2141	if (CTX->result == DSR_ISSPAM)
2142	{
2143	CTX->confidence = rob_result;
2144	}
2145	else
2146	{
2147	CTX->confidence = 1.0 - rob_result;
2148	}
2149	}
2150
2151	LOGDEBUG("Result Confidence: %1.2f", CTX->confidence);
2152	return CTX->result;
2153	}
2154
2155	/*
2156	* _ds_factor()
2157	*
2158	* DESCRIPTION
2159	* Factors a token/value into a set
2160	*
2161	* Adds a token/value pair to a factor set. The factor set of the dominant
2162	* calculation is provided to the client in order to explain libdspam's
2163	* final decision about the message's classification.
2164	*/
2165
2166	int _ds_factor(struct nt set, char token_name, float value) {
2167	struct dspam_factor *f;
2168	f = calloc(1, sizeof(struct dspam_factor));
2169	if (!f)
2170	return EUNKNOWN;
2171	f->token_name = strdup(token_name);
2172	f->value = value;
2173	nt_add(set, (void *) f);
2174	return 0;
2175	}
2176
2177	/*
2178	* _ds_factor_destroy - destroy a factor tree
2179	*
2180	*/
2181
2182	void _ds_factor_destroy(struct nt *factors) {
2183	struct dspam_factor *f;
2184	struct nt_node *node;
2185	struct nt_c c;
2186
2187	if (factors == NULL)
2188	return;
2189
2190	node = c_nt_first(factors, &c);
2191	while(node != NULL) {
2192	f = (struct dspam_factor *) node->ptr;
2193	if (f)
2194	free(f->token_name);
2195	node = c_nt_next(factors, &c);
2196	}
2197	nt_destroy(factors);
2198
2199	return;
2200	}
2201
2202	int libdspam_init(const char *driver) {
2203
2204	#ifndef STATIC_DRIVER
2205	if (driver == NULL) {
2206	LOG(LOG_CRIT, "dlopen() failed: Can not load NULL driver");
2207	return EFAILURE;
2208	} else if (driver) {
2209	if ((_drv_handle = dlopen(driver, RTLD_NOW))==NULL) {
2210	LOG(LOG_CRIT, "dlopen() failed: %s: %s", driver, dlerror());
2211	return EFAILURE;
2212	}
2213	}
2214	#endif
2215
2216	return 0;
2217	}
2218
2219	int libdspam_shutdown(void) {
2220
2221	#ifndef STATIC_DRIVER
2222	if (_drv_handle) {
2223	int r;
2224	if ((r=dlclose(_drv_handle))) {
2225	LOG(LOG_CRIT, "dlclose() failed: %s", dlerror());
2226	return r;
2227	}
2228	}
2229	#endif
2230
2231	return 0;
2232	}
2233
2234	int _ds_instantiate_bnr(
2235	DSPAM_CTX *CTX,
2236	ds_diction_t patterns,
2237	struct nt *stream,
2238	char identifier)
2239	{
2240	float previous_bnr_probs[BNR_SIZE];
2241	ds_term_t ds_term, ds_touch;
2242	struct nt_node *node_nt;
2243	struct nt_c c_nt;
2244	unsigned long long crc;
2245	char bnr_token[64];
2246	int i;
2247
2248	for(i=0;i<BNR_SIZE;i++)
2249	previous_bnr_probs[i] = 0.00000;
2250
2251	node_nt = c_nt_first(stream, &c_nt);
2252	while(node_nt != NULL) {
2253	ds_term = node_nt->ptr;
2254
2255	_ds_calc_stat (CTX, ds_term, &ds_term->s, DTT_DEFAULT, NULL);
2256
2257	for(i=0;i<BNR_SIZE-1;i++)
2258	previous_bnr_probs[i] = previous_bnr_probs[i+1];
2259
2260	previous_bnr_probs[BNR_SIZE-1] = _ds_round(ds_term->s.probability);
2261	sprintf(bnr_token, "bnr.%c\|", identifier);
2262	for(i=0;i<BNR_SIZE;i++) {
2263	char x[6];
2264	snprintf(x, 6, "%01.2f_", previous_bnr_probs[i]);
2265	strlcat(bnr_token, x, sizeof(bnr_token));
2266	}
2267
2268	crc = _ds_getcrc64 (bnr_token);
2269	#ifdef VERBOSE
2270	LOGDEBUG ("BNR pattern instantiated: '%s'", bnr_token);
2271	#endif
2272	ds_touch = ds_diction_touch(patterns, crc, bnr_token, 0);
2273	ds_touch->type = 'B';
2274	node_nt = c_nt_next(stream, &c_nt);
2275	}
2276	return 0;
2277	}
2278
2279	ds_diction_t _ds_apply_bnr (DSPAM_CTX *CTX, ds_diction_t diction) {
2280
2281	/*
2282	Bayesian Noise Reduction - Contextual Symmetry Logic
2283	http://bnr.nuclearelephant.com
2284	*/
2285
2286	ds_diction_t bnr_patterns = ds_diction_create(3079);
2287	struct _ds_spam_stat bnr_tot;
2288	unsigned long long crc;
2289	BNR_CTX BTX_S, BTX_C;
2290	struct nt_node *node_nt;
2291	struct nt_c c_nt;
2292	ds_term_t ds_term, ds_touch;
2293	ds_cursor_t ds_c;
2294
2295	if (!bnr_patterns)
2296	{
2297	LOG (LOG_CRIT, ERR_MEM_ALLOC);
2298	return NULL;
2299	}
2300
2301	BTX_S = bnr_init(BNR_INDEX, 's');
2302	BTX_C = bnr_init(BNR_INDEX, 'c');
2303
2304	if (!BTX_S \|\| !BTX_C) {
2305	LOGDEBUG("bnr_init() failed");
2306	bnr_destroy(BTX_S);
2307	bnr_destroy(BTX_C);
2308	ds_diction_destroy(bnr_patterns);
2309	return NULL;
2310	}
2311
2312	BTX_S->window_size = BNR_SIZE;
2313	BTX_C->window_size = BNR_SIZE;
2314
2315	_ds_instantiate_bnr(CTX, bnr_patterns, diction->order, 's');
2316	_ds_instantiate_bnr(CTX, bnr_patterns, diction->chained_order, 'c');
2317
2318	/* Add BNR totals to the list of load elements */
2319	memset(&bnr_tot, 0, sizeof(struct _ds_spam_stat));
2320	crc = _ds_getcrc64("bnr.t\|");
2321	ds_touch = ds_diction_touch(bnr_patterns, crc, "bnr.t\|", 0);
2322	ds_touch->type = 'B';
2323
2324	/* Load BNR patterns */
2325	LOGDEBUG("Loading %ld BNR patterns", bnr_patterns->items);
2326	if (_ds_getall_spamrecords (CTX, bnr_patterns)) {
2327	LOGDEBUG ("_ds_getall_spamrecords() failed");
2328	ds_diction_destroy(bnr_patterns);
2329	return NULL;
2330	}
2331
2332	/* Perform BNR Processing */
2333
2334	if (CTX->classification == DSR_NONE &&
2335	CTX->_sig_provided == 0 &&
2336	CTX->totals.innocent_learned + CTX->totals.innocent_classified > 2500)
2337	{
2338	int elim;
2339	#ifdef LIBBNR_DEBUG
2340	char fn[MAX_FILENAME_LENGTH];
2341	FILE *file;
2342	#endif
2343
2344	node_nt = c_nt_first(diction->order, &c_nt);
2345	while(node_nt != NULL) {
2346	ds_term = node_nt->ptr;
2347	bnr_add(BTX_S, ds_term->name, ds_term->s.probability);
2348	node_nt = c_nt_next(diction->order, &c_nt);
2349	}
2350
2351	node_nt = c_nt_first(diction->chained_order, &c_nt);
2352	while(node_nt != NULL) {
2353	ds_term = node_nt->ptr;
2354	bnr_add(BTX_C, ds_term->name, ds_term->s.probability);
2355	node_nt = c_nt_next(diction->chained_order, &c_nt);
2356	}
2357
2358	bnr_instantiate(BTX_S);
2359	bnr_instantiate(BTX_C);
2360
2361	/* Calculate pattern p-values */
2362	ds_diction_getstat(bnr_patterns, crc, &bnr_tot);
2363	ds_c = ds_diction_cursor(bnr_patterns);
2364	ds_term = ds_diction_next(ds_c);
2365	while(ds_term) {
2366	_ds_calc_stat(CTX, ds_term, &ds_term->s, DTT_BNR, &bnr_tot);
2367	if (ds_term->name[4] == 's')
2368	bnr_set_pattern(BTX_S, ds_term->name, ds_term->s.probability);
2369	else if (ds_term->name[4] == 'c')
2370	bnr_set_pattern(BTX_C, ds_term->name, ds_term->s.probability);
2371	ds_term = ds_diction_next(ds_c);
2372	}
2373	ds_diction_close(ds_c);
2374
2375	bnr_finalize(BTX_S);
2376	bnr_finalize(BTX_C);
2377
2378	/* Propagate eliminations to DSPAM */
2379
2380	node_nt = c_nt_first(diction->order, &c_nt);
2381	while(node_nt != NULL) {
2382	ds_term = node_nt->ptr;
2383	bnr_get_token(BTX_S, &elim);
2384	if (elim)
2385	ds_term->frequency--;
2386	node_nt = c_nt_next(diction->order, &c_nt);
2387	}
2388
2389	node_nt = c_nt_first(diction->chained_order, &c_nt);
2390	while(node_nt != NULL) {
2391	ds_term = node_nt->ptr;
2392	bnr_get_token(BTX_C, &elim);
2393	if (elim)
2394	ds_term->frequency--;
2395	node_nt = c_nt_next(diction->chained_order, &c_nt);
2396	}
2397
2398	#ifdef LIBBNR_DEBUG
2399	float snr;
2400	if (BTX_S->stream->items + BTX_C->stream->items +
2401	BTX_S->eliminations + BTX_C->eliminations > 0)
2402	{
2403	snr = 100.0*((BTX_S->eliminations + BTX_C->eliminations + 0.0)/
2404	(BTX_S->stream->items + BTX_C->stream->items +
2405	BTX_S->eliminations + BTX_C->eliminations));
2406	} else {
2407	snr = 0;
2408	}
2409
2410	LOGDEBUG("bnr reported snr of %02.3f", snr);
2411
2412	#ifdef LIBBNR_GRAPH_OUTPUT
2413	printf("BEFORE\n\n");
2414	node_nt = c_nt_first(diction->order, &c_nt);
2415	while(node_nt != NULL) {
2416	ds_term = node_nt->ptr;
2417	printf("%1.5f\n", ds_term->s.probability);
2418	node_nt = c_nt_next(diction->order, &c_nt);
2419	}
2420
2421	printf("\n\nAFTER\n\n");
2422	node_nt = c_nt_first(diction->order, &c_nt);
2423	while(node_nt != NULL) {
2424	ds_term = node_nt->ptr;
2425	if (ds_term->frequency > 0)
2426	printf("%1.5f\n", ds_term->s.probability);
2427	node_nt = c_nt_next(diction->order, &c_nt);
2428	}
2429	printf("\n");
2430	#endif
2431
2432
2433	snprintf(fn, sizeof(fn), "%s/bnr.log", LOGDIR);
2434	file = fopen(fn, "a");
2435	if (file != NULL) {
2436	fprintf(file, "-- BNR Filter Process Results --\n");
2437	fprintf(file, "Eliminations:\n");
2438	node_nt = c_nt_first(diction->order, &c_nt);
2439	while(node_nt != NULL) {
2440	ds_term = node_nt->ptr;
2441	if (ds_term->frequency <= 0)
2442	fprintf(file, "%s ", ds_term->name);
2443	node_nt = c_nt_next(diction->order, &c_nt);
2444	}
2445	fprintf(file, "\n[");
2446	node_nt = c_nt_first(diction->order, &c_nt);
2447	while(node_nt != NULL) {
2448	ds_term = node_nt->ptr;
2449	if (ds_term->frequency <= 0)
2450	fprintf(file, "%1.2f ", ds_term->s.probability);
2451	node_nt = c_nt_next(diction->order, &c_nt);
2452	}
2453
2454	fprintf(file, "]\n\nRemaining:\n");
2455	node_nt = c_nt_first(diction->order, &c_nt);
2456	while(node_nt != NULL) {
2457	ds_term = node_nt->ptr;
2458	if (ds_term->frequency > 0)
2459	fprintf(file, "%s ", ds_term->name);
2460	node_nt = c_nt_next(diction->order, &c_nt);
2461	}
2462	fprintf(file, "\n[");
2463	node_nt = c_nt_first(diction->order, &c_nt);
2464	while(node_nt != NULL) {
2465	ds_term = node_nt->ptr;
2466	if (ds_term->frequency > 0)
2467	fprintf(file, "%1.2f ", ds_term->s.probability);
2468	node_nt = c_nt_next(diction->order, &c_nt);
2469	}
2470
2471	fprintf(file, "]\nProcessed for: %s\n\n", CTX->username);
2472
2473	fprintf(file, "-- Chained Tokens --\n");
2474	fprintf(file, "Eliminations:\n");
2475	node_nt = c_nt_first(diction->chained_order, &c_nt);
2476	while(node_nt != NULL) {
2477	ds_term = node_nt->ptr;
2478	if (ds_term->frequency <= 0)
2479	fprintf(file, "%s ", ds_term->name);
2480	node_nt = c_nt_next(diction->chained_order, &c_nt);
2481	}
2482	fprintf(file, "\n[");
2483	node_nt = c_nt_first(diction->chained_order, &c_nt);
2484	while(node_nt != NULL) {
2485	ds_term = node_nt->ptr;
2486	if (ds_term->frequency <= 0)
2487	fprintf(file, "%1.2f ", ds_term->s.probability);
2488	node_nt = c_nt_next(diction->chained_order, &c_nt);
2489	}
2490
2491	fprintf(file, "]\n\nRemaining:\n");
2492	node_nt = c_nt_first(diction->chained_order, &c_nt);
2493	while(node_nt != NULL) {
2494	ds_term = node_nt->ptr;
2495	if (ds_term->frequency > 0)
2496	fprintf(file, "%s ", ds_term->name);
2497	node_nt = c_nt_next(diction->chained_order, &c_nt);
2498	}
2499	fprintf(file, "\n[");
2500	node_nt = c_nt_first(diction->chained_order, &c_nt);
2501	while(node_nt != NULL) {
2502	ds_term = node_nt->ptr;
2503	if (ds_term->frequency > 0)
2504	fprintf(file, "%1.2f ", ds_term->s.probability);
2505	node_nt = c_nt_next(diction->chained_order, &c_nt);
2506	}
2507
2508
2509	fprintf(file, "]\nProcessed for: %s\n\n", CTX->username);
2510	fclose(file);
2511	}
2512	#endif
2513
2514	}
2515
2516	bnr_destroy(BTX_S);
2517	bnr_destroy(BTX_C);
2518
2519	/* Add BNR pattern to token hash */
2520	if (CTX->totals.innocent_learned + CTX->totals.innocent_classified > 1000) {
2521	ds_c = ds_diction_cursor(bnr_patterns);
2522	ds_term = ds_diction_next(ds_c);
2523	while(ds_term) {
2524	ds_term_t t = ds_diction_touch(diction, ds_term->key, ds_term->name, 0);
2525	t->type = 'B';
2526	ds_diction_setstat(diction, ds_term->key, &ds_term->s);
2527	if (t)
2528	t->frequency = 1;
2529
2530	#ifdef LIBBNR_DEBUG
2531	if (fabs(0.5-ds_term->s.probability)>0.25) {
2532	LOGDEBUG("Interesting BNR Pattern: %s %01.5f %lds %ldi",
2533	ds_term->name,
2534	ds_term->s.probability,
2535	ds_term->s.spam_hits,
2536	ds_term->s.innocent_hits);
2537	}
2538	#endif
2539
2540	ds_term = ds_diction_next(ds_c);
2541	}
2542	ds_diction_close(ds_c);
2543	}
2544
2545	return bnr_patterns;
2546	}
2547
2548	int _ds_increment_tokens(DSPAM_CTX *CTX, ds_diction_t diction) {
2549	ds_cursor_t ds_c;
2550	ds_term_t ds_term;
2551	int i = 0;
2552	int occurrence = _ds_match_attribute(CTX->config->attributes,
2553	"ProcessorWordFrequency", "occurrence");
2554
2555	ds_c = ds_diction_cursor(diction);
2556	ds_term = ds_diction_next(ds_c);
2557	while(ds_term) {
2558	unsigned long long crc;
2559
2560	crc = ds_term->key;
2561
2562	/* Create a signature if we're processing a message */
2563
2564	if (CTX->tokenizer != DSZ_SBPH
2565	&& CTX->flags & DSF_SIGNATURE
2566	&& (CTX->operating_mode != DSM_CLASSIFY \|\| !(CTX->_sig_provided)))
2567	{
2568	struct _ds_signature_token t;
2569
2570	memset(&t, 0, sizeof(t));
2571	t.token = crc;
2572	t.frequency = ds_term->frequency;
2573	memcpy ((char *) CTX->signature->data +
2574	(i * sizeof (struct _ds_signature_token)), &t,
2575	sizeof (struct _ds_signature_token));
2576	}
2577
2578	/* If classification was provided, force probabilities */
2579	if (CTX->classification == DSR_ISSPAM)
2580	ds_term->s.probability = 1.00;
2581	else if (CTX->classification == DSR_ISINNOCENT)
2582	ds_term->s.probability = 0.00;
2583
2584	if (ds_term->type == 'D' &&
2585	( CTX->training_mode != DST_TUM \|\|
2586	CTX->source == DSS_ERROR \|\|
2587	CTX->source == DSS_INOCULATION \|\|
2588	ds_term->s.spam_hits + ds_term->s.innocent_hits < 50 \|\|
2589	ds_term->key == diction->whitelist_token \|\|
2590	CTX->confidence < 0.70))
2591	{
2592	ds_term->s.status \|= TST_DIRTY;
2593	}
2594
2595	if (ds_term->type == 'B' &&
2596	CTX->totals.innocent_learned + CTX->totals.innocent_classified > 500 &&
2597	CTX->flags & DSF_NOISE &&
2598	CTX->_sig_provided == 0)
2599	{
2600	ds_term->s.status \|= TST_DIRTY;
2601	}
2602
2603	/* SPAM */
2604	if (CTX->result == DSR_ISSPAM)
2605	{
2606	/* Inoculations increase token count considerably */
2607	if (CTX->source == DSS_INOCULATION)
2608	{
2609	if (ds_term->s.innocent_hits < 2 && ds_term->s.spam_hits < 5)
2610	ds_term->s.spam_hits += 5;
2611	else
2612	ds_term->s.spam_hits += 2;
2613	}
2614
2615	/* Standard increase */
2616	else
2617	{
2618	if (CTX->flags & DSF_UNLEARN) {
2619	if (CTX->classification == DSR_ISSPAM)
2620	{
2621	if (occurrence)
2622	{
2623	ds_term->s.spam_hits -= ds_term->frequency;
2624	if (ds_term->s.spam_hits < 0)
2625	ds_term->s.spam_hits = 0;
2626	} else {
2627	ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
2628	}
2629	}
2630	} else {
2631	if (occurrence)
2632	{
2633	ds_term->s.spam_hits += ds_term->frequency;
2634	} else {
2635	ds_term->s.spam_hits++;
2636	}
2637	}
2638	}
2639
2640	if (SPAM_MISS(CTX) &&
2641	!(CTX->flags & DSF_UNLEARN) &&
2642	CTX->training_mode != DST_TOE &&
2643	CTX->training_mode != DST_NOTRAIN)
2644	{
2645	if (occurrence)
2646	{
2647	ds_term->s.innocent_hits -= ds_term->frequency;
2648	if (ds_term->s.innocent_hits < 0)
2649	ds_term->s.innocent_hits = 0;
2650	} else {
2651	ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
2652	}
2653	}
2654	}
2655
2656	/* INNOCENT */
2657	else
2658	{
2659	if (CTX->flags & DSF_UNLEARN) {
2660	if (CTX->classification == DSR_ISINNOCENT)
2661	{
2662	if (occurrence)
2663	{
2664	ds_term->s.innocent_hits -= ds_term->frequency;
2665	if (ds_term->s.innocent_hits < 0)
2666	ds_term->s.innocent_hits = 0;
2667	} else {
2668	ds_term->s.innocent_hits -= (ds_term->s.innocent_hits>0) ? 1:0;
2669	}
2670	}
2671	} else {
2672	if (occurrence)
2673	{
2674	ds_term->s.innocent_hits += ds_term->frequency;
2675	} else {
2676	ds_term->s.innocent_hits++;
2677	}
2678	}
2679
2680	if (FALSE_POSITIVE(CTX) &&
2681	!(CTX->flags & DSF_UNLEARN) &&
2682	CTX->training_mode != DST_TOE &&
2683	CTX->training_mode != DST_NOTRAIN)
2684	{
2685
2686	if (occurrence)
2687	{
2688	ds_term->s.spam_hits -= ds_term->frequency;
2689	if (ds_term->s.spam_hits < 0)
2690	ds_term->s.spam_hits = 0;
2691	} else {
2692	ds_term->s.spam_hits -= (ds_term->s.spam_hits>0) ? 1:0;
2693	}
2694
2695	}
2696	}
2697
2698	ds_term = ds_diction_next(ds_c);
2699	i++;
2700	}
2701	ds_diction_close(ds_c);
2702	return 0;
2703	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: