Context Navigation

source: npl/mailserver/dspam/dspam-3.10.2/src/tokenizer.c @ c5c522c

gcc484ntopperl-5.22

Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 9 years ago
initial commit, transferred from cleaned syn3 svn tree
Property mode set to `100644`
File size: 23.3 KB

Line
1	/* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */
2
3	/*
4	DSPAM
5	COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7	This program is free software: you can redistribute it and/or modify
8	it under the terms of the GNU Affero General Public License as
9	published by the Free Software Foundation, either version 3 of the
10	License, or (at your option) any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU Affero General Public License for more details.
16
17	You should have received a copy of the GNU Affero General Public License
18	along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20	*/
21
22	/*
23	* tokenizer.c - tokenizer functions
24	*
25	* DESCRIPTION
26	* The tokenizer subroutines are responsible for decomposing a message into
27	* its colloquial components. All components are stored collectively in
28	* a diction object, passed into the function.
29	*
30	*/
31
32	#ifdef HAVE_CONFIG_H
33	#include <auto-config.h>
34	#endif
35
36	#include <stdio.h>
37	#include <stdlib.h>
38	#include <math.h>
39	#include <ctype.h>
40	#include <errno.h>
41	#include <string.h>
42	#ifdef HAVE_UNISTD_H
43	#include <unistd.h>
44	#endif
45	#include <sys/types.h>
46	#include <sys/stat.h>
47
48	#ifdef TIME_WITH_SYS_TIME
49	# include <sys/time.h>
50	# include <time.h>
51	#else
52	# ifdef HAVE_SYS_TIME_H
53	# include <sys/time.h>
54	# else
55	# include <time.h>
56	# endif
57	#endif
58
59	#include "config.h"
60	#include "tokenizer.h"
61	#include "util.h"
62	#include "libdspam.h"
63	#include "language.h"
64
65	/*
66	* _ds_tokenize() - tokenize the message
67	*
68	* DESCRIPTION
69	* tokenizes the supplied message
70	*
71	* INPUT ARGUMENTS
72	* DSPAM_CTX *CTX pointer to context
73	* char *header pointer to message header
74	* char *body pointer to message body
75	* ds_diction_t diction to store components
76	*
77	* RETURN VALUES
78	* standard errors on failure
79	* zero if successful
80	*
81	*/
82
83	int
84	_ds_tokenize (DSPAM_CTX * CTX, char headers, char body, ds_diction_t diction)
85	{
86	if (diction == NULL)
87	return EINVAL;
88
89	if (CTX->tokenizer == DSZ_SBPH \|\| CTX->tokenizer == DSZ_OSB)
90	return _ds_tokenize_sparse(CTX, headers, body, diction);
91	else
92	return _ds_tokenize_ngram(CTX, headers, body, diction);
93	}
94
95	int _ds_tokenize_ngram(
96	DSPAM_CTX *CTX,
97	char *headers,
98	char *body,
99	ds_diction_t diction)
100	{
101	char token; / current token */
102	char previous_token = NULL; / used for bigrams (chained tokens) */
103	char line = NULL; / header broken up into lines */
104	char *ptrptr;
105	char heading[128]; /* current heading */
106	int l, tokenizer = CTX->tokenizer;
107
108	struct nt *header = NULL;
109	struct nt_node *node_nt;
110	struct nt_c c_nt;
111
112	/* Tokenize URLs in message */
113
114	if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) {
115	_ds_url_tokenize(diction, body, "http://");
116	_ds_url_tokenize(diction, body, "www.");
117	_ds_url_tokenize(diction, body, "href=");
118	}
119
120	/*
121	* Header Tokenization
122	*/
123
124	header = nt_create (NT_CHAR);
125	if (header == NULL)
126	{
127	LOG (LOG_CRIT, ERR_MEM_ALLOC);
128	return EUNKNOWN;
129	}
130
131	line = strtok_r (headers, "\n", &ptrptr);
132	while (line) {
133	nt_add (header, line);
134	line = strtok_r (NULL, "\n", &ptrptr);
135	}
136
137	node_nt = c_nt_first (header, &c_nt);
138	heading[0] = 0;
139	while (node_nt) {
140	int multiline;
141
142	#ifdef VERBOSE
143	LOGDEBUG("processing line: %s", node_nt->ptr);
144	#endif
145
146	line = node_nt->ptr;
147	token = strtok_r (line, ":", &ptrptr);
148	if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
149	{
150	multiline = 0;
151	strlcpy (heading, token, 128);
152	previous_token = NULL;
153	} else {
154	multiline = 1;
155	}
156
157	#ifdef VERBOSE
158	LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
159	#endif
160
161	if (CTX->flags & DSF_WHITELIST) {
162	/* Use the entire From: line for auto-whitelisting */
163
164	if (!strcmp(heading, "From")) {
165	char wl[256];
166	char *fromline = line + 5;
167	unsigned long long whitelist_token;
168
169	if (fromline[0] == 32)
170	fromline++;
171	snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
172	whitelist_token = _ds_getcrc64(wl);
173	ds_diction_touch(diction, whitelist_token, wl, 0);
174	diction->whitelist_token = whitelist_token;
175	}
176	}
177
178	/* Received headers use a different set of delimiters to preserve things
179	like ip addresses */
180
181	token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr);
182
183	while (token)
184	{
185	l = strlen(token);
186
187	if (l >= 1 && l < 50)
188	{
189	#ifdef VERBOSE
190	LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
191	#endif
192
193	/* Process "current" token */
194	if (!_ds_process_header_token
195	(CTX, token, previous_token, diction, heading) &&
196	(tokenizer == DSZ_CHAIN))
197	{
198	previous_token = token;
199	}
200	}
201
202	token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr);
203	}
204
205	previous_token = NULL;
206	node_nt = c_nt_next (header, &c_nt);
207	}
208
209	nt_destroy (header);
210
211	/*
212	* Body Tokenization
213	*/
214
215	#ifdef VERBOSE
216	LOGDEBUG("parsing message body");
217	#endif
218
219	token = strtok_r (body, DELIMITERS, &ptrptr);
220	while (token != NULL)
221	{
222	l = strlen (token);
223	if (l >= 1 && l < 50)
224	{
225	#ifdef VERBOSE
226	LOGDEBUG ("Processing body token '%s'", token);
227	#endif
228
229	/* Process "current" token */
230	if ( !_ds_process_body_token(CTX, token, previous_token, diction)
231	&& tokenizer == DSZ_CHAIN)
232	{
233	previous_token = token;
234	}
235	}
236	token = strtok_r (NULL, DELIMITERS, &ptrptr);
237	}
238
239	#ifdef VERBOSE
240	LOGDEBUG("Finished tokenizing (ngram) message");
241	#endif
242
243	/* Final token reassembly (anything left in the buffer) */
244
245	return 0;
246	}
247
248	int _ds_tokenize_sparse(
249	DSPAM_CTX *CTX,
250	char *headers,
251	char *body,
252	ds_diction_t diction)
253	{
254	int i;
255	char token; / current token */
256	char previous_tokens[SPARSE_WINDOW_SIZE]; / sparse chain */
257
258	char line = NULL; / header broken up into lines */
259	char *ptrptr;
260	char *bitpattern;
261
262	char heading[128]; /* current heading */
263	int l;
264
265	struct nt *header = NULL;
266	struct nt_node *node_nt;
267	struct nt_c c_nt;
268
269	for(i=0;i<SPARSE_WINDOW_SIZE;i++)
270	previous_tokens[i] = NULL;
271
272	bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE));
273
274	/* Tokenize URLs in message */
275
276	if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))
277	{
278	_ds_url_tokenize(diction, body, "http://");
279	_ds_url_tokenize(diction, body, "www.");
280	_ds_url_tokenize(diction, body, "href=");
281	}
282
283	/*
284	* Header Tokenization
285	*/
286
287	header = nt_create (NT_CHAR);
288	if (header == NULL)
289	{
290	LOG (LOG_CRIT, ERR_MEM_ALLOC);
291	free(bitpattern);
292	return EUNKNOWN;
293	}
294
295	line = strtok_r (headers, "\n", &ptrptr);
296	while (line) {
297	nt_add (header, line);
298	line = strtok_r (NULL, "\n", &ptrptr);
299	}
300
301	node_nt = c_nt_first (header, &c_nt);
302	heading[0] = 0;
303	while (node_nt) {
304	int multiline;
305
306	#ifdef VERBOSE
307	LOGDEBUG("processing line: %s", node_nt->ptr);
308	#endif
309
310	_ds_sparse_clear(previous_tokens);
311
312	line = node_nt->ptr;
313	token = strtok_r (line, ":", &ptrptr);
314	if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
315	{
316	multiline = 0;
317	strlcpy (heading, token, 128);
318	_ds_sparse_clear(previous_tokens);
319	} else {
320	multiline = 1;
321	}
322
323	#ifdef VERBOSE
324	LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
325	#endif
326
327	if (CTX->flags & DSF_WHITELIST) {
328	/* Use the entire From: line for auto-whitelisting */
329
330	if (!strcmp(heading, "From")) {
331	char wl[256];
332	char *fromline = line + 5;
333	unsigned long long whitelist_token;
334
335	if (fromline[0] == 32)
336	fromline++;
337	snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
338	whitelist_token = _ds_getcrc64(wl);
339	ds_diction_touch(diction, whitelist_token, wl, 0);
340	diction->whitelist_token = whitelist_token;
341	}
342	}
343
344	/* Received headers use a different set of delimiters to preserve things
345	like ip addresses */
346
347	token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
348
349	while (token)
350	{
351	l = strlen(token);
352
353	if (l > 0 && l < 50)
354	{
355	#ifdef VERBOSE
356	LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
357	#endif
358	_ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern);
359	}
360
361	token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
362	}
363
364	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
365	_ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern);
366	}
367
368	_ds_sparse_clear(previous_tokens);
369	node_nt = c_nt_next (header, &c_nt);
370	}
371	nt_destroy (header);
372
373	/*
374	* Body Tokenization
375	*/
376
377	#ifdef VERBOSE
378	LOGDEBUG("parsing message body");
379	#endif
380
381	token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr);
382	while (token != NULL)
383	{
384	l = strlen (token);
385	if (l > 0 && l < 50)
386	{
387	#ifdef VERBOSE
388	LOGDEBUG ("Processing body token '%s'", token);
389	#endif
390
391	/* Process "current" token */
392	_ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern);
393	}
394	token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr);
395	}
396
397	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
398	_ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern);
399	}
400
401	_ds_sparse_clear(previous_tokens);
402
403	free(bitpattern);
404
405	#ifdef VERBOSE
406	LOGDEBUG("Finished tokenizing (sparse) message");
407	#endif
408
409	return 0;
410	}
411
412	/*
413	* _ds_{process,map}_{header,body}_token()
414	*
415	* DESCRIPTION
416	* Token processing and mapping functions
417	* _ds_process_header_token
418	* _ds_process_body_token
419	* _ds_map_header_token
420	* _ds_map_body_token
421	*
422	* These functions are responsible for converting the input words into
423	* full blown tokens with CRCs, probabilities, and producing variants
424	* based on the tokenizer approach applied.
425	*/
426
427	int
428	_ds_process_header_token (DSPAM_CTX * CTX, char *token,
429	const char *previous_token, ds_diction_t diction,
430	const char *heading)
431	{
432	char combined_token[256];
433	unsigned long long crc;
434	char *tweaked_token;
435
436	if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
437	return 0;
438
439	if (!strncmp(heading, "X-DSPAM-", 8))
440	return 0;
441
442	/* This is where we used to ignore certain headings */
443
444	if (heading[0] != 0)
445	snprintf (combined_token, sizeof (combined_token),
446	"%s*%s", heading, token);
447	else
448	strlcpy (combined_token, token, sizeof (combined_token));
449
450	tweaked_token = _ds_truncate_token(token);
451	if (tweaked_token == NULL)
452	return EUNKNOWN;
453
454	snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token);
455
456	crc = _ds_getcrc64 (combined_token);
457	#ifdef VERBOSE
458	LOGDEBUG ("Token Hit: '%s'", combined_token);
459	#endif
460	ds_diction_touch(diction, crc, combined_token, 0);
461
462	if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
463	{
464	char *tweaked_previous;
465
466	tweaked_previous = _ds_truncate_token(previous_token);
467	if (tweaked_previous == NULL) {
468	free(tweaked_token);
469	return EUNKNOWN;
470	}
471
472	snprintf (combined_token, sizeof (combined_token),
473	"%s*%s+%s", heading, tweaked_previous, tweaked_token);
474	crc = _ds_getcrc64 (combined_token);
475
476	ds_diction_touch(diction, crc, combined_token, DSD_CHAINED);
477	free(tweaked_previous);
478	}
479
480	free(tweaked_token);
481	return 0;
482	}
483
484	int
485	_ds_process_body_token (DSPAM_CTX * CTX, char *token,
486	const char *previous_token, ds_diction_t diction)
487	{
488	char combined_token[256];
489	unsigned long long crc;
490	char *tweaked_token;
491
492	tweaked_token = _ds_truncate_token(token);
493	if (tweaked_token == NULL)
494	return EUNKNOWN;
495
496	crc = _ds_getcrc64 (tweaked_token);
497
498	ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT);
499
500	if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
501	{
502	char *tweaked_previous = _ds_truncate_token(previous_token);
503	if (tweaked_previous == NULL) {
504	free(tweaked_token);
505	return EUNKNOWN;
506	}
507
508	snprintf (combined_token, sizeof (combined_token), "%s+%s",
509	tweaked_previous, tweaked_token);
510	crc = _ds_getcrc64 (combined_token);
511
512	ds_diction_touch(diction, crc, combined_token, DSD_CHAINED \| DSD_CONTEXT);
513	free(tweaked_previous);
514	}
515	free(tweaked_token);
516
517	return 0;
518	}
519
520
521	int
522	_ds_map_header_token (DSPAM_CTX * CTX, char *token,
523	char **previous_tokens, ds_diction_t diction,
524	const char heading, const char bitpattern)
525	{
526	int i, t, keylen, breadth;
527	u_int32_t mask;
528	unsigned long long crc;
529	char key[256];
530	int active = 0, top, tokenizer = CTX->tokenizer;
531
532	if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
533	return 0;
534
535	if (!strncmp(heading, "X-DSPAM-", 8))
536	return 0;
537
538	/* Shift all previous tokens up */
539	for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
540	previous_tokens[i] = previous_tokens[i+1];
541	if (previous_tokens[i])
542	active++;
543	}
544
545	previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
546
547	if (token)
548	active++;
549
550	breadth = _ds_pow2(active);
551
552	/* Iterate and generate all keys necessary */
553	for (mask=0; mask < (u_int32_t)breadth; mask++) {
554	int terms = 0;
555
556	key[0] = 0;
557	keylen = 0;
558	t = 0;
559	top = 1;
560
561	/* Each Bit */
562	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
563
564	if (t) {
565	if ((size_t)keylen < (sizeof(key)-1)) {
566	key[keylen] = '+';
567	key[++keylen] = 0;
568	}
569	}
570
571	if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
572	if (previous_tokens[i] == NULL \|\| previous_tokens[i][0] == 0) {
573	if ((size_t)keylen < (sizeof(key)-1)) {
574	key[keylen] = '#';
575	key[++keylen] = 0;
576	}
577	}
578	else
579	{
580	int tl = strlen(previous_tokens[i]);
581	if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
582	strcpy(key+keylen, previous_tokens[i]);
583	keylen += tl;
584	}
585	terms++;
586	}
587	} else {
588	if ((size_t)keylen < (sizeof(key)-1)) {
589	key[keylen] = '#';
590	key[++keylen] = 0;
591	}
592	}
593	t++;
594	}
595
596	/* If the bucket has at least 1 literal, hit it */
597	if ((tokenizer == DSZ_SBPH && terms != 0) \|\|
598	(tokenizer == DSZ_OSB && terms == 2))
599	{
600	char hkey[256];
601	char *k = key;
602	while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
603	key[keylen-2] = 0;
604	keylen -=2;
605	}
606	while(!strncmp(k, "#+", 2)) {
607	top = 0;
608	k+=2;
609	keylen -= 2;
610	}
611
612	if (top) {
613	snprintf(hkey, sizeof(hkey), "%s*%s", heading, k);
614	crc = _ds_getcrc64(hkey);
615	ds_diction_touch(diction, crc, hkey, DSD_CONTEXT);
616	}
617	}
618	}
619
620	return 0;
621	}
622
623	int
624	_ds_map_body_token (
625	DSPAM_CTX * CTX,
626	char *token,
627	char **previous_tokens,
628	ds_diction_t diction,
629	const char *bitpattern)
630	{
631	int i, t, keylen, breadth;
632	int top, tokenizer = CTX->tokenizer;
633	unsigned long long crc;
634	char key[256];
635	int active = 0;
636	u_int32_t mask;
637
638	/* Shift all previous tokens up */
639	for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
640	previous_tokens[i] = previous_tokens[i+1];
641	if (previous_tokens[i])
642	active++;
643	}
644
645	previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
646	if (token)
647	active++;
648
649	breadth = _ds_pow2(active);
650
651	/* Iterate and generate all keys necessary */
652
653	for(mask=0;mask < (u_int32_t)breadth;mask++) {
654	int terms = 0;
655	t = 0;
656
657	key[0] = 0;
658	keylen = 0;
659	top = 1;
660
661	/* Each Bit */
662	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
663	if (t) {
664	if ((size_t)keylen < (sizeof(key)-1)) {
665	key[keylen] = '+';
666	key[++keylen] = 0;
667	}
668	}
669	if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
670	if (previous_tokens[i] == NULL \|\| previous_tokens[i][0] == 0) {
671	if ((size_t)keylen < (sizeof(key)-1)) {
672	key[keylen] = '#';
673	key[++keylen] = 0;
674	}
675	}
676	else
677	{
678	int tl = strlen(previous_tokens[i]);
679	if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
680	strcpy(key+keylen, previous_tokens[i]);
681	keylen += tl;
682	}
683	terms++;
684	}
685	} else {
686	if ((size_t)keylen < (sizeof(key)-1)) {
687	key[keylen] = '#';
688	key[++keylen] = 0;
689	}
690	}
691	t++;
692	}
693
694	/* If the bucket has at least 1 literal, hit it */
695	if ((tokenizer == DSZ_SBPH && terms != 0) \|\|
696	(tokenizer == DSZ_OSB && terms == 2))
697	{
698	char *k = key;
699	while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
700	key[keylen-2] = 0;
701	keylen -=2;
702	}
703	while(!strncmp(k, "#+", 2)) {
704	top = 0;
705	k+=2;
706	keylen -=2;
707	}
708
709	if (top) {
710	crc = _ds_getcrc64(k);
711	ds_diction_touch(diction, crc, k, DSD_CONTEXT);
712	}
713	}
714	}
715
716	return 0;
717	}
718
719	/*
720	* _ds_degenerate_message()
721	*
722	* DESCRIPTION
723	* Degenerate the message into headers, body and tokenizable pieces
724	*
725	* This function is responsible for analyzing the actualized message and
726	* degenerating it into only the components which are tokenizable. This
727	* process effectively eliminates much HTML noise, special symbols, or
728	* other non-tokenizable/non-desirable components. What is left is the
729	* bulk of the message and only desired tags, URLs, and other data.
730	*
731	* INPUT ARGUMENTS
732	* header pointer to buffer containing headers
733	* body pointer to buffer containing message body
734	*/
735
736	int _ds_degenerate_message(DSPAM_CTX CTX, buffer header, buffer * body)
737	{
738	char *decode = NULL;
739	struct nt_node node_nt, node_header;
740	struct nt_c c_nt, c_nt2;
741	int i = 0;
742	char heading[1024];
743
744	if (! CTX->message)
745	{
746	LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL");
747	return EUNKNOWN;
748	}
749
750	/* Iterate through each component and create large header/body buffers */
751
752	node_nt = c_nt_first (CTX->message->components, &c_nt);
753	while (node_nt != NULL)
754	{
755	struct _ds_message_part block = (struct _ds_message_part ) node_nt->ptr;
756
757	#ifdef VERBOSE
758	LOGDEBUG ("Processing component %d", i);
759	#endif
760
761	if (! block->headers \|\| ! block->headers->items)
762	{
763	#ifdef VERBOSE
764	LOGDEBUG (" : End of Message Identifier");
765	#endif
766	}
767
768	else
769	{
770	struct _ds_header_field *current_header;
771
772	/* Accumulate the headers */
773	node_header = c_nt_first (block->headers, &c_nt2);
774	while (node_header != NULL)
775	{
776	current_header = (struct _ds_header_field *) node_header->ptr;
777	snprintf (heading, sizeof (heading),
778	"%s: %s\n", current_header->heading,
779	current_header->data);
780	buffer_cat (header, heading);
781	node_header = c_nt_next (block->headers, &c_nt2);
782	}
783
784	decode = block->body->data;
785
786	if (block->media_type == MT_TEXT \|\|
787	block->media_type == MT_MESSAGE \|\|
788	block->media_type == MT_UNKNOWN \|\|
789	(block->media_type == MT_MULTIPART && !i))
790	{
791	/* Accumulate the bodies, skip attachments */
792
793	if (
794	( block->encoding == EN_BASE64
795	\|\| block->encoding == EN_QUOTED_PRINTABLE)
796	&& ! block->original_signed_body)
797	{
798	if (block->content_disposition != PCD_ATTACHMENT)
799	{
800	LOGDEBUG ("decoding message block from encoding type %d",
801	block->encoding);
802	decode = _ds_decode_block (block);
803	}
804	}
805
806	/* We found a tokenizable body component, add prefilters */
807
808	if (decode)
809	{
810	char *decode2 = NULL;
811	char *decode3 = NULL;
812
813	/* -- PREFILTERS BEGIN -- */
814
815	/* Hexadecimal 8-Bit Encodings */
816
817	if (block->encoding == EN_8BIT) {
818	decode2 = _ds_decode_hex8bit(decode);
819	} else {
820	decode2 = strdup(decode);
821	}
822
823	/* HTML-Specific Filters */
824
825	if (decode2) {
826	if (block->media_subtype == MST_HTML) {
827	decode3 = _ds_strip_html(decode2);
828	} else {
829	decode3 = strdup(decode2);
830	}
831	free(decode2);
832	}
833
834	/* -- PREFILTERS END -- */
835
836	if (decode3) {
837	buffer_cat (body, decode3);
838	free(decode3);
839	}
840
841	/* If we've decoded the body, save the original copy */
842	if (decode != block->body->data)
843	{
844	block->original_signed_body = block->body;
845	block->body = buffer_create (decode);
846	free (decode);
847	}
848	}
849	}
850	}
851	#ifdef VERBOSE
852	LOGDEBUG ("Getting next message component");
853	#endif
854	node_nt = c_nt_next (CTX->message->components, &c_nt);
855	i++;
856	} /* while (node_nt != NULL) */
857
858	if (header->data == NULL)
859	buffer_cat (header, " ");
860
861	if (body->data == NULL)
862	buffer_cat (body, " ");
863
864	return 0;
865	}
866
867	int _ds_url_tokenize(ds_diction_t diction, char body, const char key)
868	{
869	char token, url_ptr, url_token, ptr;
870	char combined_token[256];
871	unsigned long long crc;
872	int key_len = strlen(key);
873
874	#ifdef VERBOSE
875	LOGDEBUG("scanning for urls: %s\n", key);
876	#endif
877	if (!body)
878	return EINVAL;
879	url_ptr = body;
880
881	token = strcasestr(url_ptr, key);
882	while (token != NULL)
883	{
884	int i = 0, old;
885
886	while(token[i]
887	&& token[i] > 32
888	&& token[i] != '>'
889	&& ((token[i] != '\"' && token[i] != '\'') \|\| i <= key_len))
890	i++;
891	old = token[i];
892	token[i] = 0; /* parse in place */
893
894	/* Tokenize URL */
895	url_token = strtok_r (token, DELIMITERS, &ptr);
896	while (url_token != NULL)
897	{
898	snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token);
899	crc = _ds_getcrc64 (combined_token);
900	ds_diction_touch(diction, crc, combined_token, 0);
901	url_token = strtok_r (NULL, DELIMITERS, &ptr);
902	}
903	memset (token, 32, i);
904	token[i] = old;
905	url_ptr = token + i;
906	token = strcasestr(url_ptr, key);
907	}
908	return 0;
909	}
910
911	/* Truncate tokens with EOT delimiters */
912	char * _ds_truncate_token(const char *token) {
913	char *tweaked;
914	int i;
915
916	if (token == NULL)
917	return NULL;
918
919	tweaked = strdup(token);
920
921	if (tweaked == NULL)
922	return NULL;
923
924	i = strlen(tweaked);
925	while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) {
926	tweaked[i-1] = 0;
927	i--;
928	}
929
930	return tweaked;
931	}
932
933	/*
934	* _ds_spbh_clear
935	*
936	* DESCRIPTION
937	* Clears the SBPH stack
938	*
939	* Clears and frees all of the tokens in the SBPH stack. Used when a
940	* boundary has been crossed (such as a new message header) where
941	* tokens from the previous boundary are no longer useful.
942	*/
943
944	void _ds_sparse_clear(char **previous_tokens) {
945	int i;
946	for(i=0;i<SPARSE_WINDOW_SIZE;i++)
947	previous_tokens[i] = NULL;
948	return;
949	}
950
951	/*
952	* _ds_generate_bitpattern
953	*
954	* DESCRIPTION
955	* Generates a sparse bitpattern for SPARSE_WINDOW_SIZE
956	*
957	* This pattern is then used to create token patterns when using SBPH or OSB
958	*
959	*/
960
961	char *_ds_generate_bitpattern(int breadth) {
962	char *bitpattern;
963	u_int32_t mask;
964	unsigned long exp;
965	int i;
966
967	bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth);
968
969	for(mask=0;mask<(u_int32_t)breadth;mask++) {
970	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
971	exp = (i) ? _ds_pow2(i) : 1;
972	/* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */
973	if (mask & exp)
974	{
975	bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1;
976	}
977	else
978	{
979	bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0;
980	}
981	}
982	}
983
984	return bitpattern;
985	}
986

Note: See TracBrowser for help on using the repository browser.

Download in other formats: