Context Navigation

source: npl/mailserver/dspam/dspam-3.10.2/src/tokenizer.c @ c5c522c

gcc484ntopperl-5.22

Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago
initial commit, transferred from cleaned syn3 svn tree
Property mode set to `100644`
File size: 23.3 KB

Rev	Line
[c5c522c]	1	/* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */
	2
	3	/*
	4	DSPAM
	5	COPYRIGHT (C) 2002-2012 DSPAM PROJECT
	6
	7	This program is free software: you can redistribute it and/or modify
	8	it under the terms of the GNU Affero General Public License as
	9	published by the Free Software Foundation, either version 3 of the
	10	License, or (at your option) any later version.
	11
	12	This program is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	GNU Affero General Public License for more details.
	16
	17	You should have received a copy of the GNU Affero General Public License
	18	along with this program. If not, see <http://www.gnu.org/licenses/>.
	19
	20	*/
	21
	22	/*
	23	* tokenizer.c - tokenizer functions
	24	*
	25	* DESCRIPTION
	26	* The tokenizer subroutines are responsible for decomposing a message into
	27	* its colloquial components. All components are stored collectively in
	28	* a diction object, passed into the function.
	29	*
	30	*/
	31
	32	#ifdef HAVE_CONFIG_H
	33	#include <auto-config.h>
	34	#endif
	35
	36	#include <stdio.h>
	37	#include <stdlib.h>
	38	#include <math.h>
	39	#include <ctype.h>
	40	#include <errno.h>
	41	#include <string.h>
	42	#ifdef HAVE_UNISTD_H
	43	#include <unistd.h>
	44	#endif
	45	#include <sys/types.h>
	46	#include <sys/stat.h>
	47
	48	#ifdef TIME_WITH_SYS_TIME
	49	# include <sys/time.h>
	50	# include <time.h>
	51	#else
	52	# ifdef HAVE_SYS_TIME_H
	53	# include <sys/time.h>
	54	# else
	55	# include <time.h>
	56	# endif
	57	#endif
	58
	59	#include "config.h"
	60	#include "tokenizer.h"
	61	#include "util.h"
	62	#include "libdspam.h"
	63	#include "language.h"
	64
	65	/*
	66	* _ds_tokenize() - tokenize the message
	67	*
	68	* DESCRIPTION
	69	* tokenizes the supplied message
	70	*
	71	* INPUT ARGUMENTS
	72	* DSPAM_CTX *CTX pointer to context
	73	* char *header pointer to message header
	74	* char *body pointer to message body
	75	* ds_diction_t diction to store components
	76	*
	77	* RETURN VALUES
	78	* standard errors on failure
	79	* zero if successful
	80	*
	81	*/
	82
	83	int
	84	_ds_tokenize (DSPAM_CTX * CTX, char headers, char body, ds_diction_t diction)
	85	{
	86	if (diction == NULL)
	87	return EINVAL;
	88
	89	if (CTX->tokenizer == DSZ_SBPH \|\| CTX->tokenizer == DSZ_OSB)
	90	return _ds_tokenize_sparse(CTX, headers, body, diction);
	91	else
	92	return _ds_tokenize_ngram(CTX, headers, body, diction);
	93	}
	94
	95	int _ds_tokenize_ngram(
	96	DSPAM_CTX *CTX,
	97	char *headers,
	98	char *body,
	99	ds_diction_t diction)
	100	{
	101	char token; / current token */
	102	char previous_token = NULL; / used for bigrams (chained tokens) */
	103	char line = NULL; / header broken up into lines */
	104	char *ptrptr;
	105	char heading[128]; /* current heading */
	106	int l, tokenizer = CTX->tokenizer;
	107
	108	struct nt *header = NULL;
	109	struct nt_node *node_nt;
	110	struct nt_c c_nt;
	111
	112	/* Tokenize URLs in message */
	113
	114	if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) {
	115	_ds_url_tokenize(diction, body, "http://");
	116	_ds_url_tokenize(diction, body, "www.");
	117	_ds_url_tokenize(diction, body, "href=");
	118	}
	119
	120	/*
	121	* Header Tokenization
	122	*/
	123
	124	header = nt_create (NT_CHAR);
	125	if (header == NULL)
	126	{
	127	LOG (LOG_CRIT, ERR_MEM_ALLOC);
	128	return EUNKNOWN;
	129	}
	130
	131	line = strtok_r (headers, "\n", &ptrptr);
	132	while (line) {
	133	nt_add (header, line);
	134	line = strtok_r (NULL, "\n", &ptrptr);
	135	}
	136
	137	node_nt = c_nt_first (header, &c_nt);
	138	heading[0] = 0;
	139	while (node_nt) {
	140	int multiline;
	141
	142	#ifdef VERBOSE
	143	LOGDEBUG("processing line: %s", node_nt->ptr);
	144	#endif
	145
	146	line = node_nt->ptr;
	147	token = strtok_r (line, ":", &ptrptr);
	148	if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
	149	{
	150	multiline = 0;
	151	strlcpy (heading, token, 128);
	152	previous_token = NULL;
	153	} else {
	154	multiline = 1;
	155	}
	156
	157	#ifdef VERBOSE
	158	LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
	159	#endif
	160
	161	if (CTX->flags & DSF_WHITELIST) {
	162	/* Use the entire From: line for auto-whitelisting */
	163
	164	if (!strcmp(heading, "From")) {
	165	char wl[256];
	166	char *fromline = line + 5;
	167	unsigned long long whitelist_token;
	168
	169	if (fromline[0] == 32)
	170	fromline++;
	171	snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
	172	whitelist_token = _ds_getcrc64(wl);
	173	ds_diction_touch(diction, whitelist_token, wl, 0);
	174	diction->whitelist_token = whitelist_token;
	175	}
	176	}
	177
	178	/* Received headers use a different set of delimiters to preserve things
	179	like ip addresses */
	180
	181	token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr);
	182
	183	while (token)
	184	{
	185	l = strlen(token);
	186
	187	if (l >= 1 && l < 50)
	188	{
	189	#ifdef VERBOSE
	190	LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
	191	#endif
	192
	193	/* Process "current" token */
	194	if (!_ds_process_header_token
	195	(CTX, token, previous_token, diction, heading) &&
	196	(tokenizer == DSZ_CHAIN))
	197	{
	198	previous_token = token;
	199	}
	200	}
	201
	202	token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr);
	203	}
	204
	205	previous_token = NULL;
	206	node_nt = c_nt_next (header, &c_nt);
	207	}
	208
	209	nt_destroy (header);
	210
	211	/*
	212	* Body Tokenization
	213	*/
	214
	215	#ifdef VERBOSE
	216	LOGDEBUG("parsing message body");
	217	#endif
	218
	219	token = strtok_r (body, DELIMITERS, &ptrptr);
	220	while (token != NULL)
	221	{
	222	l = strlen (token);
	223	if (l >= 1 && l < 50)
	224	{
	225	#ifdef VERBOSE
	226	LOGDEBUG ("Processing body token '%s'", token);
	227	#endif
	228
	229	/* Process "current" token */
	230	if ( !_ds_process_body_token(CTX, token, previous_token, diction)
	231	&& tokenizer == DSZ_CHAIN)
	232	{
	233	previous_token = token;
	234	}
	235	}
	236	token = strtok_r (NULL, DELIMITERS, &ptrptr);
	237	}
	238
	239	#ifdef VERBOSE
	240	LOGDEBUG("Finished tokenizing (ngram) message");
	241	#endif
	242
	243	/* Final token reassembly (anything left in the buffer) */
	244
	245	return 0;
	246	}
	247
	248	int _ds_tokenize_sparse(
	249	DSPAM_CTX *CTX,
	250	char *headers,
	251	char *body,
	252	ds_diction_t diction)
	253	{
	254	int i;
	255	char token; / current token */
	256	char previous_tokens[SPARSE_WINDOW_SIZE]; / sparse chain */
	257
	258	char line = NULL; / header broken up into lines */
	259	char *ptrptr;
	260	char *bitpattern;
	261
	262	char heading[128]; /* current heading */
	263	int l;
	264
	265	struct nt *header = NULL;
	266	struct nt_node *node_nt;
	267	struct nt_c c_nt;
	268
	269	for(i=0;i<SPARSE_WINDOW_SIZE;i++)
	270	previous_tokens[i] = NULL;
	271
	272	bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE));
	273
	274	/* Tokenize URLs in message */
	275
	276	if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))
	277	{
	278	_ds_url_tokenize(diction, body, "http://");
	279	_ds_url_tokenize(diction, body, "www.");
	280	_ds_url_tokenize(diction, body, "href=");
	281	}
	282
	283	/*
	284	* Header Tokenization
	285	*/
	286
	287	header = nt_create (NT_CHAR);
	288	if (header == NULL)
	289	{
	290	LOG (LOG_CRIT, ERR_MEM_ALLOC);
	291	free(bitpattern);
	292	return EUNKNOWN;
	293	}
	294
	295	line = strtok_r (headers, "\n", &ptrptr);
	296	while (line) {
	297	nt_add (header, line);
	298	line = strtok_r (NULL, "\n", &ptrptr);
	299	}
	300
	301	node_nt = c_nt_first (header, &c_nt);
	302	heading[0] = 0;
	303	while (node_nt) {
	304	int multiline;
	305
	306	#ifdef VERBOSE
	307	LOGDEBUG("processing line: %s", node_nt->ptr);
	308	#endif
	309
	310	_ds_sparse_clear(previous_tokens);
	311
	312	line = node_nt->ptr;
	313	token = strtok_r (line, ":", &ptrptr);
	314	if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
	315	{
	316	multiline = 0;
	317	strlcpy (heading, token, 128);
	318	_ds_sparse_clear(previous_tokens);
	319	} else {
	320	multiline = 1;
	321	}
	322
	323	#ifdef VERBOSE
	324	LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
	325	#endif
	326
	327	if (CTX->flags & DSF_WHITELIST) {
	328	/* Use the entire From: line for auto-whitelisting */
	329
	330	if (!strcmp(heading, "From")) {
	331	char wl[256];
	332	char *fromline = line + 5;
	333	unsigned long long whitelist_token;
	334
	335	if (fromline[0] == 32)
	336	fromline++;
	337	snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
	338	whitelist_token = _ds_getcrc64(wl);
	339	ds_diction_touch(diction, whitelist_token, wl, 0);
	340	diction->whitelist_token = whitelist_token;
	341	}
	342	}
	343
	344	/* Received headers use a different set of delimiters to preserve things
	345	like ip addresses */
	346
	347	token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
	348
	349	while (token)
	350	{
	351	l = strlen(token);
	352
	353	if (l > 0 && l < 50)
	354	{
	355	#ifdef VERBOSE
	356	LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
	357	#endif
	358	_ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern);
	359	}
	360
	361	token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
	362	}
	363
	364	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
	365	_ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern);
	366	}
	367
	368	_ds_sparse_clear(previous_tokens);
	369	node_nt = c_nt_next (header, &c_nt);
	370	}
	371	nt_destroy (header);
	372
	373	/*
	374	* Body Tokenization
	375	*/
	376
	377	#ifdef VERBOSE
	378	LOGDEBUG("parsing message body");
	379	#endif
	380
	381	token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr);
	382	while (token != NULL)
	383	{
	384	l = strlen (token);
	385	if (l > 0 && l < 50)
	386	{
	387	#ifdef VERBOSE
	388	LOGDEBUG ("Processing body token '%s'", token);
	389	#endif
	390
	391	/* Process "current" token */
	392	_ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern);
	393	}
	394	token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr);
	395	}
	396
	397	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
	398	_ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern);
	399	}
	400
	401	_ds_sparse_clear(previous_tokens);
	402
	403	free(bitpattern);
	404
	405	#ifdef VERBOSE
	406	LOGDEBUG("Finished tokenizing (sparse) message");
	407	#endif
	408
	409	return 0;
	410	}
	411
	412	/*
	413	* _ds_{process,map}_{header,body}_token()
	414	*
	415	* DESCRIPTION
	416	* Token processing and mapping functions
	417	* _ds_process_header_token
	418	* _ds_process_body_token
	419	* _ds_map_header_token
	420	* _ds_map_body_token
	421	*
	422	* These functions are responsible for converting the input words into
	423	* full blown tokens with CRCs, probabilities, and producing variants
	424	* based on the tokenizer approach applied.
	425	*/
	426
	427	int
	428	_ds_process_header_token (DSPAM_CTX * CTX, char *token,
	429	const char *previous_token, ds_diction_t diction,
	430	const char *heading)
	431	{
	432	char combined_token[256];
	433	unsigned long long crc;
	434	char *tweaked_token;
	435
	436	if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
	437	return 0;
	438
	439	if (!strncmp(heading, "X-DSPAM-", 8))
	440	return 0;
	441
	442	/* This is where we used to ignore certain headings */
	443
	444	if (heading[0] != 0)
	445	snprintf (combined_token, sizeof (combined_token),
	446	"%s*%s", heading, token);
	447	else
	448	strlcpy (combined_token, token, sizeof (combined_token));
	449
	450	tweaked_token = _ds_truncate_token(token);
	451	if (tweaked_token == NULL)
	452	return EUNKNOWN;
	453
	454	snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token);
	455
	456	crc = _ds_getcrc64 (combined_token);
	457	#ifdef VERBOSE
	458	LOGDEBUG ("Token Hit: '%s'", combined_token);
	459	#endif
	460	ds_diction_touch(diction, crc, combined_token, 0);
	461
	462	if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
	463	{
	464	char *tweaked_previous;
	465
	466	tweaked_previous = _ds_truncate_token(previous_token);
	467	if (tweaked_previous == NULL) {
	468	free(tweaked_token);
	469	return EUNKNOWN;
	470	}
	471
	472	snprintf (combined_token, sizeof (combined_token),
	473	"%s*%s+%s", heading, tweaked_previous, tweaked_token);
	474	crc = _ds_getcrc64 (combined_token);
	475
	476	ds_diction_touch(diction, crc, combined_token, DSD_CHAINED);
	477	free(tweaked_previous);
	478	}
	479
	480	free(tweaked_token);
	481	return 0;
	482	}
	483
	484	int
	485	_ds_process_body_token (DSPAM_CTX * CTX, char *token,
	486	const char *previous_token, ds_diction_t diction)
	487	{
	488	char combined_token[256];
	489	unsigned long long crc;
	490	char *tweaked_token;
	491
	492	tweaked_token = _ds_truncate_token(token);
	493	if (tweaked_token == NULL)
	494	return EUNKNOWN;
	495
	496	crc = _ds_getcrc64 (tweaked_token);
	497
	498	ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT);
	499
	500	if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
	501	{
	502	char *tweaked_previous = _ds_truncate_token(previous_token);
	503	if (tweaked_previous == NULL) {
	504	free(tweaked_token);
	505	return EUNKNOWN;
	506	}
	507
	508	snprintf (combined_token, sizeof (combined_token), "%s+%s",
	509	tweaked_previous, tweaked_token);
	510	crc = _ds_getcrc64 (combined_token);
	511
	512	ds_diction_touch(diction, crc, combined_token, DSD_CHAINED \| DSD_CONTEXT);
	513	free(tweaked_previous);
	514	}
	515	free(tweaked_token);
	516
	517	return 0;
	518	}
	519
	520
	521	int
	522	_ds_map_header_token (DSPAM_CTX * CTX, char *token,
	523	char **previous_tokens, ds_diction_t diction,
	524	const char heading, const char bitpattern)
	525	{
	526	int i, t, keylen, breadth;
	527	u_int32_t mask;
	528	unsigned long long crc;
	529	char key[256];
	530	int active = 0, top, tokenizer = CTX->tokenizer;
	531
	532	if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
	533	return 0;
	534
	535	if (!strncmp(heading, "X-DSPAM-", 8))
	536	return 0;
	537
	538	/* Shift all previous tokens up */
	539	for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
	540	previous_tokens[i] = previous_tokens[i+1];
	541	if (previous_tokens[i])
	542	active++;
	543	}
	544
	545	previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
	546
	547	if (token)
	548	active++;
	549
	550	breadth = _ds_pow2(active);
	551
	552	/* Iterate and generate all keys necessary */
	553	for (mask=0; mask < (u_int32_t)breadth; mask++) {
	554	int terms = 0;
	555
	556	key[0] = 0;
	557	keylen = 0;
	558	t = 0;
	559	top = 1;
	560
	561	/* Each Bit */
	562	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
	563
	564	if (t) {
	565	if ((size_t)keylen < (sizeof(key)-1)) {
	566	key[keylen] = '+';
	567	key[++keylen] = 0;
	568	}
	569	}
	570
	571	if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
	572	if (previous_tokens[i] == NULL \|\| previous_tokens[i][0] == 0) {
	573	if ((size_t)keylen < (sizeof(key)-1)) {
	574	key[keylen] = '#';
	575	key[++keylen] = 0;
	576	}
	577	}
	578	else
	579	{
	580	int tl = strlen(previous_tokens[i]);
	581	if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
	582	strcpy(key+keylen, previous_tokens[i]);
	583	keylen += tl;
	584	}
	585	terms++;
	586	}
	587	} else {
	588	if ((size_t)keylen < (sizeof(key)-1)) {
	589	key[keylen] = '#';
	590	key[++keylen] = 0;
	591	}
	592	}
	593	t++;
	594	}
	595
	596	/* If the bucket has at least 1 literal, hit it */
	597	if ((tokenizer == DSZ_SBPH && terms != 0) \|\|
	598	(tokenizer == DSZ_OSB && terms == 2))
	599	{
	600	char hkey[256];
	601	char *k = key;
	602	while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
	603	key[keylen-2] = 0;
	604	keylen -=2;
	605	}
	606	while(!strncmp(k, "#+", 2)) {
	607	top = 0;
	608	k+=2;
	609	keylen -= 2;
	610	}
	611
	612	if (top) {
	613	snprintf(hkey, sizeof(hkey), "%s*%s", heading, k);
	614	crc = _ds_getcrc64(hkey);
	615	ds_diction_touch(diction, crc, hkey, DSD_CONTEXT);
	616	}
	617	}
	618	}
	619
	620	return 0;
	621	}
	622
	623	int
	624	_ds_map_body_token (
	625	DSPAM_CTX * CTX,
	626	char *token,
	627	char **previous_tokens,
	628	ds_diction_t diction,
	629	const char *bitpattern)
	630	{
	631	int i, t, keylen, breadth;
	632	int top, tokenizer = CTX->tokenizer;
	633	unsigned long long crc;
	634	char key[256];
	635	int active = 0;
	636	u_int32_t mask;
	637
	638	/* Shift all previous tokens up */
	639	for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
	640	previous_tokens[i] = previous_tokens[i+1];
	641	if (previous_tokens[i])
	642	active++;
	643	}
	644
	645	previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
	646	if (token)
	647	active++;
	648
	649	breadth = _ds_pow2(active);
	650
	651	/* Iterate and generate all keys necessary */
	652
	653	for(mask=0;mask < (u_int32_t)breadth;mask++) {
	654	int terms = 0;
	655	t = 0;
	656
	657	key[0] = 0;
	658	keylen = 0;
	659	top = 1;
	660
	661	/* Each Bit */
	662	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
	663	if (t) {
	664	if ((size_t)keylen < (sizeof(key)-1)) {
	665	key[keylen] = '+';
	666	key[++keylen] = 0;
	667	}
	668	}
	669	if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
	670	if (previous_tokens[i] == NULL \|\| previous_tokens[i][0] == 0) {
	671	if ((size_t)keylen < (sizeof(key)-1)) {
	672	key[keylen] = '#';
	673	key[++keylen] = 0;
	674	}
	675	}
	676	else
	677	{
	678	int tl = strlen(previous_tokens[i]);
	679	if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
	680	strcpy(key+keylen, previous_tokens[i]);
	681	keylen += tl;
	682	}
	683	terms++;
	684	}
	685	} else {
	686	if ((size_t)keylen < (sizeof(key)-1)) {
	687	key[keylen] = '#';
	688	key[++keylen] = 0;
	689	}
	690	}
	691	t++;
	692	}
	693
	694	/* If the bucket has at least 1 literal, hit it */
	695	if ((tokenizer == DSZ_SBPH && terms != 0) \|\|
	696	(tokenizer == DSZ_OSB && terms == 2))
	697	{
	698	char *k = key;
	699	while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
	700	key[keylen-2] = 0;
	701	keylen -=2;
	702	}
	703	while(!strncmp(k, "#+", 2)) {
	704	top = 0;
	705	k+=2;
	706	keylen -=2;
	707	}
	708
	709	if (top) {
	710	crc = _ds_getcrc64(k);
	711	ds_diction_touch(diction, crc, k, DSD_CONTEXT);
	712	}
	713	}
	714	}
	715
	716	return 0;
	717	}
	718
	719	/*
	720	* _ds_degenerate_message()
	721	*
	722	* DESCRIPTION
	723	* Degenerate the message into headers, body and tokenizable pieces
	724	*
	725	* This function is responsible for analyzing the actualized message and
	726	* degenerating it into only the components which are tokenizable. This
	727	* process effectively eliminates much HTML noise, special symbols, or
	728	* other non-tokenizable/non-desirable components. What is left is the
	729	* bulk of the message and only desired tags, URLs, and other data.
	730	*
	731	* INPUT ARGUMENTS
	732	* header pointer to buffer containing headers
	733	* body pointer to buffer containing message body
	734	*/
	735
	736	int _ds_degenerate_message(DSPAM_CTX CTX, buffer header, buffer * body)
	737	{
	738	char *decode = NULL;
	739	struct nt_node node_nt, node_header;
	740	struct nt_c c_nt, c_nt2;
	741	int i = 0;
	742	char heading[1024];
	743
	744	if (! CTX->message)
	745	{
	746	LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL");
	747	return EUNKNOWN;
	748	}
	749
	750	/* Iterate through each component and create large header/body buffers */
	751
	752	node_nt = c_nt_first (CTX->message->components, &c_nt);
	753	while (node_nt != NULL)
	754	{
	755	struct _ds_message_part block = (struct _ds_message_part ) node_nt->ptr;
	756
	757	#ifdef VERBOSE
	758	LOGDEBUG ("Processing component %d", i);
	759	#endif
	760
	761	if (! block->headers \|\| ! block->headers->items)
	762	{
	763	#ifdef VERBOSE
	764	LOGDEBUG (" : End of Message Identifier");
	765	#endif
	766	}
	767
	768	else
	769	{
	770	struct _ds_header_field *current_header;
	771
	772	/* Accumulate the headers */
	773	node_header = c_nt_first (block->headers, &c_nt2);
	774	while (node_header != NULL)
	775	{
	776	current_header = (struct _ds_header_field *) node_header->ptr;
	777	snprintf (heading, sizeof (heading),
	778	"%s: %s\n", current_header->heading,
	779	current_header->data);
	780	buffer_cat (header, heading);
	781	node_header = c_nt_next (block->headers, &c_nt2);
	782	}
	783
	784	decode = block->body->data;
	785
	786	if (block->media_type == MT_TEXT \|\|
	787	block->media_type == MT_MESSAGE \|\|
	788	block->media_type == MT_UNKNOWN \|\|
	789	(block->media_type == MT_MULTIPART && !i))
	790	{
	791	/* Accumulate the bodies, skip attachments */
	792
	793	if (
	794	( block->encoding == EN_BASE64
	795	\|\| block->encoding == EN_QUOTED_PRINTABLE)
	796	&& ! block->original_signed_body)
	797	{
	798	if (block->content_disposition != PCD_ATTACHMENT)
	799	{
	800	LOGDEBUG ("decoding message block from encoding type %d",
	801	block->encoding);
	802	decode = _ds_decode_block (block);
	803	}
	804	}
	805
	806	/* We found a tokenizable body component, add prefilters */
	807
	808	if (decode)
	809	{
	810	char *decode2 = NULL;
	811	char *decode3 = NULL;
	812
	813	/* -- PREFILTERS BEGIN -- */
	814
	815	/* Hexadecimal 8-Bit Encodings */
	816
	817	if (block->encoding == EN_8BIT) {
	818	decode2 = _ds_decode_hex8bit(decode);
	819	} else {
	820	decode2 = strdup(decode);
	821	}
	822
	823	/* HTML-Specific Filters */
	824
	825	if (decode2) {
	826	if (block->media_subtype == MST_HTML) {
	827	decode3 = _ds_strip_html(decode2);
	828	} else {
	829	decode3 = strdup(decode2);
	830	}
	831	free(decode2);
	832	}
	833
	834	/* -- PREFILTERS END -- */
	835
	836	if (decode3) {
	837	buffer_cat (body, decode3);
	838	free(decode3);
	839	}
	840
	841	/* If we've decoded the body, save the original copy */
	842	if (decode != block->body->data)
	843	{
	844	block->original_signed_body = block->body;
	845	block->body = buffer_create (decode);
	846	free (decode);
	847	}
	848	}
	849	}
	850	}
	851	#ifdef VERBOSE
	852	LOGDEBUG ("Getting next message component");
	853	#endif
	854	node_nt = c_nt_next (CTX->message->components, &c_nt);
	855	i++;
	856	} /* while (node_nt != NULL) */
	857
	858	if (header->data == NULL)
	859	buffer_cat (header, " ");
	860
	861	if (body->data == NULL)
	862	buffer_cat (body, " ");
	863
	864	return 0;
	865	}
	866
	867	int _ds_url_tokenize(ds_diction_t diction, char body, const char key)
	868	{
	869	char token, url_ptr, url_token, ptr;
	870	char combined_token[256];
	871	unsigned long long crc;
	872	int key_len = strlen(key);
	873
	874	#ifdef VERBOSE
	875	LOGDEBUG("scanning for urls: %s\n", key);
	876	#endif
	877	if (!body)
	878	return EINVAL;
	879	url_ptr = body;
	880
	881	token = strcasestr(url_ptr, key);
	882	while (token != NULL)
	883	{
	884	int i = 0, old;
	885
	886	while(token[i]
	887	&& token[i] > 32
	888	&& token[i] != '>'
	889	&& ((token[i] != '\"' && token[i] != '\'') \|\| i <= key_len))
	890	i++;
	891	old = token[i];
	892	token[i] = 0; /* parse in place */
	893
	894	/* Tokenize URL */
	895	url_token = strtok_r (token, DELIMITERS, &ptr);
	896	while (url_token != NULL)
	897	{
	898	snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token);
	899	crc = _ds_getcrc64 (combined_token);
	900	ds_diction_touch(diction, crc, combined_token, 0);
	901	url_token = strtok_r (NULL, DELIMITERS, &ptr);
	902	}
	903	memset (token, 32, i);
	904	token[i] = old;
	905	url_ptr = token + i;
	906	token = strcasestr(url_ptr, key);
	907	}
	908	return 0;
	909	}
	910
	911	/* Truncate tokens with EOT delimiters */
	912	char * _ds_truncate_token(const char *token) {
	913	char *tweaked;
	914	int i;
	915
	916	if (token == NULL)
	917	return NULL;
	918
	919	tweaked = strdup(token);
	920
	921	if (tweaked == NULL)
	922	return NULL;
	923
	924	i = strlen(tweaked);
	925	while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) {
	926	tweaked[i-1] = 0;
	927	i--;
	928	}
	929
	930	return tweaked;
	931	}
	932
	933	/*
	934	* _ds_spbh_clear
	935	*
	936	* DESCRIPTION
	937	* Clears the SBPH stack
	938	*
	939	* Clears and frees all of the tokens in the SBPH stack. Used when a
	940	* boundary has been crossed (such as a new message header) where
	941	* tokens from the previous boundary are no longer useful.
	942	*/
	943
	944	void _ds_sparse_clear(char **previous_tokens) {
	945	int i;
	946	for(i=0;i<SPARSE_WINDOW_SIZE;i++)
	947	previous_tokens[i] = NULL;
	948	return;
	949	}
	950
	951	/*
	952	* _ds_generate_bitpattern
	953	*
	954	* DESCRIPTION
	955	* Generates a sparse bitpattern for SPARSE_WINDOW_SIZE
	956	*
	957	* This pattern is then used to create token patterns when using SBPH or OSB
	958	*
	959	*/
	960
	961	char *_ds_generate_bitpattern(int breadth) {
	962	char *bitpattern;
	963	u_int32_t mask;
	964	unsigned long exp;
	965	int i;
	966
	967	bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth);
	968
	969	for(mask=0;mask<(u_int32_t)breadth;mask++) {
	970	for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
	971	exp = (i) ? _ds_pow2(i) : 1;
	972	/* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */
	973	if (mask & exp)
	974	{
	975	bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1;
	976	}
	977	else
	978	{
	979	bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0;
	980	}
	981	}
	982	}
	983
	984	return bitpattern;
	985	}
	986

Note: See TracBrowser for help on using the repository browser.

Download in other formats: