1 | /* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */ |
---|
2 | |
---|
3 | /* |
---|
4 | DSPAM |
---|
5 | COPYRIGHT (C) 2002-2012 DSPAM PROJECT |
---|
6 | |
---|
7 | This program is free software: you can redistribute it and/or modify |
---|
8 | it under the terms of the GNU Affero General Public License as |
---|
9 | published by the Free Software Foundation, either version 3 of the |
---|
10 | License, or (at your option) any later version. |
---|
11 | |
---|
12 | This program is distributed in the hope that it will be useful, |
---|
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
15 | GNU Affero General Public License for more details. |
---|
16 | |
---|
17 | You should have received a copy of the GNU Affero General Public License |
---|
18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
19 | |
---|
20 | */ |
---|
21 | |
---|
22 | /* |
---|
23 | * tokenizer.c - tokenizer functions |
---|
24 | * |
---|
25 | * DESCRIPTION |
---|
26 | * The tokenizer subroutines are responsible for decomposing a message into |
---|
27 | * its colloquial components. All components are stored collectively in |
---|
28 | * a diction object, passed into the function. |
---|
29 | * |
---|
30 | */ |
---|
31 | |
---|
32 | #ifdef HAVE_CONFIG_H |
---|
33 | #include <auto-config.h> |
---|
34 | #endif |
---|
35 | |
---|
36 | #include <stdio.h> |
---|
37 | #include <stdlib.h> |
---|
38 | #include <math.h> |
---|
39 | #include <ctype.h> |
---|
40 | #include <errno.h> |
---|
41 | #include <string.h> |
---|
42 | #ifdef HAVE_UNISTD_H |
---|
43 | #include <unistd.h> |
---|
44 | #endif |
---|
45 | #include <sys/types.h> |
---|
46 | #include <sys/stat.h> |
---|
47 | |
---|
48 | #ifdef TIME_WITH_SYS_TIME |
---|
49 | # include <sys/time.h> |
---|
50 | # include <time.h> |
---|
51 | #else |
---|
52 | # ifdef HAVE_SYS_TIME_H |
---|
53 | # include <sys/time.h> |
---|
54 | # else |
---|
55 | # include <time.h> |
---|
56 | # endif |
---|
57 | #endif |
---|
58 | |
---|
59 | #include "config.h" |
---|
60 | #include "tokenizer.h" |
---|
61 | #include "util.h" |
---|
62 | #include "libdspam.h" |
---|
63 | #include "language.h" |
---|
64 | |
---|
65 | /* |
---|
66 | * _ds_tokenize() - tokenize the message |
---|
67 | * |
---|
68 | * DESCRIPTION |
---|
69 | * tokenizes the supplied message |
---|
70 | * |
---|
71 | * INPUT ARGUMENTS |
---|
72 | * DSPAM_CTX *CTX pointer to context |
---|
73 | * char *header pointer to message header |
---|
74 | * char *body pointer to message body |
---|
75 | * ds_diction_t diction to store components |
---|
76 | * |
---|
77 | * RETURN VALUES |
---|
78 | * standard errors on failure |
---|
79 | * zero if successful |
---|
80 | * |
---|
81 | */ |
---|
82 | |
---|
83 | int |
---|
84 | _ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction) |
---|
85 | { |
---|
86 | if (diction == NULL) |
---|
87 | return EINVAL; |
---|
88 | |
---|
89 | if (CTX->tokenizer == DSZ_SBPH || CTX->tokenizer == DSZ_OSB) |
---|
90 | return _ds_tokenize_sparse(CTX, headers, body, diction); |
---|
91 | else |
---|
92 | return _ds_tokenize_ngram(CTX, headers, body, diction); |
---|
93 | } |
---|
94 | |
---|
95 | int _ds_tokenize_ngram( |
---|
96 | DSPAM_CTX *CTX, |
---|
97 | char *headers, |
---|
98 | char *body, |
---|
99 | ds_diction_t diction) |
---|
100 | { |
---|
101 | char *token; /* current token */ |
---|
102 | char *previous_token = NULL; /* used for bigrams (chained tokens) */ |
---|
103 | char *line = NULL; /* header broken up into lines */ |
---|
104 | char *ptrptr; |
---|
105 | char heading[128]; /* current heading */ |
---|
106 | int l, tokenizer = CTX->tokenizer; |
---|
107 | |
---|
108 | struct nt *header = NULL; |
---|
109 | struct nt_node *node_nt; |
---|
110 | struct nt_c c_nt; |
---|
111 | |
---|
112 | /* Tokenize URLs in message */ |
---|
113 | |
---|
114 | if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) { |
---|
115 | _ds_url_tokenize(diction, body, "http://"); |
---|
116 | _ds_url_tokenize(diction, body, "www."); |
---|
117 | _ds_url_tokenize(diction, body, "href="); |
---|
118 | } |
---|
119 | |
---|
120 | /* |
---|
121 | * Header Tokenization |
---|
122 | */ |
---|
123 | |
---|
124 | header = nt_create (NT_CHAR); |
---|
125 | if (header == NULL) |
---|
126 | { |
---|
127 | LOG (LOG_CRIT, ERR_MEM_ALLOC); |
---|
128 | return EUNKNOWN; |
---|
129 | } |
---|
130 | |
---|
131 | line = strtok_r (headers, "\n", &ptrptr); |
---|
132 | while (line) { |
---|
133 | nt_add (header, line); |
---|
134 | line = strtok_r (NULL, "\n", &ptrptr); |
---|
135 | } |
---|
136 | |
---|
137 | node_nt = c_nt_first (header, &c_nt); |
---|
138 | heading[0] = 0; |
---|
139 | while (node_nt) { |
---|
140 | int multiline; |
---|
141 | |
---|
142 | #ifdef VERBOSE |
---|
143 | LOGDEBUG("processing line: %s", node_nt->ptr); |
---|
144 | #endif |
---|
145 | |
---|
146 | line = node_nt->ptr; |
---|
147 | token = strtok_r (line, ":", &ptrptr); |
---|
148 | if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) |
---|
149 | { |
---|
150 | multiline = 0; |
---|
151 | strlcpy (heading, token, 128); |
---|
152 | previous_token = NULL; |
---|
153 | } else { |
---|
154 | multiline = 1; |
---|
155 | } |
---|
156 | |
---|
157 | #ifdef VERBOSE |
---|
158 | LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); |
---|
159 | #endif |
---|
160 | |
---|
161 | if (CTX->flags & DSF_WHITELIST) { |
---|
162 | /* Use the entire From: line for auto-whitelisting */ |
---|
163 | |
---|
164 | if (!strcmp(heading, "From")) { |
---|
165 | char wl[256]; |
---|
166 | char *fromline = line + 5; |
---|
167 | unsigned long long whitelist_token; |
---|
168 | |
---|
169 | if (fromline[0] == 32) |
---|
170 | fromline++; |
---|
171 | snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); |
---|
172 | whitelist_token = _ds_getcrc64(wl); |
---|
173 | ds_diction_touch(diction, whitelist_token, wl, 0); |
---|
174 | diction->whitelist_token = whitelist_token; |
---|
175 | } |
---|
176 | } |
---|
177 | |
---|
178 | /* Received headers use a different set of delimiters to preserve things |
---|
179 | like ip addresses */ |
---|
180 | |
---|
181 | token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr); |
---|
182 | |
---|
183 | while (token) |
---|
184 | { |
---|
185 | l = strlen(token); |
---|
186 | |
---|
187 | if (l >= 1 && l < 50) |
---|
188 | { |
---|
189 | #ifdef VERBOSE |
---|
190 | LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); |
---|
191 | #endif |
---|
192 | |
---|
193 | /* Process "current" token */ |
---|
194 | if (!_ds_process_header_token |
---|
195 | (CTX, token, previous_token, diction, heading) && |
---|
196 | (tokenizer == DSZ_CHAIN)) |
---|
197 | { |
---|
198 | previous_token = token; |
---|
199 | } |
---|
200 | } |
---|
201 | |
---|
202 | token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr); |
---|
203 | } |
---|
204 | |
---|
205 | previous_token = NULL; |
---|
206 | node_nt = c_nt_next (header, &c_nt); |
---|
207 | } |
---|
208 | |
---|
209 | nt_destroy (header); |
---|
210 | |
---|
211 | /* |
---|
212 | * Body Tokenization |
---|
213 | */ |
---|
214 | |
---|
215 | #ifdef VERBOSE |
---|
216 | LOGDEBUG("parsing message body"); |
---|
217 | #endif |
---|
218 | |
---|
219 | token = strtok_r (body, DELIMITERS, &ptrptr); |
---|
220 | while (token != NULL) |
---|
221 | { |
---|
222 | l = strlen (token); |
---|
223 | if (l >= 1 && l < 50) |
---|
224 | { |
---|
225 | #ifdef VERBOSE |
---|
226 | LOGDEBUG ("Processing body token '%s'", token); |
---|
227 | #endif |
---|
228 | |
---|
229 | /* Process "current" token */ |
---|
230 | if ( !_ds_process_body_token(CTX, token, previous_token, diction) |
---|
231 | && tokenizer == DSZ_CHAIN) |
---|
232 | { |
---|
233 | previous_token = token; |
---|
234 | } |
---|
235 | } |
---|
236 | token = strtok_r (NULL, DELIMITERS, &ptrptr); |
---|
237 | } |
---|
238 | |
---|
239 | #ifdef VERBOSE |
---|
240 | LOGDEBUG("Finished tokenizing (ngram) message"); |
---|
241 | #endif |
---|
242 | |
---|
243 | /* Final token reassembly (anything left in the buffer) */ |
---|
244 | |
---|
245 | return 0; |
---|
246 | } |
---|
247 | |
---|
248 | int _ds_tokenize_sparse( |
---|
249 | DSPAM_CTX *CTX, |
---|
250 | char *headers, |
---|
251 | char *body, |
---|
252 | ds_diction_t diction) |
---|
253 | { |
---|
254 | int i; |
---|
255 | char *token; /* current token */ |
---|
256 | char *previous_tokens[SPARSE_WINDOW_SIZE]; /* sparse chain */ |
---|
257 | |
---|
258 | char *line = NULL; /* header broken up into lines */ |
---|
259 | char *ptrptr; |
---|
260 | char *bitpattern; |
---|
261 | |
---|
262 | char heading[128]; /* current heading */ |
---|
263 | int l; |
---|
264 | |
---|
265 | struct nt *header = NULL; |
---|
266 | struct nt_node *node_nt; |
---|
267 | struct nt_c c_nt; |
---|
268 | |
---|
269 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) |
---|
270 | previous_tokens[i] = NULL; |
---|
271 | |
---|
272 | bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE)); |
---|
273 | |
---|
274 | /* Tokenize URLs in message */ |
---|
275 | |
---|
276 | if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) |
---|
277 | { |
---|
278 | _ds_url_tokenize(diction, body, "http://"); |
---|
279 | _ds_url_tokenize(diction, body, "www."); |
---|
280 | _ds_url_tokenize(diction, body, "href="); |
---|
281 | } |
---|
282 | |
---|
283 | /* |
---|
284 | * Header Tokenization |
---|
285 | */ |
---|
286 | |
---|
287 | header = nt_create (NT_CHAR); |
---|
288 | if (header == NULL) |
---|
289 | { |
---|
290 | LOG (LOG_CRIT, ERR_MEM_ALLOC); |
---|
291 | free(bitpattern); |
---|
292 | return EUNKNOWN; |
---|
293 | } |
---|
294 | |
---|
295 | line = strtok_r (headers, "\n", &ptrptr); |
---|
296 | while (line) { |
---|
297 | nt_add (header, line); |
---|
298 | line = strtok_r (NULL, "\n", &ptrptr); |
---|
299 | } |
---|
300 | |
---|
301 | node_nt = c_nt_first (header, &c_nt); |
---|
302 | heading[0] = 0; |
---|
303 | while (node_nt) { |
---|
304 | int multiline; |
---|
305 | |
---|
306 | #ifdef VERBOSE |
---|
307 | LOGDEBUG("processing line: %s", node_nt->ptr); |
---|
308 | #endif |
---|
309 | |
---|
310 | _ds_sparse_clear(previous_tokens); |
---|
311 | |
---|
312 | line = node_nt->ptr; |
---|
313 | token = strtok_r (line, ":", &ptrptr); |
---|
314 | if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) |
---|
315 | { |
---|
316 | multiline = 0; |
---|
317 | strlcpy (heading, token, 128); |
---|
318 | _ds_sparse_clear(previous_tokens); |
---|
319 | } else { |
---|
320 | multiline = 1; |
---|
321 | } |
---|
322 | |
---|
323 | #ifdef VERBOSE |
---|
324 | LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); |
---|
325 | #endif |
---|
326 | |
---|
327 | if (CTX->flags & DSF_WHITELIST) { |
---|
328 | /* Use the entire From: line for auto-whitelisting */ |
---|
329 | |
---|
330 | if (!strcmp(heading, "From")) { |
---|
331 | char wl[256]; |
---|
332 | char *fromline = line + 5; |
---|
333 | unsigned long long whitelist_token; |
---|
334 | |
---|
335 | if (fromline[0] == 32) |
---|
336 | fromline++; |
---|
337 | snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); |
---|
338 | whitelist_token = _ds_getcrc64(wl); |
---|
339 | ds_diction_touch(diction, whitelist_token, wl, 0); |
---|
340 | diction->whitelist_token = whitelist_token; |
---|
341 | } |
---|
342 | } |
---|
343 | |
---|
344 | /* Received headers use a different set of delimiters to preserve things |
---|
345 | like ip addresses */ |
---|
346 | |
---|
347 | token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr); |
---|
348 | |
---|
349 | while (token) |
---|
350 | { |
---|
351 | l = strlen(token); |
---|
352 | |
---|
353 | if (l > 0 && l < 50) |
---|
354 | { |
---|
355 | #ifdef VERBOSE |
---|
356 | LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); |
---|
357 | #endif |
---|
358 | _ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern); |
---|
359 | } |
---|
360 | |
---|
361 | token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr); |
---|
362 | } |
---|
363 | |
---|
364 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
365 | _ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern); |
---|
366 | } |
---|
367 | |
---|
368 | _ds_sparse_clear(previous_tokens); |
---|
369 | node_nt = c_nt_next (header, &c_nt); |
---|
370 | } |
---|
371 | nt_destroy (header); |
---|
372 | |
---|
373 | /* |
---|
374 | * Body Tokenization |
---|
375 | */ |
---|
376 | |
---|
377 | #ifdef VERBOSE |
---|
378 | LOGDEBUG("parsing message body"); |
---|
379 | #endif |
---|
380 | |
---|
381 | token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr); |
---|
382 | while (token != NULL) |
---|
383 | { |
---|
384 | l = strlen (token); |
---|
385 | if (l > 0 && l < 50) |
---|
386 | { |
---|
387 | #ifdef VERBOSE |
---|
388 | LOGDEBUG ("Processing body token '%s'", token); |
---|
389 | #endif |
---|
390 | |
---|
391 | /* Process "current" token */ |
---|
392 | _ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern); |
---|
393 | } |
---|
394 | token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr); |
---|
395 | } |
---|
396 | |
---|
397 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
398 | _ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern); |
---|
399 | } |
---|
400 | |
---|
401 | _ds_sparse_clear(previous_tokens); |
---|
402 | |
---|
403 | free(bitpattern); |
---|
404 | |
---|
405 | #ifdef VERBOSE |
---|
406 | LOGDEBUG("Finished tokenizing (sparse) message"); |
---|
407 | #endif |
---|
408 | |
---|
409 | return 0; |
---|
410 | } |
---|
411 | |
---|
412 | /* |
---|
413 | * _ds_{process,map}_{header,body}_token() |
---|
414 | * |
---|
415 | * DESCRIPTION |
---|
416 | * Token processing and mapping functions |
---|
417 | * _ds_process_header_token |
---|
418 | * _ds_process_body_token |
---|
419 | * _ds_map_header_token |
---|
420 | * _ds_map_body_token |
---|
421 | * |
---|
422 | * These functions are responsible for converting the input words into |
---|
423 | * full blown tokens with CRCs, probabilities, and producing variants |
---|
424 | * based on the tokenizer approach applied. |
---|
425 | */ |
---|
426 | |
---|
427 | int |
---|
428 | _ds_process_header_token (DSPAM_CTX * CTX, char *token, |
---|
429 | const char *previous_token, ds_diction_t diction, |
---|
430 | const char *heading) |
---|
431 | { |
---|
432 | char combined_token[256]; |
---|
433 | unsigned long long crc; |
---|
434 | char *tweaked_token; |
---|
435 | |
---|
436 | if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading)) |
---|
437 | return 0; |
---|
438 | |
---|
439 | if (!strncmp(heading, "X-DSPAM-", 8)) |
---|
440 | return 0; |
---|
441 | |
---|
442 | /* This is where we used to ignore certain headings */ |
---|
443 | |
---|
444 | if (heading[0] != 0) |
---|
445 | snprintf (combined_token, sizeof (combined_token), |
---|
446 | "%s*%s", heading, token); |
---|
447 | else |
---|
448 | strlcpy (combined_token, token, sizeof (combined_token)); |
---|
449 | |
---|
450 | tweaked_token = _ds_truncate_token(token); |
---|
451 | if (tweaked_token == NULL) |
---|
452 | return EUNKNOWN; |
---|
453 | |
---|
454 | snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token); |
---|
455 | |
---|
456 | crc = _ds_getcrc64 (combined_token); |
---|
457 | #ifdef VERBOSE |
---|
458 | LOGDEBUG ("Token Hit: '%s'", combined_token); |
---|
459 | #endif |
---|
460 | ds_diction_touch(diction, crc, combined_token, 0); |
---|
461 | |
---|
462 | if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL) |
---|
463 | { |
---|
464 | char *tweaked_previous; |
---|
465 | |
---|
466 | tweaked_previous = _ds_truncate_token(previous_token); |
---|
467 | if (tweaked_previous == NULL) { |
---|
468 | free(tweaked_token); |
---|
469 | return EUNKNOWN; |
---|
470 | } |
---|
471 | |
---|
472 | snprintf (combined_token, sizeof (combined_token), |
---|
473 | "%s*%s+%s", heading, tweaked_previous, tweaked_token); |
---|
474 | crc = _ds_getcrc64 (combined_token); |
---|
475 | |
---|
476 | ds_diction_touch(diction, crc, combined_token, DSD_CHAINED); |
---|
477 | free(tweaked_previous); |
---|
478 | } |
---|
479 | |
---|
480 | free(tweaked_token); |
---|
481 | return 0; |
---|
482 | } |
---|
483 | |
---|
484 | int |
---|
485 | _ds_process_body_token (DSPAM_CTX * CTX, char *token, |
---|
486 | const char *previous_token, ds_diction_t diction) |
---|
487 | { |
---|
488 | char combined_token[256]; |
---|
489 | unsigned long long crc; |
---|
490 | char *tweaked_token; |
---|
491 | |
---|
492 | tweaked_token = _ds_truncate_token(token); |
---|
493 | if (tweaked_token == NULL) |
---|
494 | return EUNKNOWN; |
---|
495 | |
---|
496 | crc = _ds_getcrc64 (tweaked_token); |
---|
497 | |
---|
498 | ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT); |
---|
499 | |
---|
500 | if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL) |
---|
501 | { |
---|
502 | char *tweaked_previous = _ds_truncate_token(previous_token); |
---|
503 | if (tweaked_previous == NULL) { |
---|
504 | free(tweaked_token); |
---|
505 | return EUNKNOWN; |
---|
506 | } |
---|
507 | |
---|
508 | snprintf (combined_token, sizeof (combined_token), "%s+%s", |
---|
509 | tweaked_previous, tweaked_token); |
---|
510 | crc = _ds_getcrc64 (combined_token); |
---|
511 | |
---|
512 | ds_diction_touch(diction, crc, combined_token, DSD_CHAINED | DSD_CONTEXT); |
---|
513 | free(tweaked_previous); |
---|
514 | } |
---|
515 | free(tweaked_token); |
---|
516 | |
---|
517 | return 0; |
---|
518 | } |
---|
519 | |
---|
520 | |
---|
521 | int |
---|
522 | _ds_map_header_token (DSPAM_CTX * CTX, char *token, |
---|
523 | char **previous_tokens, ds_diction_t diction, |
---|
524 | const char *heading, const char *bitpattern) |
---|
525 | { |
---|
526 | int i, t, keylen, breadth; |
---|
527 | u_int32_t mask; |
---|
528 | unsigned long long crc; |
---|
529 | char key[256]; |
---|
530 | int active = 0, top, tokenizer = CTX->tokenizer; |
---|
531 | |
---|
532 | if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading)) |
---|
533 | return 0; |
---|
534 | |
---|
535 | if (!strncmp(heading, "X-DSPAM-", 8)) |
---|
536 | return 0; |
---|
537 | |
---|
538 | /* Shift all previous tokens up */ |
---|
539 | for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) { |
---|
540 | previous_tokens[i] = previous_tokens[i+1]; |
---|
541 | if (previous_tokens[i]) |
---|
542 | active++; |
---|
543 | } |
---|
544 | |
---|
545 | previous_tokens[SPARSE_WINDOW_SIZE-1] = token; |
---|
546 | |
---|
547 | if (token) |
---|
548 | active++; |
---|
549 | |
---|
550 | breadth = _ds_pow2(active); |
---|
551 | |
---|
552 | /* Iterate and generate all keys necessary */ |
---|
553 | for (mask=0; mask < (u_int32_t)breadth; mask++) { |
---|
554 | int terms = 0; |
---|
555 | |
---|
556 | key[0] = 0; |
---|
557 | keylen = 0; |
---|
558 | t = 0; |
---|
559 | top = 1; |
---|
560 | |
---|
561 | /* Each Bit */ |
---|
562 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
563 | |
---|
564 | if (t) { |
---|
565 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
566 | key[keylen] = '+'; |
---|
567 | key[++keylen] = 0; |
---|
568 | } |
---|
569 | } |
---|
570 | |
---|
571 | if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) { |
---|
572 | if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) { |
---|
573 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
574 | key[keylen] = '#'; |
---|
575 | key[++keylen] = 0; |
---|
576 | } |
---|
577 | } |
---|
578 | else |
---|
579 | { |
---|
580 | int tl = strlen(previous_tokens[i]); |
---|
581 | if ((size_t)(keylen + tl) < (sizeof(key)-1)) { |
---|
582 | strcpy(key+keylen, previous_tokens[i]); |
---|
583 | keylen += tl; |
---|
584 | } |
---|
585 | terms++; |
---|
586 | } |
---|
587 | } else { |
---|
588 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
589 | key[keylen] = '#'; |
---|
590 | key[++keylen] = 0; |
---|
591 | } |
---|
592 | } |
---|
593 | t++; |
---|
594 | } |
---|
595 | |
---|
596 | /* If the bucket has at least 1 literal, hit it */ |
---|
597 | if ((tokenizer == DSZ_SBPH && terms != 0) || |
---|
598 | (tokenizer == DSZ_OSB && terms == 2)) |
---|
599 | { |
---|
600 | char hkey[256]; |
---|
601 | char *k = key; |
---|
602 | while(keylen>2 && !strcmp((key+keylen)-2, "+#")) { |
---|
603 | key[keylen-2] = 0; |
---|
604 | keylen -=2; |
---|
605 | } |
---|
606 | while(!strncmp(k, "#+", 2)) { |
---|
607 | top = 0; |
---|
608 | k+=2; |
---|
609 | keylen -= 2; |
---|
610 | } |
---|
611 | |
---|
612 | if (top) { |
---|
613 | snprintf(hkey, sizeof(hkey), "%s*%s", heading, k); |
---|
614 | crc = _ds_getcrc64(hkey); |
---|
615 | ds_diction_touch(diction, crc, hkey, DSD_CONTEXT); |
---|
616 | } |
---|
617 | } |
---|
618 | } |
---|
619 | |
---|
620 | return 0; |
---|
621 | } |
---|
622 | |
---|
623 | int |
---|
624 | _ds_map_body_token ( |
---|
625 | DSPAM_CTX * CTX, |
---|
626 | char *token, |
---|
627 | char **previous_tokens, |
---|
628 | ds_diction_t diction, |
---|
629 | const char *bitpattern) |
---|
630 | { |
---|
631 | int i, t, keylen, breadth; |
---|
632 | int top, tokenizer = CTX->tokenizer; |
---|
633 | unsigned long long crc; |
---|
634 | char key[256]; |
---|
635 | int active = 0; |
---|
636 | u_int32_t mask; |
---|
637 | |
---|
638 | /* Shift all previous tokens up */ |
---|
639 | for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) { |
---|
640 | previous_tokens[i] = previous_tokens[i+1]; |
---|
641 | if (previous_tokens[i]) |
---|
642 | active++; |
---|
643 | } |
---|
644 | |
---|
645 | previous_tokens[SPARSE_WINDOW_SIZE-1] = token; |
---|
646 | if (token) |
---|
647 | active++; |
---|
648 | |
---|
649 | breadth = _ds_pow2(active); |
---|
650 | |
---|
651 | /* Iterate and generate all keys necessary */ |
---|
652 | |
---|
653 | for(mask=0;mask < (u_int32_t)breadth;mask++) { |
---|
654 | int terms = 0; |
---|
655 | t = 0; |
---|
656 | |
---|
657 | key[0] = 0; |
---|
658 | keylen = 0; |
---|
659 | top = 1; |
---|
660 | |
---|
661 | /* Each Bit */ |
---|
662 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
663 | if (t) { |
---|
664 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
665 | key[keylen] = '+'; |
---|
666 | key[++keylen] = 0; |
---|
667 | } |
---|
668 | } |
---|
669 | if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) { |
---|
670 | if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) { |
---|
671 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
672 | key[keylen] = '#'; |
---|
673 | key[++keylen] = 0; |
---|
674 | } |
---|
675 | } |
---|
676 | else |
---|
677 | { |
---|
678 | int tl = strlen(previous_tokens[i]); |
---|
679 | if ((size_t)(keylen + tl) < (sizeof(key)-1)) { |
---|
680 | strcpy(key+keylen, previous_tokens[i]); |
---|
681 | keylen += tl; |
---|
682 | } |
---|
683 | terms++; |
---|
684 | } |
---|
685 | } else { |
---|
686 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
687 | key[keylen] = '#'; |
---|
688 | key[++keylen] = 0; |
---|
689 | } |
---|
690 | } |
---|
691 | t++; |
---|
692 | } |
---|
693 | |
---|
694 | /* If the bucket has at least 1 literal, hit it */ |
---|
695 | if ((tokenizer == DSZ_SBPH && terms != 0) || |
---|
696 | (tokenizer == DSZ_OSB && terms == 2)) |
---|
697 | { |
---|
698 | char *k = key; |
---|
699 | while(keylen>2 && !strcmp((key+keylen)-2, "+#")) { |
---|
700 | key[keylen-2] = 0; |
---|
701 | keylen -=2; |
---|
702 | } |
---|
703 | while(!strncmp(k, "#+", 2)) { |
---|
704 | top = 0; |
---|
705 | k+=2; |
---|
706 | keylen -=2; |
---|
707 | } |
---|
708 | |
---|
709 | if (top) { |
---|
710 | crc = _ds_getcrc64(k); |
---|
711 | ds_diction_touch(diction, crc, k, DSD_CONTEXT); |
---|
712 | } |
---|
713 | } |
---|
714 | } |
---|
715 | |
---|
716 | return 0; |
---|
717 | } |
---|
718 | |
---|
719 | /* |
---|
720 | * _ds_degenerate_message() |
---|
721 | * |
---|
722 | * DESCRIPTION |
---|
723 | * Degenerate the message into headers, body and tokenizable pieces |
---|
724 | * |
---|
725 | * This function is responsible for analyzing the actualized message and |
---|
726 | * degenerating it into only the components which are tokenizable. This |
---|
727 | * process effectively eliminates much HTML noise, special symbols, or |
---|
728 | * other non-tokenizable/non-desirable components. What is left is the |
---|
729 | * bulk of the message and only desired tags, URLs, and other data. |
---|
730 | * |
---|
731 | * INPUT ARGUMENTS |
---|
732 | * header pointer to buffer containing headers |
---|
733 | * body pointer to buffer containing message body |
---|
734 | */ |
---|
735 | |
---|
736 | int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body) |
---|
737 | { |
---|
738 | char *decode = NULL; |
---|
739 | struct nt_node *node_nt, *node_header; |
---|
740 | struct nt_c c_nt, c_nt2; |
---|
741 | int i = 0; |
---|
742 | char heading[1024]; |
---|
743 | |
---|
744 | if (! CTX->message) |
---|
745 | { |
---|
746 | LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL"); |
---|
747 | return EUNKNOWN; |
---|
748 | } |
---|
749 | |
---|
750 | /* Iterate through each component and create large header/body buffers */ |
---|
751 | |
---|
752 | node_nt = c_nt_first (CTX->message->components, &c_nt); |
---|
753 | while (node_nt != NULL) |
---|
754 | { |
---|
755 | struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr; |
---|
756 | |
---|
757 | #ifdef VERBOSE |
---|
758 | LOGDEBUG ("Processing component %d", i); |
---|
759 | #endif |
---|
760 | |
---|
761 | if (! block->headers || ! block->headers->items) |
---|
762 | { |
---|
763 | #ifdef VERBOSE |
---|
764 | LOGDEBUG (" : End of Message Identifier"); |
---|
765 | #endif |
---|
766 | } |
---|
767 | |
---|
768 | else |
---|
769 | { |
---|
770 | struct _ds_header_field *current_header; |
---|
771 | |
---|
772 | /* Accumulate the headers */ |
---|
773 | node_header = c_nt_first (block->headers, &c_nt2); |
---|
774 | while (node_header != NULL) |
---|
775 | { |
---|
776 | current_header = (struct _ds_header_field *) node_header->ptr; |
---|
777 | snprintf (heading, sizeof (heading), |
---|
778 | "%s: %s\n", current_header->heading, |
---|
779 | current_header->data); |
---|
780 | buffer_cat (header, heading); |
---|
781 | node_header = c_nt_next (block->headers, &c_nt2); |
---|
782 | } |
---|
783 | |
---|
784 | decode = block->body->data; |
---|
785 | |
---|
786 | if (block->media_type == MT_TEXT || |
---|
787 | block->media_type == MT_MESSAGE || |
---|
788 | block->media_type == MT_UNKNOWN || |
---|
789 | (block->media_type == MT_MULTIPART && !i)) |
---|
790 | { |
---|
791 | /* Accumulate the bodies, skip attachments */ |
---|
792 | |
---|
793 | if ( |
---|
794 | ( block->encoding == EN_BASE64 |
---|
795 | || block->encoding == EN_QUOTED_PRINTABLE) |
---|
796 | && ! block->original_signed_body) |
---|
797 | { |
---|
798 | if (block->content_disposition != PCD_ATTACHMENT) |
---|
799 | { |
---|
800 | LOGDEBUG ("decoding message block from encoding type %d", |
---|
801 | block->encoding); |
---|
802 | decode = _ds_decode_block (block); |
---|
803 | } |
---|
804 | } |
---|
805 | |
---|
806 | /* We found a tokenizable body component, add prefilters */ |
---|
807 | |
---|
808 | if (decode) |
---|
809 | { |
---|
810 | char *decode2 = NULL; |
---|
811 | char *decode3 = NULL; |
---|
812 | |
---|
813 | /* -- PREFILTERS BEGIN -- */ |
---|
814 | |
---|
815 | /* Hexadecimal 8-Bit Encodings */ |
---|
816 | |
---|
817 | if (block->encoding == EN_8BIT) { |
---|
818 | decode2 = _ds_decode_hex8bit(decode); |
---|
819 | } else { |
---|
820 | decode2 = strdup(decode); |
---|
821 | } |
---|
822 | |
---|
823 | /* HTML-Specific Filters */ |
---|
824 | |
---|
825 | if (decode2) { |
---|
826 | if (block->media_subtype == MST_HTML) { |
---|
827 | decode3 = _ds_strip_html(decode2); |
---|
828 | } else { |
---|
829 | decode3 = strdup(decode2); |
---|
830 | } |
---|
831 | free(decode2); |
---|
832 | } |
---|
833 | |
---|
834 | /* -- PREFILTERS END -- */ |
---|
835 | |
---|
836 | if (decode3) { |
---|
837 | buffer_cat (body, decode3); |
---|
838 | free(decode3); |
---|
839 | } |
---|
840 | |
---|
841 | /* If we've decoded the body, save the original copy */ |
---|
842 | if (decode != block->body->data) |
---|
843 | { |
---|
844 | block->original_signed_body = block->body; |
---|
845 | block->body = buffer_create (decode); |
---|
846 | free (decode); |
---|
847 | } |
---|
848 | } |
---|
849 | } |
---|
850 | } |
---|
851 | #ifdef VERBOSE |
---|
852 | LOGDEBUG ("Getting next message component"); |
---|
853 | #endif |
---|
854 | node_nt = c_nt_next (CTX->message->components, &c_nt); |
---|
855 | i++; |
---|
856 | } /* while (node_nt != NULL) */ |
---|
857 | |
---|
858 | if (header->data == NULL) |
---|
859 | buffer_cat (header, " "); |
---|
860 | |
---|
861 | if (body->data == NULL) |
---|
862 | buffer_cat (body, " "); |
---|
863 | |
---|
864 | return 0; |
---|
865 | } |
---|
866 | |
---|
867 | int _ds_url_tokenize(ds_diction_t diction, char *body, const char *key) |
---|
868 | { |
---|
869 | char *token, *url_ptr, *url_token, *ptr; |
---|
870 | char combined_token[256]; |
---|
871 | unsigned long long crc; |
---|
872 | int key_len = strlen(key); |
---|
873 | |
---|
874 | #ifdef VERBOSE |
---|
875 | LOGDEBUG("scanning for urls: %s\n", key); |
---|
876 | #endif |
---|
877 | if (!body) |
---|
878 | return EINVAL; |
---|
879 | url_ptr = body; |
---|
880 | |
---|
881 | token = strcasestr(url_ptr, key); |
---|
882 | while (token != NULL) |
---|
883 | { |
---|
884 | int i = 0, old; |
---|
885 | |
---|
886 | while(token[i] |
---|
887 | && token[i] > 32 |
---|
888 | && token[i] != '>' |
---|
889 | && ((token[i] != '\"' && token[i] != '\'') || i <= key_len)) |
---|
890 | i++; |
---|
891 | old = token[i]; |
---|
892 | token[i] = 0; /* parse in place */ |
---|
893 | |
---|
894 | /* Tokenize URL */ |
---|
895 | url_token = strtok_r (token, DELIMITERS, &ptr); |
---|
896 | while (url_token != NULL) |
---|
897 | { |
---|
898 | snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token); |
---|
899 | crc = _ds_getcrc64 (combined_token); |
---|
900 | ds_diction_touch(diction, crc, combined_token, 0); |
---|
901 | url_token = strtok_r (NULL, DELIMITERS, &ptr); |
---|
902 | } |
---|
903 | memset (token, 32, i); |
---|
904 | token[i] = old; |
---|
905 | url_ptr = token + i; |
---|
906 | token = strcasestr(url_ptr, key); |
---|
907 | } |
---|
908 | return 0; |
---|
909 | } |
---|
910 | |
---|
911 | /* Truncate tokens with EOT delimiters */ |
---|
912 | char * _ds_truncate_token(const char *token) { |
---|
913 | char *tweaked; |
---|
914 | int i; |
---|
915 | |
---|
916 | if (token == NULL) |
---|
917 | return NULL; |
---|
918 | |
---|
919 | tweaked = strdup(token); |
---|
920 | |
---|
921 | if (tweaked == NULL) |
---|
922 | return NULL; |
---|
923 | |
---|
924 | i = strlen(tweaked); |
---|
925 | while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) { |
---|
926 | tweaked[i-1] = 0; |
---|
927 | i--; |
---|
928 | } |
---|
929 | |
---|
930 | return tweaked; |
---|
931 | } |
---|
932 | |
---|
933 | /* |
---|
934 | * _ds_spbh_clear |
---|
935 | * |
---|
936 | * DESCRIPTION |
---|
937 | * Clears the SBPH stack |
---|
938 | * |
---|
939 | * Clears and frees all of the tokens in the SBPH stack. Used when a |
---|
940 | * boundary has been crossed (such as a new message header) where |
---|
941 | * tokens from the previous boundary are no longer useful. |
---|
942 | */ |
---|
943 | |
---|
944 | void _ds_sparse_clear(char **previous_tokens) { |
---|
945 | int i; |
---|
946 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) |
---|
947 | previous_tokens[i] = NULL; |
---|
948 | return; |
---|
949 | } |
---|
950 | |
---|
951 | /* |
---|
952 | * _ds_generate_bitpattern |
---|
953 | * |
---|
954 | * DESCRIPTION |
---|
955 | * Generates a sparse bitpattern for SPARSE_WINDOW_SIZE |
---|
956 | * |
---|
957 | * This pattern is then used to create token patterns when using SBPH or OSB |
---|
958 | * |
---|
959 | */ |
---|
960 | |
---|
961 | char *_ds_generate_bitpattern(int breadth) { |
---|
962 | char *bitpattern; |
---|
963 | u_int32_t mask; |
---|
964 | unsigned long exp; |
---|
965 | int i; |
---|
966 | |
---|
967 | bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth); |
---|
968 | |
---|
969 | for(mask=0;mask<(u_int32_t)breadth;mask++) { |
---|
970 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
971 | exp = (i) ? _ds_pow2(i) : 1; |
---|
972 | /* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */ |
---|
973 | if (mask & exp) |
---|
974 | { |
---|
975 | bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1; |
---|
976 | } |
---|
977 | else |
---|
978 | { |
---|
979 | bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0; |
---|
980 | } |
---|
981 | } |
---|
982 | } |
---|
983 | |
---|
984 | return bitpattern; |
---|
985 | } |
---|
986 | |
---|