[c5c522c] | 1 | /* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */ |
---|
| 2 | |
---|
| 3 | /* |
---|
| 4 | DSPAM |
---|
| 5 | COPYRIGHT (C) 2002-2012 DSPAM PROJECT |
---|
| 6 | |
---|
| 7 | This program is free software: you can redistribute it and/or modify |
---|
| 8 | it under the terms of the GNU Affero General Public License as |
---|
| 9 | published by the Free Software Foundation, either version 3 of the |
---|
| 10 | License, or (at your option) any later version. |
---|
| 11 | |
---|
| 12 | This program is distributed in the hope that it will be useful, |
---|
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 15 | GNU Affero General Public License for more details. |
---|
| 16 | |
---|
| 17 | You should have received a copy of the GNU Affero General Public License |
---|
| 18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 19 | |
---|
| 20 | */ |
---|
| 21 | |
---|
| 22 | /* |
---|
| 23 | * tokenizer.c - tokenizer functions |
---|
| 24 | * |
---|
| 25 | * DESCRIPTION |
---|
| 26 | * The tokenizer subroutines are responsible for decomposing a message into |
---|
| 27 | * its colloquial components. All components are stored collectively in |
---|
| 28 | * a diction object, passed into the function. |
---|
| 29 | * |
---|
| 30 | */ |
---|
| 31 | |
---|
| 32 | #ifdef HAVE_CONFIG_H |
---|
| 33 | #include <auto-config.h> |
---|
| 34 | #endif |
---|
| 35 | |
---|
| 36 | #include <stdio.h> |
---|
| 37 | #include <stdlib.h> |
---|
| 38 | #include <math.h> |
---|
| 39 | #include <ctype.h> |
---|
| 40 | #include <errno.h> |
---|
| 41 | #include <string.h> |
---|
| 42 | #ifdef HAVE_UNISTD_H |
---|
| 43 | #include <unistd.h> |
---|
| 44 | #endif |
---|
| 45 | #include <sys/types.h> |
---|
| 46 | #include <sys/stat.h> |
---|
| 47 | |
---|
| 48 | #ifdef TIME_WITH_SYS_TIME |
---|
| 49 | # include <sys/time.h> |
---|
| 50 | # include <time.h> |
---|
| 51 | #else |
---|
| 52 | # ifdef HAVE_SYS_TIME_H |
---|
| 53 | # include <sys/time.h> |
---|
| 54 | # else |
---|
| 55 | # include <time.h> |
---|
| 56 | # endif |
---|
| 57 | #endif |
---|
| 58 | |
---|
| 59 | #include "config.h" |
---|
| 60 | #include "tokenizer.h" |
---|
| 61 | #include "util.h" |
---|
| 62 | #include "libdspam.h" |
---|
| 63 | #include "language.h" |
---|
| 64 | |
---|
| 65 | /* |
---|
| 66 | * _ds_tokenize() - tokenize the message |
---|
| 67 | * |
---|
| 68 | * DESCRIPTION |
---|
| 69 | * tokenizes the supplied message |
---|
| 70 | * |
---|
| 71 | * INPUT ARGUMENTS |
---|
| 72 | * DSPAM_CTX *CTX pointer to context |
---|
| 73 | * char *header pointer to message header |
---|
| 74 | * char *body pointer to message body |
---|
| 75 | * ds_diction_t diction to store components |
---|
| 76 | * |
---|
| 77 | * RETURN VALUES |
---|
| 78 | * standard errors on failure |
---|
| 79 | * zero if successful |
---|
| 80 | * |
---|
| 81 | */ |
---|
| 82 | |
---|
| 83 | int |
---|
| 84 | _ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction) |
---|
| 85 | { |
---|
| 86 | if (diction == NULL) |
---|
| 87 | return EINVAL; |
---|
| 88 | |
---|
| 89 | if (CTX->tokenizer == DSZ_SBPH || CTX->tokenizer == DSZ_OSB) |
---|
| 90 | return _ds_tokenize_sparse(CTX, headers, body, diction); |
---|
| 91 | else |
---|
| 92 | return _ds_tokenize_ngram(CTX, headers, body, diction); |
---|
| 93 | } |
---|
| 94 | |
---|
| 95 | int _ds_tokenize_ngram( |
---|
| 96 | DSPAM_CTX *CTX, |
---|
| 97 | char *headers, |
---|
| 98 | char *body, |
---|
| 99 | ds_diction_t diction) |
---|
| 100 | { |
---|
| 101 | char *token; /* current token */ |
---|
| 102 | char *previous_token = NULL; /* used for bigrams (chained tokens) */ |
---|
| 103 | char *line = NULL; /* header broken up into lines */ |
---|
| 104 | char *ptrptr; |
---|
| 105 | char heading[128]; /* current heading */ |
---|
| 106 | int l, tokenizer = CTX->tokenizer; |
---|
| 107 | |
---|
| 108 | struct nt *header = NULL; |
---|
| 109 | struct nt_node *node_nt; |
---|
| 110 | struct nt_c c_nt; |
---|
| 111 | |
---|
| 112 | /* Tokenize URLs in message */ |
---|
| 113 | |
---|
| 114 | if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) { |
---|
| 115 | _ds_url_tokenize(diction, body, "http://"); |
---|
| 116 | _ds_url_tokenize(diction, body, "www."); |
---|
| 117 | _ds_url_tokenize(diction, body, "href="); |
---|
| 118 | } |
---|
| 119 | |
---|
| 120 | /* |
---|
| 121 | * Header Tokenization |
---|
| 122 | */ |
---|
| 123 | |
---|
| 124 | header = nt_create (NT_CHAR); |
---|
| 125 | if (header == NULL) |
---|
| 126 | { |
---|
| 127 | LOG (LOG_CRIT, ERR_MEM_ALLOC); |
---|
| 128 | return EUNKNOWN; |
---|
| 129 | } |
---|
| 130 | |
---|
| 131 | line = strtok_r (headers, "\n", &ptrptr); |
---|
| 132 | while (line) { |
---|
| 133 | nt_add (header, line); |
---|
| 134 | line = strtok_r (NULL, "\n", &ptrptr); |
---|
| 135 | } |
---|
| 136 | |
---|
| 137 | node_nt = c_nt_first (header, &c_nt); |
---|
| 138 | heading[0] = 0; |
---|
| 139 | while (node_nt) { |
---|
| 140 | int multiline; |
---|
| 141 | |
---|
| 142 | #ifdef VERBOSE |
---|
| 143 | LOGDEBUG("processing line: %s", node_nt->ptr); |
---|
| 144 | #endif |
---|
| 145 | |
---|
| 146 | line = node_nt->ptr; |
---|
| 147 | token = strtok_r (line, ":", &ptrptr); |
---|
| 148 | if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) |
---|
| 149 | { |
---|
| 150 | multiline = 0; |
---|
| 151 | strlcpy (heading, token, 128); |
---|
| 152 | previous_token = NULL; |
---|
| 153 | } else { |
---|
| 154 | multiline = 1; |
---|
| 155 | } |
---|
| 156 | |
---|
| 157 | #ifdef VERBOSE |
---|
| 158 | LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); |
---|
| 159 | #endif |
---|
| 160 | |
---|
| 161 | if (CTX->flags & DSF_WHITELIST) { |
---|
| 162 | /* Use the entire From: line for auto-whitelisting */ |
---|
| 163 | |
---|
| 164 | if (!strcmp(heading, "From")) { |
---|
| 165 | char wl[256]; |
---|
| 166 | char *fromline = line + 5; |
---|
| 167 | unsigned long long whitelist_token; |
---|
| 168 | |
---|
| 169 | if (fromline[0] == 32) |
---|
| 170 | fromline++; |
---|
| 171 | snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); |
---|
| 172 | whitelist_token = _ds_getcrc64(wl); |
---|
| 173 | ds_diction_touch(diction, whitelist_token, wl, 0); |
---|
| 174 | diction->whitelist_token = whitelist_token; |
---|
| 175 | } |
---|
| 176 | } |
---|
| 177 | |
---|
| 178 | /* Received headers use a different set of delimiters to preserve things |
---|
| 179 | like ip addresses */ |
---|
| 180 | |
---|
| 181 | token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr); |
---|
| 182 | |
---|
| 183 | while (token) |
---|
| 184 | { |
---|
| 185 | l = strlen(token); |
---|
| 186 | |
---|
| 187 | if (l >= 1 && l < 50) |
---|
| 188 | { |
---|
| 189 | #ifdef VERBOSE |
---|
| 190 | LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); |
---|
| 191 | #endif |
---|
| 192 | |
---|
| 193 | /* Process "current" token */ |
---|
| 194 | if (!_ds_process_header_token |
---|
| 195 | (CTX, token, previous_token, diction, heading) && |
---|
| 196 | (tokenizer == DSZ_CHAIN)) |
---|
| 197 | { |
---|
| 198 | previous_token = token; |
---|
| 199 | } |
---|
| 200 | } |
---|
| 201 | |
---|
| 202 | token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr); |
---|
| 203 | } |
---|
| 204 | |
---|
| 205 | previous_token = NULL; |
---|
| 206 | node_nt = c_nt_next (header, &c_nt); |
---|
| 207 | } |
---|
| 208 | |
---|
| 209 | nt_destroy (header); |
---|
| 210 | |
---|
| 211 | /* |
---|
| 212 | * Body Tokenization |
---|
| 213 | */ |
---|
| 214 | |
---|
| 215 | #ifdef VERBOSE |
---|
| 216 | LOGDEBUG("parsing message body"); |
---|
| 217 | #endif |
---|
| 218 | |
---|
| 219 | token = strtok_r (body, DELIMITERS, &ptrptr); |
---|
| 220 | while (token != NULL) |
---|
| 221 | { |
---|
| 222 | l = strlen (token); |
---|
| 223 | if (l >= 1 && l < 50) |
---|
| 224 | { |
---|
| 225 | #ifdef VERBOSE |
---|
| 226 | LOGDEBUG ("Processing body token '%s'", token); |
---|
| 227 | #endif |
---|
| 228 | |
---|
| 229 | /* Process "current" token */ |
---|
| 230 | if ( !_ds_process_body_token(CTX, token, previous_token, diction) |
---|
| 231 | && tokenizer == DSZ_CHAIN) |
---|
| 232 | { |
---|
| 233 | previous_token = token; |
---|
| 234 | } |
---|
| 235 | } |
---|
| 236 | token = strtok_r (NULL, DELIMITERS, &ptrptr); |
---|
| 237 | } |
---|
| 238 | |
---|
| 239 | #ifdef VERBOSE |
---|
| 240 | LOGDEBUG("Finished tokenizing (ngram) message"); |
---|
| 241 | #endif |
---|
| 242 | |
---|
| 243 | /* Final token reassembly (anything left in the buffer) */ |
---|
| 244 | |
---|
| 245 | return 0; |
---|
| 246 | } |
---|
| 247 | |
---|
| 248 | int _ds_tokenize_sparse( |
---|
| 249 | DSPAM_CTX *CTX, |
---|
| 250 | char *headers, |
---|
| 251 | char *body, |
---|
| 252 | ds_diction_t diction) |
---|
| 253 | { |
---|
| 254 | int i; |
---|
| 255 | char *token; /* current token */ |
---|
| 256 | char *previous_tokens[SPARSE_WINDOW_SIZE]; /* sparse chain */ |
---|
| 257 | |
---|
| 258 | char *line = NULL; /* header broken up into lines */ |
---|
| 259 | char *ptrptr; |
---|
| 260 | char *bitpattern; |
---|
| 261 | |
---|
| 262 | char heading[128]; /* current heading */ |
---|
| 263 | int l; |
---|
| 264 | |
---|
| 265 | struct nt *header = NULL; |
---|
| 266 | struct nt_node *node_nt; |
---|
| 267 | struct nt_c c_nt; |
---|
| 268 | |
---|
| 269 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) |
---|
| 270 | previous_tokens[i] = NULL; |
---|
| 271 | |
---|
| 272 | bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE)); |
---|
| 273 | |
---|
| 274 | /* Tokenize URLs in message */ |
---|
| 275 | |
---|
| 276 | if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) |
---|
| 277 | { |
---|
| 278 | _ds_url_tokenize(diction, body, "http://"); |
---|
| 279 | _ds_url_tokenize(diction, body, "www."); |
---|
| 280 | _ds_url_tokenize(diction, body, "href="); |
---|
| 281 | } |
---|
| 282 | |
---|
| 283 | /* |
---|
| 284 | * Header Tokenization |
---|
| 285 | */ |
---|
| 286 | |
---|
| 287 | header = nt_create (NT_CHAR); |
---|
| 288 | if (header == NULL) |
---|
| 289 | { |
---|
| 290 | LOG (LOG_CRIT, ERR_MEM_ALLOC); |
---|
| 291 | free(bitpattern); |
---|
| 292 | return EUNKNOWN; |
---|
| 293 | } |
---|
| 294 | |
---|
| 295 | line = strtok_r (headers, "\n", &ptrptr); |
---|
| 296 | while (line) { |
---|
| 297 | nt_add (header, line); |
---|
| 298 | line = strtok_r (NULL, "\n", &ptrptr); |
---|
| 299 | } |
---|
| 300 | |
---|
| 301 | node_nt = c_nt_first (header, &c_nt); |
---|
| 302 | heading[0] = 0; |
---|
| 303 | while (node_nt) { |
---|
| 304 | int multiline; |
---|
| 305 | |
---|
| 306 | #ifdef VERBOSE |
---|
| 307 | LOGDEBUG("processing line: %s", node_nt->ptr); |
---|
| 308 | #endif |
---|
| 309 | |
---|
| 310 | _ds_sparse_clear(previous_tokens); |
---|
| 311 | |
---|
| 312 | line = node_nt->ptr; |
---|
| 313 | token = strtok_r (line, ":", &ptrptr); |
---|
| 314 | if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) |
---|
| 315 | { |
---|
| 316 | multiline = 0; |
---|
| 317 | strlcpy (heading, token, 128); |
---|
| 318 | _ds_sparse_clear(previous_tokens); |
---|
| 319 | } else { |
---|
| 320 | multiline = 1; |
---|
| 321 | } |
---|
| 322 | |
---|
| 323 | #ifdef VERBOSE |
---|
| 324 | LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); |
---|
| 325 | #endif |
---|
| 326 | |
---|
| 327 | if (CTX->flags & DSF_WHITELIST) { |
---|
| 328 | /* Use the entire From: line for auto-whitelisting */ |
---|
| 329 | |
---|
| 330 | if (!strcmp(heading, "From")) { |
---|
| 331 | char wl[256]; |
---|
| 332 | char *fromline = line + 5; |
---|
| 333 | unsigned long long whitelist_token; |
---|
| 334 | |
---|
| 335 | if (fromline[0] == 32) |
---|
| 336 | fromline++; |
---|
| 337 | snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); |
---|
| 338 | whitelist_token = _ds_getcrc64(wl); |
---|
| 339 | ds_diction_touch(diction, whitelist_token, wl, 0); |
---|
| 340 | diction->whitelist_token = whitelist_token; |
---|
| 341 | } |
---|
| 342 | } |
---|
| 343 | |
---|
| 344 | /* Received headers use a different set of delimiters to preserve things |
---|
| 345 | like ip addresses */ |
---|
| 346 | |
---|
| 347 | token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr); |
---|
| 348 | |
---|
| 349 | while (token) |
---|
| 350 | { |
---|
| 351 | l = strlen(token); |
---|
| 352 | |
---|
| 353 | if (l > 0 && l < 50) |
---|
| 354 | { |
---|
| 355 | #ifdef VERBOSE |
---|
| 356 | LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); |
---|
| 357 | #endif |
---|
| 358 | _ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern); |
---|
| 359 | } |
---|
| 360 | |
---|
| 361 | token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr); |
---|
| 362 | } |
---|
| 363 | |
---|
| 364 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
| 365 | _ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern); |
---|
| 366 | } |
---|
| 367 | |
---|
| 368 | _ds_sparse_clear(previous_tokens); |
---|
| 369 | node_nt = c_nt_next (header, &c_nt); |
---|
| 370 | } |
---|
| 371 | nt_destroy (header); |
---|
| 372 | |
---|
| 373 | /* |
---|
| 374 | * Body Tokenization |
---|
| 375 | */ |
---|
| 376 | |
---|
| 377 | #ifdef VERBOSE |
---|
| 378 | LOGDEBUG("parsing message body"); |
---|
| 379 | #endif |
---|
| 380 | |
---|
| 381 | token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr); |
---|
| 382 | while (token != NULL) |
---|
| 383 | { |
---|
| 384 | l = strlen (token); |
---|
| 385 | if (l > 0 && l < 50) |
---|
| 386 | { |
---|
| 387 | #ifdef VERBOSE |
---|
| 388 | LOGDEBUG ("Processing body token '%s'", token); |
---|
| 389 | #endif |
---|
| 390 | |
---|
| 391 | /* Process "current" token */ |
---|
| 392 | _ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern); |
---|
| 393 | } |
---|
| 394 | token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr); |
---|
| 395 | } |
---|
| 396 | |
---|
| 397 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
| 398 | _ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern); |
---|
| 399 | } |
---|
| 400 | |
---|
| 401 | _ds_sparse_clear(previous_tokens); |
---|
| 402 | |
---|
| 403 | free(bitpattern); |
---|
| 404 | |
---|
| 405 | #ifdef VERBOSE |
---|
| 406 | LOGDEBUG("Finished tokenizing (sparse) message"); |
---|
| 407 | #endif |
---|
| 408 | |
---|
| 409 | return 0; |
---|
| 410 | } |
---|
| 411 | |
---|
| 412 | /* |
---|
| 413 | * _ds_{process,map}_{header,body}_token() |
---|
| 414 | * |
---|
| 415 | * DESCRIPTION |
---|
| 416 | * Token processing and mapping functions |
---|
| 417 | * _ds_process_header_token |
---|
| 418 | * _ds_process_body_token |
---|
| 419 | * _ds_map_header_token |
---|
| 420 | * _ds_map_body_token |
---|
| 421 | * |
---|
| 422 | * These functions are responsible for converting the input words into |
---|
| 423 | * full blown tokens with CRCs, probabilities, and producing variants |
---|
| 424 | * based on the tokenizer approach applied. |
---|
| 425 | */ |
---|
| 426 | |
---|
| 427 | int |
---|
| 428 | _ds_process_header_token (DSPAM_CTX * CTX, char *token, |
---|
| 429 | const char *previous_token, ds_diction_t diction, |
---|
| 430 | const char *heading) |
---|
| 431 | { |
---|
| 432 | char combined_token[256]; |
---|
| 433 | unsigned long long crc; |
---|
| 434 | char *tweaked_token; |
---|
| 435 | |
---|
| 436 | if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading)) |
---|
| 437 | return 0; |
---|
| 438 | |
---|
| 439 | if (!strncmp(heading, "X-DSPAM-", 8)) |
---|
| 440 | return 0; |
---|
| 441 | |
---|
| 442 | /* This is where we used to ignore certain headings */ |
---|
| 443 | |
---|
| 444 | if (heading[0] != 0) |
---|
| 445 | snprintf (combined_token, sizeof (combined_token), |
---|
| 446 | "%s*%s", heading, token); |
---|
| 447 | else |
---|
| 448 | strlcpy (combined_token, token, sizeof (combined_token)); |
---|
| 449 | |
---|
| 450 | tweaked_token = _ds_truncate_token(token); |
---|
| 451 | if (tweaked_token == NULL) |
---|
| 452 | return EUNKNOWN; |
---|
| 453 | |
---|
| 454 | snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token); |
---|
| 455 | |
---|
| 456 | crc = _ds_getcrc64 (combined_token); |
---|
| 457 | #ifdef VERBOSE |
---|
| 458 | LOGDEBUG ("Token Hit: '%s'", combined_token); |
---|
| 459 | #endif |
---|
| 460 | ds_diction_touch(diction, crc, combined_token, 0); |
---|
| 461 | |
---|
| 462 | if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL) |
---|
| 463 | { |
---|
| 464 | char *tweaked_previous; |
---|
| 465 | |
---|
| 466 | tweaked_previous = _ds_truncate_token(previous_token); |
---|
| 467 | if (tweaked_previous == NULL) { |
---|
| 468 | free(tweaked_token); |
---|
| 469 | return EUNKNOWN; |
---|
| 470 | } |
---|
| 471 | |
---|
| 472 | snprintf (combined_token, sizeof (combined_token), |
---|
| 473 | "%s*%s+%s", heading, tweaked_previous, tweaked_token); |
---|
| 474 | crc = _ds_getcrc64 (combined_token); |
---|
| 475 | |
---|
| 476 | ds_diction_touch(diction, crc, combined_token, DSD_CHAINED); |
---|
| 477 | free(tweaked_previous); |
---|
| 478 | } |
---|
| 479 | |
---|
| 480 | free(tweaked_token); |
---|
| 481 | return 0; |
---|
| 482 | } |
---|
| 483 | |
---|
| 484 | int |
---|
| 485 | _ds_process_body_token (DSPAM_CTX * CTX, char *token, |
---|
| 486 | const char *previous_token, ds_diction_t diction) |
---|
| 487 | { |
---|
| 488 | char combined_token[256]; |
---|
| 489 | unsigned long long crc; |
---|
| 490 | char *tweaked_token; |
---|
| 491 | |
---|
| 492 | tweaked_token = _ds_truncate_token(token); |
---|
| 493 | if (tweaked_token == NULL) |
---|
| 494 | return EUNKNOWN; |
---|
| 495 | |
---|
| 496 | crc = _ds_getcrc64 (tweaked_token); |
---|
| 497 | |
---|
| 498 | ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT); |
---|
| 499 | |
---|
| 500 | if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL) |
---|
| 501 | { |
---|
| 502 | char *tweaked_previous = _ds_truncate_token(previous_token); |
---|
| 503 | if (tweaked_previous == NULL) { |
---|
| 504 | free(tweaked_token); |
---|
| 505 | return EUNKNOWN; |
---|
| 506 | } |
---|
| 507 | |
---|
| 508 | snprintf (combined_token, sizeof (combined_token), "%s+%s", |
---|
| 509 | tweaked_previous, tweaked_token); |
---|
| 510 | crc = _ds_getcrc64 (combined_token); |
---|
| 511 | |
---|
| 512 | ds_diction_touch(diction, crc, combined_token, DSD_CHAINED | DSD_CONTEXT); |
---|
| 513 | free(tweaked_previous); |
---|
| 514 | } |
---|
| 515 | free(tweaked_token); |
---|
| 516 | |
---|
| 517 | return 0; |
---|
| 518 | } |
---|
| 519 | |
---|
| 520 | |
---|
| 521 | int |
---|
| 522 | _ds_map_header_token (DSPAM_CTX * CTX, char *token, |
---|
| 523 | char **previous_tokens, ds_diction_t diction, |
---|
| 524 | const char *heading, const char *bitpattern) |
---|
| 525 | { |
---|
| 526 | int i, t, keylen, breadth; |
---|
| 527 | u_int32_t mask; |
---|
| 528 | unsigned long long crc; |
---|
| 529 | char key[256]; |
---|
| 530 | int active = 0, top, tokenizer = CTX->tokenizer; |
---|
| 531 | |
---|
| 532 | if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading)) |
---|
| 533 | return 0; |
---|
| 534 | |
---|
| 535 | if (!strncmp(heading, "X-DSPAM-", 8)) |
---|
| 536 | return 0; |
---|
| 537 | |
---|
| 538 | /* Shift all previous tokens up */ |
---|
| 539 | for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) { |
---|
| 540 | previous_tokens[i] = previous_tokens[i+1]; |
---|
| 541 | if (previous_tokens[i]) |
---|
| 542 | active++; |
---|
| 543 | } |
---|
| 544 | |
---|
| 545 | previous_tokens[SPARSE_WINDOW_SIZE-1] = token; |
---|
| 546 | |
---|
| 547 | if (token) |
---|
| 548 | active++; |
---|
| 549 | |
---|
| 550 | breadth = _ds_pow2(active); |
---|
| 551 | |
---|
| 552 | /* Iterate and generate all keys necessary */ |
---|
| 553 | for (mask=0; mask < (u_int32_t)breadth; mask++) { |
---|
| 554 | int terms = 0; |
---|
| 555 | |
---|
| 556 | key[0] = 0; |
---|
| 557 | keylen = 0; |
---|
| 558 | t = 0; |
---|
| 559 | top = 1; |
---|
| 560 | |
---|
| 561 | /* Each Bit */ |
---|
| 562 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
| 563 | |
---|
| 564 | if (t) { |
---|
| 565 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 566 | key[keylen] = '+'; |
---|
| 567 | key[++keylen] = 0; |
---|
| 568 | } |
---|
| 569 | } |
---|
| 570 | |
---|
| 571 | if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) { |
---|
| 572 | if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) { |
---|
| 573 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 574 | key[keylen] = '#'; |
---|
| 575 | key[++keylen] = 0; |
---|
| 576 | } |
---|
| 577 | } |
---|
| 578 | else |
---|
| 579 | { |
---|
| 580 | int tl = strlen(previous_tokens[i]); |
---|
| 581 | if ((size_t)(keylen + tl) < (sizeof(key)-1)) { |
---|
| 582 | strcpy(key+keylen, previous_tokens[i]); |
---|
| 583 | keylen += tl; |
---|
| 584 | } |
---|
| 585 | terms++; |
---|
| 586 | } |
---|
| 587 | } else { |
---|
| 588 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 589 | key[keylen] = '#'; |
---|
| 590 | key[++keylen] = 0; |
---|
| 591 | } |
---|
| 592 | } |
---|
| 593 | t++; |
---|
| 594 | } |
---|
| 595 | |
---|
| 596 | /* If the bucket has at least 1 literal, hit it */ |
---|
| 597 | if ((tokenizer == DSZ_SBPH && terms != 0) || |
---|
| 598 | (tokenizer == DSZ_OSB && terms == 2)) |
---|
| 599 | { |
---|
| 600 | char hkey[256]; |
---|
| 601 | char *k = key; |
---|
| 602 | while(keylen>2 && !strcmp((key+keylen)-2, "+#")) { |
---|
| 603 | key[keylen-2] = 0; |
---|
| 604 | keylen -=2; |
---|
| 605 | } |
---|
| 606 | while(!strncmp(k, "#+", 2)) { |
---|
| 607 | top = 0; |
---|
| 608 | k+=2; |
---|
| 609 | keylen -= 2; |
---|
| 610 | } |
---|
| 611 | |
---|
| 612 | if (top) { |
---|
| 613 | snprintf(hkey, sizeof(hkey), "%s*%s", heading, k); |
---|
| 614 | crc = _ds_getcrc64(hkey); |
---|
| 615 | ds_diction_touch(diction, crc, hkey, DSD_CONTEXT); |
---|
| 616 | } |
---|
| 617 | } |
---|
| 618 | } |
---|
| 619 | |
---|
| 620 | return 0; |
---|
| 621 | } |
---|
| 622 | |
---|
| 623 | int |
---|
| 624 | _ds_map_body_token ( |
---|
| 625 | DSPAM_CTX * CTX, |
---|
| 626 | char *token, |
---|
| 627 | char **previous_tokens, |
---|
| 628 | ds_diction_t diction, |
---|
| 629 | const char *bitpattern) |
---|
| 630 | { |
---|
| 631 | int i, t, keylen, breadth; |
---|
| 632 | int top, tokenizer = CTX->tokenizer; |
---|
| 633 | unsigned long long crc; |
---|
| 634 | char key[256]; |
---|
| 635 | int active = 0; |
---|
| 636 | u_int32_t mask; |
---|
| 637 | |
---|
| 638 | /* Shift all previous tokens up */ |
---|
| 639 | for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) { |
---|
| 640 | previous_tokens[i] = previous_tokens[i+1]; |
---|
| 641 | if (previous_tokens[i]) |
---|
| 642 | active++; |
---|
| 643 | } |
---|
| 644 | |
---|
| 645 | previous_tokens[SPARSE_WINDOW_SIZE-1] = token; |
---|
| 646 | if (token) |
---|
| 647 | active++; |
---|
| 648 | |
---|
| 649 | breadth = _ds_pow2(active); |
---|
| 650 | |
---|
| 651 | /* Iterate and generate all keys necessary */ |
---|
| 652 | |
---|
| 653 | for(mask=0;mask < (u_int32_t)breadth;mask++) { |
---|
| 654 | int terms = 0; |
---|
| 655 | t = 0; |
---|
| 656 | |
---|
| 657 | key[0] = 0; |
---|
| 658 | keylen = 0; |
---|
| 659 | top = 1; |
---|
| 660 | |
---|
| 661 | /* Each Bit */ |
---|
| 662 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
| 663 | if (t) { |
---|
| 664 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 665 | key[keylen] = '+'; |
---|
| 666 | key[++keylen] = 0; |
---|
| 667 | } |
---|
| 668 | } |
---|
| 669 | if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) { |
---|
| 670 | if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) { |
---|
| 671 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 672 | key[keylen] = '#'; |
---|
| 673 | key[++keylen] = 0; |
---|
| 674 | } |
---|
| 675 | } |
---|
| 676 | else |
---|
| 677 | { |
---|
| 678 | int tl = strlen(previous_tokens[i]); |
---|
| 679 | if ((size_t)(keylen + tl) < (sizeof(key)-1)) { |
---|
| 680 | strcpy(key+keylen, previous_tokens[i]); |
---|
| 681 | keylen += tl; |
---|
| 682 | } |
---|
| 683 | terms++; |
---|
| 684 | } |
---|
| 685 | } else { |
---|
| 686 | if ((size_t)keylen < (sizeof(key)-1)) { |
---|
| 687 | key[keylen] = '#'; |
---|
| 688 | key[++keylen] = 0; |
---|
| 689 | } |
---|
| 690 | } |
---|
| 691 | t++; |
---|
| 692 | } |
---|
| 693 | |
---|
| 694 | /* If the bucket has at least 1 literal, hit it */ |
---|
| 695 | if ((tokenizer == DSZ_SBPH && terms != 0) || |
---|
| 696 | (tokenizer == DSZ_OSB && terms == 2)) |
---|
| 697 | { |
---|
| 698 | char *k = key; |
---|
| 699 | while(keylen>2 && !strcmp((key+keylen)-2, "+#")) { |
---|
| 700 | key[keylen-2] = 0; |
---|
| 701 | keylen -=2; |
---|
| 702 | } |
---|
| 703 | while(!strncmp(k, "#+", 2)) { |
---|
| 704 | top = 0; |
---|
| 705 | k+=2; |
---|
| 706 | keylen -=2; |
---|
| 707 | } |
---|
| 708 | |
---|
| 709 | if (top) { |
---|
| 710 | crc = _ds_getcrc64(k); |
---|
| 711 | ds_diction_touch(diction, crc, k, DSD_CONTEXT); |
---|
| 712 | } |
---|
| 713 | } |
---|
| 714 | } |
---|
| 715 | |
---|
| 716 | return 0; |
---|
| 717 | } |
---|
| 718 | |
---|
| 719 | /* |
---|
| 720 | * _ds_degenerate_message() |
---|
| 721 | * |
---|
| 722 | * DESCRIPTION |
---|
| 723 | * Degenerate the message into headers, body and tokenizable pieces |
---|
| 724 | * |
---|
| 725 | * This function is responsible for analyzing the actualized message and |
---|
| 726 | * degenerating it into only the components which are tokenizable. This |
---|
| 727 | * process effectively eliminates much HTML noise, special symbols, or |
---|
| 728 | * other non-tokenizable/non-desirable components. What is left is the |
---|
| 729 | * bulk of the message and only desired tags, URLs, and other data. |
---|
| 730 | * |
---|
| 731 | * INPUT ARGUMENTS |
---|
| 732 | * header pointer to buffer containing headers |
---|
| 733 | * body pointer to buffer containing message body |
---|
| 734 | */ |
---|
| 735 | |
---|
| 736 | int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body) |
---|
| 737 | { |
---|
| 738 | char *decode = NULL; |
---|
| 739 | struct nt_node *node_nt, *node_header; |
---|
| 740 | struct nt_c c_nt, c_nt2; |
---|
| 741 | int i = 0; |
---|
| 742 | char heading[1024]; |
---|
| 743 | |
---|
| 744 | if (! CTX->message) |
---|
| 745 | { |
---|
| 746 | LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL"); |
---|
| 747 | return EUNKNOWN; |
---|
| 748 | } |
---|
| 749 | |
---|
| 750 | /* Iterate through each component and create large header/body buffers */ |
---|
| 751 | |
---|
| 752 | node_nt = c_nt_first (CTX->message->components, &c_nt); |
---|
| 753 | while (node_nt != NULL) |
---|
| 754 | { |
---|
| 755 | struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr; |
---|
| 756 | |
---|
| 757 | #ifdef VERBOSE |
---|
| 758 | LOGDEBUG ("Processing component %d", i); |
---|
| 759 | #endif |
---|
| 760 | |
---|
| 761 | if (! block->headers || ! block->headers->items) |
---|
| 762 | { |
---|
| 763 | #ifdef VERBOSE |
---|
| 764 | LOGDEBUG (" : End of Message Identifier"); |
---|
| 765 | #endif |
---|
| 766 | } |
---|
| 767 | |
---|
| 768 | else |
---|
| 769 | { |
---|
| 770 | struct _ds_header_field *current_header; |
---|
| 771 | |
---|
| 772 | /* Accumulate the headers */ |
---|
| 773 | node_header = c_nt_first (block->headers, &c_nt2); |
---|
| 774 | while (node_header != NULL) |
---|
| 775 | { |
---|
| 776 | current_header = (struct _ds_header_field *) node_header->ptr; |
---|
| 777 | snprintf (heading, sizeof (heading), |
---|
| 778 | "%s: %s\n", current_header->heading, |
---|
| 779 | current_header->data); |
---|
| 780 | buffer_cat (header, heading); |
---|
| 781 | node_header = c_nt_next (block->headers, &c_nt2); |
---|
| 782 | } |
---|
| 783 | |
---|
| 784 | decode = block->body->data; |
---|
| 785 | |
---|
| 786 | if (block->media_type == MT_TEXT || |
---|
| 787 | block->media_type == MT_MESSAGE || |
---|
| 788 | block->media_type == MT_UNKNOWN || |
---|
| 789 | (block->media_type == MT_MULTIPART && !i)) |
---|
| 790 | { |
---|
| 791 | /* Accumulate the bodies, skip attachments */ |
---|
| 792 | |
---|
| 793 | if ( |
---|
| 794 | ( block->encoding == EN_BASE64 |
---|
| 795 | || block->encoding == EN_QUOTED_PRINTABLE) |
---|
| 796 | && ! block->original_signed_body) |
---|
| 797 | { |
---|
| 798 | if (block->content_disposition != PCD_ATTACHMENT) |
---|
| 799 | { |
---|
| 800 | LOGDEBUG ("decoding message block from encoding type %d", |
---|
| 801 | block->encoding); |
---|
| 802 | decode = _ds_decode_block (block); |
---|
| 803 | } |
---|
| 804 | } |
---|
| 805 | |
---|
| 806 | /* We found a tokenizable body component, add prefilters */ |
---|
| 807 | |
---|
| 808 | if (decode) |
---|
| 809 | { |
---|
| 810 | char *decode2 = NULL; |
---|
| 811 | char *decode3 = NULL; |
---|
| 812 | |
---|
| 813 | /* -- PREFILTERS BEGIN -- */ |
---|
| 814 | |
---|
| 815 | /* Hexadecimal 8-Bit Encodings */ |
---|
| 816 | |
---|
| 817 | if (block->encoding == EN_8BIT) { |
---|
| 818 | decode2 = _ds_decode_hex8bit(decode); |
---|
| 819 | } else { |
---|
| 820 | decode2 = strdup(decode); |
---|
| 821 | } |
---|
| 822 | |
---|
| 823 | /* HTML-Specific Filters */ |
---|
| 824 | |
---|
| 825 | if (decode2) { |
---|
| 826 | if (block->media_subtype == MST_HTML) { |
---|
| 827 | decode3 = _ds_strip_html(decode2); |
---|
| 828 | } else { |
---|
| 829 | decode3 = strdup(decode2); |
---|
| 830 | } |
---|
| 831 | free(decode2); |
---|
| 832 | } |
---|
| 833 | |
---|
| 834 | /* -- PREFILTERS END -- */ |
---|
| 835 | |
---|
| 836 | if (decode3) { |
---|
| 837 | buffer_cat (body, decode3); |
---|
| 838 | free(decode3); |
---|
| 839 | } |
---|
| 840 | |
---|
| 841 | /* If we've decoded the body, save the original copy */ |
---|
| 842 | if (decode != block->body->data) |
---|
| 843 | { |
---|
| 844 | block->original_signed_body = block->body; |
---|
| 845 | block->body = buffer_create (decode); |
---|
| 846 | free (decode); |
---|
| 847 | } |
---|
| 848 | } |
---|
| 849 | } |
---|
| 850 | } |
---|
| 851 | #ifdef VERBOSE |
---|
| 852 | LOGDEBUG ("Getting next message component"); |
---|
| 853 | #endif |
---|
| 854 | node_nt = c_nt_next (CTX->message->components, &c_nt); |
---|
| 855 | i++; |
---|
| 856 | } /* while (node_nt != NULL) */ |
---|
| 857 | |
---|
| 858 | if (header->data == NULL) |
---|
| 859 | buffer_cat (header, " "); |
---|
| 860 | |
---|
| 861 | if (body->data == NULL) |
---|
| 862 | buffer_cat (body, " "); |
---|
| 863 | |
---|
| 864 | return 0; |
---|
| 865 | } |
---|
| 866 | |
---|
| 867 | int _ds_url_tokenize(ds_diction_t diction, char *body, const char *key) |
---|
| 868 | { |
---|
| 869 | char *token, *url_ptr, *url_token, *ptr; |
---|
| 870 | char combined_token[256]; |
---|
| 871 | unsigned long long crc; |
---|
| 872 | int key_len = strlen(key); |
---|
| 873 | |
---|
| 874 | #ifdef VERBOSE |
---|
| 875 | LOGDEBUG("scanning for urls: %s\n", key); |
---|
| 876 | #endif |
---|
| 877 | if (!body) |
---|
| 878 | return EINVAL; |
---|
| 879 | url_ptr = body; |
---|
| 880 | |
---|
| 881 | token = strcasestr(url_ptr, key); |
---|
| 882 | while (token != NULL) |
---|
| 883 | { |
---|
| 884 | int i = 0, old; |
---|
| 885 | |
---|
| 886 | while(token[i] |
---|
| 887 | && token[i] > 32 |
---|
| 888 | && token[i] != '>' |
---|
| 889 | && ((token[i] != '\"' && token[i] != '\'') || i <= key_len)) |
---|
| 890 | i++; |
---|
| 891 | old = token[i]; |
---|
| 892 | token[i] = 0; /* parse in place */ |
---|
| 893 | |
---|
| 894 | /* Tokenize URL */ |
---|
| 895 | url_token = strtok_r (token, DELIMITERS, &ptr); |
---|
| 896 | while (url_token != NULL) |
---|
| 897 | { |
---|
| 898 | snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token); |
---|
| 899 | crc = _ds_getcrc64 (combined_token); |
---|
| 900 | ds_diction_touch(diction, crc, combined_token, 0); |
---|
| 901 | url_token = strtok_r (NULL, DELIMITERS, &ptr); |
---|
| 902 | } |
---|
| 903 | memset (token, 32, i); |
---|
| 904 | token[i] = old; |
---|
| 905 | url_ptr = token + i; |
---|
| 906 | token = strcasestr(url_ptr, key); |
---|
| 907 | } |
---|
| 908 | return 0; |
---|
| 909 | } |
---|
| 910 | |
---|
| 911 | /* Truncate tokens with EOT delimiters */ |
---|
| 912 | char * _ds_truncate_token(const char *token) { |
---|
| 913 | char *tweaked; |
---|
| 914 | int i; |
---|
| 915 | |
---|
| 916 | if (token == NULL) |
---|
| 917 | return NULL; |
---|
| 918 | |
---|
| 919 | tweaked = strdup(token); |
---|
| 920 | |
---|
| 921 | if (tweaked == NULL) |
---|
| 922 | return NULL; |
---|
| 923 | |
---|
| 924 | i = strlen(tweaked); |
---|
| 925 | while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) { |
---|
| 926 | tweaked[i-1] = 0; |
---|
| 927 | i--; |
---|
| 928 | } |
---|
| 929 | |
---|
| 930 | return tweaked; |
---|
| 931 | } |
---|
| 932 | |
---|
| 933 | /* |
---|
| 934 | * _ds_spbh_clear |
---|
| 935 | * |
---|
| 936 | * DESCRIPTION |
---|
| 937 | * Clears the SBPH stack |
---|
| 938 | * |
---|
| 939 | * Clears and frees all of the tokens in the SBPH stack. Used when a |
---|
| 940 | * boundary has been crossed (such as a new message header) where |
---|
| 941 | * tokens from the previous boundary are no longer useful. |
---|
| 942 | */ |
---|
| 943 | |
---|
| 944 | void _ds_sparse_clear(char **previous_tokens) { |
---|
| 945 | int i; |
---|
| 946 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) |
---|
| 947 | previous_tokens[i] = NULL; |
---|
| 948 | return; |
---|
| 949 | } |
---|
| 950 | |
---|
| 951 | /* |
---|
| 952 | * _ds_generate_bitpattern |
---|
| 953 | * |
---|
| 954 | * DESCRIPTION |
---|
| 955 | * Generates a sparse bitpattern for SPARSE_WINDOW_SIZE |
---|
| 956 | * |
---|
| 957 | * This pattern is then used to create token patterns when using SBPH or OSB |
---|
| 958 | * |
---|
| 959 | */ |
---|
| 960 | |
---|
| 961 | char *_ds_generate_bitpattern(int breadth) { |
---|
| 962 | char *bitpattern; |
---|
| 963 | u_int32_t mask; |
---|
| 964 | unsigned long exp; |
---|
| 965 | int i; |
---|
| 966 | |
---|
| 967 | bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth); |
---|
| 968 | |
---|
| 969 | for(mask=0;mask<(u_int32_t)breadth;mask++) { |
---|
| 970 | for(i=0;i<SPARSE_WINDOW_SIZE;i++) { |
---|
| 971 | exp = (i) ? _ds_pow2(i) : 1; |
---|
| 972 | /* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */ |
---|
| 973 | if (mask & exp) |
---|
| 974 | { |
---|
| 975 | bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1; |
---|
| 976 | } |
---|
| 977 | else |
---|
| 978 | { |
---|
| 979 | bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0; |
---|
| 980 | } |
---|
| 981 | } |
---|
| 982 | } |
---|
| 983 | |
---|
| 984 | return bitpattern; |
---|
| 985 | } |
---|
| 986 | |
---|