source: npl/mailserver/dspam/dspam-3.10.2/src/tokenizer.c @ c5c522c

gcc484ntopperl-5.22
Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100644
File size: 23.3 KB
Line 
1/* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */
2
3/*
4 DSPAM
5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as
9 published by the Free Software Foundation, either version 3 of the
10 License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20*/
21
22/*
23 * tokenizer.c - tokenizer functions
24 *
25 * DESCRIPTION
26 *   The tokenizer subroutines are responsible for decomposing a message into
27 *   its colloquial components. All components are stored collectively in
28 *   a diction object, passed into the function.
29 *
30 */
31
32#ifdef HAVE_CONFIG_H
33#include <auto-config.h>
34#endif
35
36#include <stdio.h>
37#include <stdlib.h>
38#include <math.h>
39#include <ctype.h>
40#include <errno.h>
41#include <string.h>
42#ifdef HAVE_UNISTD_H
43#include <unistd.h>
44#endif
45#include <sys/types.h>
46#include <sys/stat.h>
47
48#ifdef TIME_WITH_SYS_TIME
49#   include <sys/time.h>
50#   include <time.h>
51#else
52#   ifdef HAVE_SYS_TIME_H
53#       include <sys/time.h>
54#   else
55#       include <time.h>
56#   endif
57#endif
58
59#include "config.h"
60#include "tokenizer.h"
61#include "util.h"
62#include "libdspam.h"
63#include "language.h"
64
65/*
66 * _ds_tokenize() - tokenize the message
67 *
68 * DESCRIPTION
69 *    tokenizes the supplied message
70 *
71 * INPUT ARGUMENTS
72 *     DSPAM_CTX *CTX    pointer to context
73 *     char *header      pointer to message header
74 *     char *body        pointer to message body
75 *     ds_diction_t      diction to store components
76 *
77 * RETURN VALUES
78 *   standard errors on failure
79 *   zero if successful
80 *
81 */
82
83int
84_ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction)
85{
86  if (diction == NULL)
87    return EINVAL;
88
89  if (CTX->tokenizer == DSZ_SBPH || CTX->tokenizer == DSZ_OSB)
90    return _ds_tokenize_sparse(CTX, headers, body, diction);
91  else
92    return _ds_tokenize_ngram(CTX, headers, body, diction);
93}
94
95int _ds_tokenize_ngram(
96  DSPAM_CTX *CTX,
97  char *headers,
98  char *body,
99  ds_diction_t diction)
100{
101  char *token;                          /* current token */
102  char *previous_token = NULL;          /* used for bigrams (chained tokens) */
103  char *line = NULL;                    /* header broken up into lines */
104  char *ptrptr;
105  char heading[128];                    /* current heading */
106  int l, tokenizer = CTX->tokenizer;
107
108  struct nt *header = NULL;
109  struct nt_node *node_nt;
110  struct nt_c c_nt;
111
112  /* Tokenize URLs in message */
113
114  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))  {
115    _ds_url_tokenize(diction, body, "http://");
116    _ds_url_tokenize(diction, body, "www.");
117    _ds_url_tokenize(diction, body, "href=");
118  }
119
120  /*
121   * Header Tokenization
122   */
123 
124  header = nt_create (NT_CHAR);
125  if (header == NULL)
126  {
127    LOG (LOG_CRIT, ERR_MEM_ALLOC);
128    return EUNKNOWN;
129  }
130
131  line = strtok_r (headers, "\n", &ptrptr);
132  while (line) {
133    nt_add (header, line);
134    line = strtok_r (NULL, "\n", &ptrptr);
135  }
136
137  node_nt = c_nt_first (header, &c_nt);
138  heading[0] = 0;
139  while (node_nt) {
140    int multiline;
141
142#ifdef VERBOSE
143    LOGDEBUG("processing line: %s", node_nt->ptr);
144#endif
145
146    line = node_nt->ptr;
147    token = strtok_r (line, ":", &ptrptr);
148    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
149    {
150      multiline = 0;
151      strlcpy (heading, token, 128);
152      previous_token = NULL;
153    } else {
154      multiline = 1;
155    }
156
157#ifdef VERBOSE
158    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
159#endif
160
161    if (CTX->flags & DSF_WHITELIST) {
162      /* Use the entire From: line for auto-whitelisting */
163
164      if (!strcmp(heading, "From")) {
165        char wl[256];
166        char *fromline = line + 5;
167        unsigned long long whitelist_token;
168
169        if (fromline[0] == 32)
170          fromline++;
171        snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
172        whitelist_token = _ds_getcrc64(wl);
173        ds_diction_touch(diction, whitelist_token, wl, 0);
174        diction->whitelist_token = whitelist_token;
175      }
176    }
177
178    /* Received headers use a different set of delimiters to preserve things
179       like ip addresses */
180
181    token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr);
182
183    while (token)
184    {
185      l = strlen(token);
186
187      if (l >= 1 && l < 50)
188      {
189#ifdef VERBOSE
190        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
191#endif
192
193        /* Process "current" token */
194        if (!_ds_process_header_token
195            (CTX, token, previous_token, diction, heading) &&
196            (tokenizer == DSZ_CHAIN))
197        {
198          previous_token = token;
199        }
200      }
201
202      token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr);
203    }
204
205    previous_token = NULL;
206    node_nt = c_nt_next (header, &c_nt);
207  }
208
209  nt_destroy (header);
210
211  /*
212   * Body Tokenization
213   */
214
215#ifdef VERBOSE
216  LOGDEBUG("parsing message body");
217#endif
218
219  token = strtok_r (body, DELIMITERS, &ptrptr);
220  while (token != NULL)
221  {
222    l = strlen (token);
223    if (l >= 1 && l < 50)
224    {
225#ifdef VERBOSE
226        LOGDEBUG ("Processing body token '%s'", token);
227#endif
228
229      /* Process "current" token */
230      if ( !_ds_process_body_token(CTX, token, previous_token, diction)
231        && tokenizer == DSZ_CHAIN)
232      {
233        previous_token = token;
234      }
235    }
236    token = strtok_r (NULL, DELIMITERS, &ptrptr);
237  }
238
239#ifdef VERBOSE
240  LOGDEBUG("Finished tokenizing (ngram) message");
241#endif
242
243  /* Final token reassembly (anything left in the buffer) */
244
245  return 0;
246}
247
248int _ds_tokenize_sparse(
249  DSPAM_CTX *CTX,
250  char *headers,
251  char *body,
252  ds_diction_t diction)
253{
254  int i;
255  char *token;                          /* current token */
256  char *previous_tokens[SPARSE_WINDOW_SIZE];    /* sparse chain */
257
258  char *line = NULL;                    /* header broken up into lines */
259  char *ptrptr;
260  char *bitpattern;
261
262  char heading[128];                    /* current heading */
263  int l;
264
265  struct nt *header = NULL;
266  struct nt_node *node_nt;
267  struct nt_c c_nt;
268
269  for(i=0;i<SPARSE_WINDOW_SIZE;i++)
270    previous_tokens[i] = NULL;
271
272  bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE));
273
274  /* Tokenize URLs in message */
275
276  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))
277  {
278    _ds_url_tokenize(diction, body, "http://");
279    _ds_url_tokenize(diction, body, "www.");
280    _ds_url_tokenize(diction, body, "href=");
281  }
282
283  /*
284   * Header Tokenization
285   */
286 
287  header = nt_create (NT_CHAR);
288  if (header == NULL)
289  {
290    LOG (LOG_CRIT, ERR_MEM_ALLOC);
291    free(bitpattern);
292    return EUNKNOWN;
293  }
294
295  line = strtok_r (headers, "\n", &ptrptr);
296  while (line) {
297    nt_add (header, line);
298    line = strtok_r (NULL, "\n", &ptrptr);
299  }
300
301  node_nt = c_nt_first (header, &c_nt);
302  heading[0] = 0;
303  while (node_nt) {
304    int multiline;
305
306#ifdef VERBOSE
307    LOGDEBUG("processing line: %s", node_nt->ptr);
308#endif
309
310    _ds_sparse_clear(previous_tokens);
311
312    line = node_nt->ptr;
313    token = strtok_r (line, ":", &ptrptr);
314    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
315    {
316      multiline = 0;
317      strlcpy (heading, token, 128);
318      _ds_sparse_clear(previous_tokens);
319    } else {
320      multiline = 1;
321    }
322
323#ifdef VERBOSE
324    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
325#endif
326
327    if (CTX->flags & DSF_WHITELIST) {
328      /* Use the entire From: line for auto-whitelisting */
329
330      if (!strcmp(heading, "From")) {
331        char wl[256];
332        char *fromline = line + 5;
333        unsigned long long whitelist_token;
334
335        if (fromline[0] == 32)
336          fromline++;
337        snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
338        whitelist_token = _ds_getcrc64(wl);
339        ds_diction_touch(diction, whitelist_token, wl, 0);
340        diction->whitelist_token = whitelist_token;
341      }
342    }
343
344    /* Received headers use a different set of delimiters to preserve things
345       like ip addresses */
346
347    token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
348
349    while (token)
350    {
351      l = strlen(token);
352
353      if (l > 0 && l < 50)
354      {
355#ifdef VERBOSE
356        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
357#endif
358        _ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern);
359      }
360
361      token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
362    }
363
364    for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
365      _ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern);
366    }
367
368    _ds_sparse_clear(previous_tokens);
369    node_nt = c_nt_next (header, &c_nt);
370  }
371  nt_destroy (header);
372
373  /*
374   * Body Tokenization
375   */
376
377#ifdef VERBOSE
378  LOGDEBUG("parsing message body");
379#endif
380
381  token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr);
382  while (token != NULL)
383  {
384    l = strlen (token);
385    if (l > 0 && l < 50)
386    {
387#ifdef VERBOSE
388        LOGDEBUG ("Processing body token '%s'", token);
389#endif
390
391      /* Process "current" token */
392      _ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern);
393    }
394    token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr);
395  }
396
397  for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
398    _ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern);
399  }
400
401  _ds_sparse_clear(previous_tokens);
402
403  free(bitpattern);
404
405#ifdef VERBOSE
406  LOGDEBUG("Finished tokenizing (sparse) message");
407#endif
408
409  return 0;
410}
411
412/*
413 * _ds_{process,map}_{header,body}_token()
414 *
415 * DESCRIPTION
416 *  Token processing and mapping functions
417 *    _ds_process_header_token
418 *    _ds_process_body_token
419 *    _ds_map_header_token
420 *    _ds_map_body_token
421 *
422 *  These functions are responsible for converting the input words into
423 *  full blown tokens with CRCs, probabilities, and producing variants
424 *  based on the tokenizer approach applied.
425 */
426 
427int
428_ds_process_header_token (DSPAM_CTX * CTX, char *token,
429                          const char *previous_token, ds_diction_t diction,
430                          const char *heading)
431{
432  char combined_token[256];
433  unsigned long long crc;
434  char *tweaked_token;
435
436  if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
437    return 0;
438
439  if (!strncmp(heading, "X-DSPAM-", 8))
440    return 0;
441
442  /* This is where we used to ignore certain headings */
443
444  if (heading[0] != 0)
445    snprintf (combined_token, sizeof (combined_token),
446              "%s*%s", heading, token);
447  else
448    strlcpy (combined_token, token, sizeof (combined_token));
449
450  tweaked_token = _ds_truncate_token(token);
451  if (tweaked_token == NULL)
452    return EUNKNOWN;
453
454  snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token);
455
456  crc = _ds_getcrc64 (combined_token);
457#ifdef VERBOSE
458  LOGDEBUG ("Token Hit: '%s'", combined_token);
459#endif
460  ds_diction_touch(diction, crc, combined_token, 0);
461
462  if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
463  {
464    char *tweaked_previous;
465
466    tweaked_previous = _ds_truncate_token(previous_token);
467    if (tweaked_previous == NULL) {
468      free(tweaked_token);
469      return EUNKNOWN;
470    }
471
472    snprintf (combined_token, sizeof (combined_token),
473              "%s*%s+%s", heading, tweaked_previous, tweaked_token);
474    crc = _ds_getcrc64 (combined_token);
475
476    ds_diction_touch(diction, crc, combined_token, DSD_CHAINED);
477    free(tweaked_previous);
478  }
479
480  free(tweaked_token);
481  return 0;
482}
483
484int
485_ds_process_body_token (DSPAM_CTX * CTX, char *token,
486                        const char *previous_token, ds_diction_t diction)
487{
488  char combined_token[256];
489  unsigned long long crc;
490  char *tweaked_token;
491
492  tweaked_token = _ds_truncate_token(token);
493  if (tweaked_token == NULL)
494    return EUNKNOWN;
495
496  crc = _ds_getcrc64 (tweaked_token);
497
498  ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT);
499
500  if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
501  {
502    char *tweaked_previous = _ds_truncate_token(previous_token);
503    if (tweaked_previous == NULL) {
504      free(tweaked_token);
505      return EUNKNOWN;
506    }
507
508    snprintf (combined_token, sizeof (combined_token), "%s+%s",
509              tweaked_previous, tweaked_token);
510    crc = _ds_getcrc64 (combined_token);
511
512    ds_diction_touch(diction, crc, combined_token, DSD_CHAINED | DSD_CONTEXT);
513    free(tweaked_previous);
514  }
515  free(tweaked_token);
516
517  return 0;
518}
519
520
521int
522_ds_map_header_token (DSPAM_CTX * CTX, char *token,
523                      char **previous_tokens, ds_diction_t diction,
524                      const char *heading, const char *bitpattern)
525{
526  int i, t, keylen, breadth;
527  u_int32_t mask;
528  unsigned long long crc;
529  char key[256];
530  int active = 0, top, tokenizer = CTX->tokenizer;
531
532  if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
533    return 0;
534
535  if (!strncmp(heading, "X-DSPAM-", 8))
536    return 0;
537
538  /* Shift all previous tokens up */
539  for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
540    previous_tokens[i] = previous_tokens[i+1];
541    if (previous_tokens[i])
542      active++;
543  }
544
545  previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
546
547  if (token)
548    active++;
549
550  breadth = _ds_pow2(active);
551 
552  /* Iterate and generate all keys necessary */
553  for (mask=0; mask < (u_int32_t)breadth; mask++) {
554    int terms = 0;
555
556    key[0] = 0;
557    keylen = 0;
558    t = 0;
559    top = 1;
560
561    /* Each Bit */
562    for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
563
564      if (t) {
565        if ((size_t)keylen < (sizeof(key)-1)) {
566          key[keylen] = '+';
567          key[++keylen] = 0;
568        }
569      }
570
571      if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
572        if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) {
573          if ((size_t)keylen < (sizeof(key)-1)) {
574            key[keylen] = '#';
575            key[++keylen] = 0;
576          }
577        }
578        else
579        {
580          int tl = strlen(previous_tokens[i]);
581          if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
582            strcpy(key+keylen, previous_tokens[i]);
583            keylen += tl;
584          }
585          terms++;
586        }
587      } else {
588        if ((size_t)keylen < (sizeof(key)-1)) {
589          key[keylen] = '#';
590          key[++keylen] = 0;
591        }
592      }
593      t++;
594    }
595
596    /* If the bucket has at least 1 literal, hit it */
597    if ((tokenizer == DSZ_SBPH && terms != 0) ||
598        (tokenizer == DSZ_OSB  && terms == 2))
599    {
600      char hkey[256];
601      char *k = key;
602      while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
603        key[keylen-2] = 0;
604        keylen -=2;
605      }
606      while(!strncmp(k, "#+", 2)) {
607        top = 0;
608        k+=2;
609        keylen -= 2;
610      }
611
612      if (top) {
613        snprintf(hkey, sizeof(hkey), "%s*%s", heading, k);
614        crc = _ds_getcrc64(hkey);
615        ds_diction_touch(diction, crc, hkey, DSD_CONTEXT);
616      }
617    }
618  }
619
620  return 0;
621}
622
623int
624_ds_map_body_token (
625  DSPAM_CTX * CTX,
626  char *token,
627  char **previous_tokens,
628  ds_diction_t diction,
629  const char *bitpattern)
630{
631  int i, t, keylen, breadth;
632  int top, tokenizer = CTX->tokenizer;
633  unsigned long long crc;
634  char key[256];
635  int active = 0;
636  u_int32_t mask;
637
638  /* Shift all previous tokens up */
639  for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
640    previous_tokens[i] = previous_tokens[i+1];
641    if (previous_tokens[i])
642      active++;
643  }
644
645  previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
646  if (token)
647    active++;
648
649  breadth = _ds_pow2(active);
650
651  /* Iterate and generate all keys necessary */
652
653  for(mask=0;mask < (u_int32_t)breadth;mask++) {
654    int terms = 0;
655    t = 0;
656
657    key[0] = 0;
658    keylen = 0;
659    top = 1;
660
661    /* Each Bit */
662    for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
663      if (t) {
664        if ((size_t)keylen < (sizeof(key)-1)) {
665           key[keylen] = '+';
666           key[++keylen] = 0;
667        }
668      }
669      if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
670        if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) {
671          if ((size_t)keylen < (sizeof(key)-1)) {
672            key[keylen] = '#';
673            key[++keylen] = 0;
674          }
675        }
676        else
677        {
678          int tl = strlen(previous_tokens[i]);
679          if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
680            strcpy(key+keylen, previous_tokens[i]);
681            keylen += tl;
682          }
683          terms++;
684        }
685      } else {
686        if ((size_t)keylen < (sizeof(key)-1)) {
687          key[keylen] = '#';
688          key[++keylen] = 0;
689        }
690      }
691      t++;
692    }
693
694    /* If the bucket has at least 1 literal, hit it */
695    if ((tokenizer == DSZ_SBPH && terms != 0) ||
696        (tokenizer == DSZ_OSB  && terms == 2))
697    {
698      char *k = key;
699      while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
700        key[keylen-2] = 0;
701        keylen -=2;
702      }
703      while(!strncmp(k, "#+", 2)) {
704        top = 0;
705        k+=2;
706        keylen -=2;
707      }
708 
709      if (top) {
710        crc = _ds_getcrc64(k);
711        ds_diction_touch(diction, crc, k, DSD_CONTEXT);
712      }
713    }
714  }
715
716  return 0;
717}
718
719/*
720 *  _ds_degenerate_message()
721 *
722 * DESCRIPTION
723 *   Degenerate the message into headers, body and tokenizable pieces
724 *
725 *   This function is responsible for analyzing the actualized message and
726 *   degenerating it into only the components which are tokenizable.  This
727 *   process  effectively eliminates much HTML noise, special symbols,  or
728 *   other  non-tokenizable/non-desirable components. What is left  is the
729 *   bulk of  the message  and only  desired tags,  URLs, and other  data.
730 *
731 * INPUT ARGUMENTS
732 *      header    pointer to buffer containing headers
733 *      body      pointer to buffer containing message body
734 */
735
736int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body)
737{
738  char *decode = NULL;
739  struct nt_node *node_nt, *node_header;
740  struct nt_c c_nt, c_nt2;
741  int i = 0;
742  char heading[1024];
743
744  if (! CTX->message)
745  {
746    LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL");
747    return EUNKNOWN;
748  }
749
750  /* Iterate through each component and create large header/body buffers */
751
752  node_nt = c_nt_first (CTX->message->components, &c_nt);
753  while (node_nt != NULL)
754  {
755    struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr;
756
757#ifdef VERBOSE
758    LOGDEBUG ("Processing component %d", i);
759#endif
760
761    if (! block->headers || ! block->headers->items)
762    {
763#ifdef VERBOSE
764      LOGDEBUG ("  : End of Message Identifier");
765#endif
766    }
767
768    else
769    {
770      struct _ds_header_field *current_header;
771
772      /* Accumulate the headers */
773      node_header = c_nt_first (block->headers, &c_nt2);
774      while (node_header != NULL)
775      {
776        current_header = (struct _ds_header_field *) node_header->ptr;
777        snprintf (heading, sizeof (heading),
778                  "%s: %s\n", current_header->heading,
779                  current_header->data);
780        buffer_cat (header, heading);
781        node_header = c_nt_next (block->headers, &c_nt2);
782      }
783
784      decode = block->body->data;
785
786      if (block->media_type == MT_TEXT    ||
787               block->media_type == MT_MESSAGE ||
788               block->media_type == MT_UNKNOWN ||
789               (block->media_type == MT_MULTIPART && !i))
790      {
791        /* Accumulate the bodies, skip attachments */
792
793        if (
794             (   block->encoding == EN_BASE64
795              || block->encoding == EN_QUOTED_PRINTABLE)
796            && ! block->original_signed_body)
797        {
798          if (block->content_disposition != PCD_ATTACHMENT)
799          {
800            LOGDEBUG ("decoding message block from encoding type %d",
801                      block->encoding);
802            decode = _ds_decode_block (block);
803          }
804        }
805
806        /* We found a tokenizable body component, add prefilters */
807
808        if (decode)
809        {
810          char *decode2 = NULL;
811          char *decode3 = NULL;
812
813          /* -- PREFILTERS BEGIN -- */
814
815          /* Hexadecimal 8-Bit Encodings */
816
817          if (block->encoding == EN_8BIT) {
818            decode2 = _ds_decode_hex8bit(decode);
819          } else {
820            decode2 = strdup(decode);
821          }
822
823          /* HTML-Specific Filters */
824
825          if (decode2) {
826            if (block->media_subtype == MST_HTML) {
827              decode3 = _ds_strip_html(decode2);
828            } else {
829              decode3 = strdup(decode2);
830            }
831            free(decode2);
832          }
833
834          /* -- PREFILTERS END -- */
835
836          if (decode3) {
837            buffer_cat (body, decode3);
838            free(decode3);
839          }
840
841          /* If we've decoded the body, save the original copy */
842          if (decode != block->body->data)
843          {
844            block->original_signed_body = block->body;
845            block->body = buffer_create (decode);
846            free (decode);
847          }
848        }
849      }
850    }
851#ifdef VERBOSE
852    LOGDEBUG ("Getting next message component");
853#endif
854    node_nt = c_nt_next (CTX->message->components, &c_nt);
855    i++;
856  } /* while (node_nt != NULL) */
857
858  if (header->data == NULL)
859    buffer_cat (header, " ");
860
861  if (body->data == NULL)
862    buffer_cat (body, " ");
863
864  return 0;
865}
866
867int _ds_url_tokenize(ds_diction_t diction, char *body, const char *key)
868{
869  char *token, *url_ptr, *url_token, *ptr;
870  char combined_token[256];
871  unsigned long long crc;
872  int key_len = strlen(key);
873
874#ifdef VERBOSE
875  LOGDEBUG("scanning for urls: %s\n", key);
876#endif
877  if (!body)
878    return EINVAL;
879  url_ptr = body;
880
881  token = strcasestr(url_ptr, key);
882  while (token != NULL)
883  {
884    int i = 0, old;
885
886    while(token[i]
887       && token[i] > 32 
888       && token[i] != '>'
889       && ((token[i] != '\"' && token[i] != '\'') || i <= key_len))
890      i++;
891    old = token[i];
892    token[i] = 0; /* parse in place */
893
894    /* Tokenize URL */
895    url_token = strtok_r (token, DELIMITERS, &ptr);
896    while (url_token != NULL)
897    {
898      snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token);
899      crc = _ds_getcrc64 (combined_token);
900      ds_diction_touch(diction, crc, combined_token, 0);
901      url_token = strtok_r (NULL, DELIMITERS, &ptr);
902    }
903    memset (token, 32, i);
904    token[i] = old;
905    url_ptr = token + i;
906    token = strcasestr(url_ptr, key);
907  }
908  return 0;
909}
910
911/* Truncate tokens with EOT delimiters */
912char * _ds_truncate_token(const char *token) {
913  char *tweaked;
914  int i;
915
916  if (token == NULL)
917    return NULL;
918
919  tweaked = strdup(token);
920
921  if (tweaked == NULL)
922    return NULL;
923
924  i = strlen(tweaked);
925  while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) {
926    tweaked[i-1] = 0;
927    i--;
928  }
929
930  return tweaked;
931}
932
933/*
934 *  _ds_spbh_clear
935 *
936 * DESCRIPTION
937 *   Clears the SBPH stack
938 *   
939 *   Clears and frees all of the tokens in the SBPH stack. Used when a
940 *   boundary has been crossed (such as a new message header) where
941 *   tokens from the previous boundary are no longer useful.
942 */
943
944void _ds_sparse_clear(char **previous_tokens) {
945  int i;
946  for(i=0;i<SPARSE_WINDOW_SIZE;i++)
947    previous_tokens[i] = NULL;
948  return;
949}
950
951/*
952 * _ds_generate_bitpattern
953 *
954 * DESCRIPTION
955 *   Generates a sparse bitpattern for SPARSE_WINDOW_SIZE
956 *
957 *   This pattern is then used to create token patterns when using SBPH or OSB
958 *
959 */
960
961char *_ds_generate_bitpattern(int breadth) {
962  char *bitpattern;
963  u_int32_t mask;
964  unsigned long exp;
965  int i;
966
967  bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth);
968
969  for(mask=0;mask<(u_int32_t)breadth;mask++) {
970      for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
971          exp = (i) ? _ds_pow2(i) : 1;
972          /* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */
973          if (mask & exp)
974          {
975              bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1;
976          }
977          else
978          {
979              bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0;
980          }
981      }
982  }
983
984  return bitpattern;
985}
986
Note: See TracBrowser for help on using the repository browser.