/* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */
/*
DSPAM
COPYRIGHT (C) 2002-2012 DSPAM PROJECT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* tokenizer.c - tokenizer functions
*
* DESCRIPTION
* The tokenizer subroutines are responsible for decomposing a message into
* its colloquial components. All components are stored collectively in
* a diction object, passed into the function.
*
*/
#ifdef HAVE_CONFIG_H
#include <auto-config.h>
#endif

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <ctype.h>
#include <errno.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include <sys/stat.h>
#include <math.h>

#ifdef TIME_WITH_SYS_TIME
#   include <sys/time.h>
#   include <time.h>
#else
#   ifdef HAVE_SYS_TIME_H
#       include <sys/time.h>
#   else
#       include <time.h>
#   endif
#endif

#include "config.h"
#include "tokenizer.h"
#include "util.h"
#include "libdspam.h"
#include "language.h"
/*
* _ds_tokenize() - tokenize the message
*
* DESCRIPTION
* tokenizes the supplied message
*
* INPUT ARGUMENTS
* DSPAM_CTX *CTX pointer to context
* char *header pointer to message header
* char *body pointer to message body
* ds_diction_t diction to store components
*
* RETURN VALUES
* standard errors on failure
* zero if successful
*
*/
/*
 * _ds_tokenize() - dispatch the message to the configured tokenizer
 *
 * Sparse tokenizers (SBPH, OSB) use the sliding-window implementation;
 * every other tokenizer setting falls through to the ngram/word
 * implementation.
 *
 * Returns EINVAL when no diction is supplied; otherwise the return value
 * of the selected tokenizer.
 */
int
_ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction)
{
  int use_sparse;

  if (diction == NULL)
    return EINVAL;

  use_sparse = (CTX->tokenizer == DSZ_SBPH || CTX->tokenizer == DSZ_OSB);

  return use_sparse
    ? _ds_tokenize_sparse (CTX, headers, body, diction)
    : _ds_tokenize_ngram (CTX, headers, body, diction);
}
/*
 * _ds_tokenize_ngram() - word / chained-token (ngram) tokenizer
 *
 * Splits the headers and body into individual word tokens and records
 * each in the diction.  When the tokenizer is DSZ_CHAIN, adjacent tokens
 * are additionally combined into bigrams by the _ds_process_*_token
 * helpers.  Both 'headers' and 'body' are tokenized IN PLACE (strtok_r
 * writes NUL bytes into them).
 *
 * Returns 0 on success, EUNKNOWN on allocation failure.
 */
int _ds_tokenize_ngram(
  DSPAM_CTX *CTX,
  char *headers,
  char *body,
  ds_diction_t diction)
{
  char *token;                  /* current token */
  char *previous_token = NULL;  /* used for bigrams (chained tokens) */
  char *line = NULL;            /* header broken up into lines */
  char *ptrptr;                 /* strtok_r state */
  char heading[128];            /* current heading */
  int l, tokenizer = CTX->tokenizer;
  struct nt *header = NULL;
  struct nt_node *node_nt;
  struct nt_c c_nt;

  /* Tokenize URLs in message (before the body is cut up by strtok_r) */
  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on")) {
    _ds_url_tokenize(diction, body, "http://");
    _ds_url_tokenize(diction, body, "www.");
    _ds_url_tokenize(diction, body, "href=");
  }

  /*
   * Header Tokenization
   */

  header = nt_create (NT_CHAR);
  if (header == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    return EUNKNOWN;
  }

  /* Break the raw header buffer into one nodetree entry per line */
  line = strtok_r (headers, "\n", &ptrptr);
  while (line) {
    nt_add (header, line);
    line = strtok_r (NULL, "\n", &ptrptr);
  }

  node_nt = c_nt_first (header, &c_nt);
  heading[0] = 0;
  while (node_nt) {
    int multiline;

#ifdef VERBOSE
    LOGDEBUG("processing line: %s", node_nt->ptr);
#endif

    line = node_nt->ptr;
    token = strtok_r (line, ":", &ptrptr);
    /* A line whose first ":"-delimited token starts with neither space
       (32) nor tab (9) and contains no space introduces a new heading;
       anything else is a folded continuation of the previous header. */
    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
    {
      multiline = 0;
      strlcpy (heading, token, 128);
      previous_token = NULL;  /* never chain tokens across a new heading */
    } else {
      multiline = 1;
    }

#ifdef VERBOSE
    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
#endif

    if (CTX->flags & DSF_WHITELIST) {
      /* Use the entire From: line for auto-whitelisting */
      if (!strcmp(heading, "From")) {
        char wl[256];
        char *fromline = line + 5;  /* skip past "From" + NUL left by strtok_r */
        unsigned long long whitelist_token;

        if (fromline[0] == 32)  /* skip one leading space, if present */
          fromline++;
        snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
        whitelist_token = _ds_getcrc64(wl);
        ds_diction_touch(diction, whitelist_token, wl, 0);
        diction->whitelist_token = whitelist_token;
      }
    }

    /* Received headers use a different set of delimiters to preserve things
       like ip addresses */
    /* New heading: continue tokenizing the remainder of the line (NULL
       resumes the strtok_r state).  Folded line: restart on the line. */
    token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr);
    while (token)
    {
      l = strlen(token);
      if (l >= 1 && l < 50)  /* skip empty and absurdly long tokens */
      {
#ifdef VERBOSE
        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
#endif
        /* Process "current" token; on success, remember it for chaining
           when the chained tokenizer is active */
        if (!_ds_process_header_token
            (CTX, token, previous_token, diction, heading) &&
            (tokenizer == DSZ_CHAIN))
        {
          previous_token = token;
        }
      }
      token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr);
    }

    previous_token = NULL;  /* reset chain at the end of each header line */
    node_nt = c_nt_next (header, &c_nt);
  }
  nt_destroy (header);

  /*
   * Body Tokenization
   */

#ifdef VERBOSE
  LOGDEBUG("parsing message body");
#endif

  token = strtok_r (body, DELIMITERS, &ptrptr);
  while (token != NULL)
  {
    l = strlen (token);
    if (l >= 1 && l < 50)
    {
#ifdef VERBOSE
      LOGDEBUG ("Processing body token '%s'", token);
#endif
      /* Process "current" token */
      if ( !_ds_process_body_token(CTX, token, previous_token, diction)
        && tokenizer == DSZ_CHAIN)
      {
        previous_token = token;
      }
    }
    token = strtok_r (NULL, DELIMITERS, &ptrptr);
  }

#ifdef VERBOSE
  LOGDEBUG("Finished tokenizing (ngram) message");
#endif

  /* Final token reassembly (anything left in the buffer) */

  return 0;
}
/*
 * _ds_tokenize_sparse() - SBPH/OSB (sparse) tokenizer
 *
 * Tokenizes headers and body through a sliding window of
 * SPARSE_WINDOW_SIZE tokens; _ds_map_header_token()/_ds_map_body_token()
 * expand the window into sparse token combinations using 'bitpattern'.
 * Both 'headers' and 'body' are tokenized IN PLACE.
 *
 * NOTE(review): this function arrived corrupted (several spans between
 * '<' and '>' were stripped, fusing source lines); it has been
 * reconstructed from the surviving text and the parallel ngram
 * tokenizer above.  Confirm against the upstream DSPAM source.
 *
 * Returns 0 on success, EUNKNOWN on allocation failure.
 */
int _ds_tokenize_sparse(
  DSPAM_CTX *CTX,
  char *headers,
  char *body,
  ds_diction_t diction)
{
  int i;
  char *token;                                /* current token */
  char *previous_tokens[SPARSE_WINDOW_SIZE];  /* sparse chain */
  char *line = NULL;                          /* header broken up into lines */
  char *ptrptr;                               /* strtok_r state */
  char *bitpattern;                           /* precomputed window bit patterns */
  char heading[128];                          /* current heading */
  int l;
  struct nt *header = NULL;
  struct nt_node *node_nt;
  struct nt_c c_nt;

  /* Start with an empty sliding window */
  for(i=0;i<SPARSE_WINDOW_SIZE;i++)
    previous_tokens[i] = NULL;

  /* Bit patterns selecting which window slots participate in each key */
  bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE));
  if (bitpattern == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    return EUNKNOWN;
  }

  /* Tokenize URLs in message */
  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))
  {
    _ds_url_tokenize(diction, body, "http://");
    _ds_url_tokenize(diction, body, "www.");
    _ds_url_tokenize(diction, body, "href=");
  }

  /*
   * Header Tokenization
   */

  header = nt_create (NT_CHAR);
  if (header == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    free(bitpattern);
    return EUNKNOWN;
  }

  /* Break the raw header buffer into one nodetree entry per line */
  line = strtok_r (headers, "\n", &ptrptr);
  while (line) {
    nt_add (header, line);
    line = strtok_r (NULL, "\n", &ptrptr);
  }

  node_nt = c_nt_first (header, &c_nt);
  heading[0] = 0;
  while (node_nt) {
    int multiline;

#ifdef VERBOSE
    LOGDEBUG("processing line: %s", node_nt->ptr);
#endif

    _ds_sparse_clear(previous_tokens);
    line = node_nt->ptr;
    token = strtok_r (line, ":", &ptrptr);
    /* A line whose first ":"-delimited token starts with neither space
       nor tab and contains no space introduces a new heading */
    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
    {
      multiline = 0;
      strlcpy (heading, token, 128);
      _ds_sparse_clear(previous_tokens);  /* never chain across headings */
    } else {
      multiline = 1;
    }

#ifdef VERBOSE
    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
#endif

    if (CTX->flags & DSF_WHITELIST) {
      /* Use the entire From: line for auto-whitelisting */
      if (!strcmp(heading, "From")) {
        char wl[256];
        char *fromline = line + 5;  /* skip past "From" + NUL left by strtok_r */
        unsigned long long whitelist_token;

        if (fromline[0] == 32)
          fromline++;
        snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
        whitelist_token = _ds_getcrc64(wl);
        ds_diction_touch(diction, whitelist_token, wl, 0);
        diction->whitelist_token = whitelist_token;
      }
    }

    /* Received headers use a different set of delimiters to preserve things
       like ip addresses */
    token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
    while (token)
    {
      l = strlen(token);
      if (l > 0 && l < 50)
      {
#ifdef VERBOSE
        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
#endif
        _ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern);
      }
      token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
    }

    /* Flush the window between header lines */
    _ds_sparse_clear(previous_tokens);
    node_nt = c_nt_next (header, &c_nt);
  }
  nt_destroy (header);

  /*
   * Body Tokenization
   */

#ifdef VERBOSE
  LOGDEBUG("parsing message body");
#endif

  token = strtok_r (body, SPARSE_DELIMITERS, &ptrptr);
  while (token != NULL)
  {
    l = strlen (token);
    if (l > 0 && l < 50)
    {
#ifdef VERBOSE
      LOGDEBUG ("Processing body token '%s'", token);
#endif
      /* Process "current" token */
      _ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern);
    }
    token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr);
  }

  /* Release any tokens still held in the window, then the bit patterns */
  _ds_sparse_clear(previous_tokens);
  free(bitpattern);

#ifdef VERBOSE
  LOGDEBUG("Finished tokenizing (sparse) message");
#endif

  return 0;
}

/*
 * _ds_process_header_token() - record one header token (ngram/chain)
 *
 * Registers "Heading*token" in the diction and, when the chained
 * tokenizer is active and a previous token exists, also registers the
 * bigram "Heading*previous+current".
 *
 * Returns 0 on success (including ignored headers), EUNKNOWN on
 * allocation failure.
 */
int
_ds_process_header_token (DSPAM_CTX * CTX, char *token,
                          const char *previous_token, ds_diction_t diction,
                          const char *heading)
{
  char combined_token[256];
  unsigned long long crc;
  char *tweaked_token;

  /* Skip headers the administrator asked to ignore */
  if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
    return 0;

  /* Never learn from DSPAM's own headers */
  if (!strncmp(heading, "X-DSPAM-", 8))
    return 0;

  /* This is where we used to ignore certain headings */

  /* (A pre-format of combined_token that was unconditionally overwritten
     below has been removed as dead code.) */

  tweaked_token = _ds_truncate_token(token);  /* heap copy, EOT-trimmed */
  if (tweaked_token == NULL)
    return EUNKNOWN;

  snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token);

  crc = _ds_getcrc64 (combined_token);
#ifdef VERBOSE
  LOGDEBUG ("Token Hit: '%s'", combined_token);
#endif
  ds_diction_touch(diction, crc, combined_token, 0);

  if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
  {
    char *tweaked_previous;

    tweaked_previous = _ds_truncate_token(previous_token);
    if (tweaked_previous == NULL) {
      free(tweaked_token);
      return EUNKNOWN;
    }

    /* Chained (bigram) form: "Heading*previous+current" */
    snprintf (combined_token, sizeof (combined_token),
              "%s*%s+%s", heading, tweaked_previous, tweaked_token);
    crc = _ds_getcrc64 (combined_token);

    ds_diction_touch(diction, crc, combined_token, DSD_CHAINED);
    free(tweaked_previous);
  }

  free(tweaked_token);
  return 0;
}
/*
 * _ds_process_body_token() - record one body token (ngram/chain)
 *
 * Registers the (EOT-trimmed) token in the diction with DSD_CONTEXT
 * and, when the chained tokenizer is active and a previous token
 * exists, also registers the bigram "previous+current".
 *
 * Returns 0 on success, EUNKNOWN on allocation failure.
 */
int
_ds_process_body_token (DSPAM_CTX * CTX, char *token,
                        const char *previous_token, ds_diction_t diction)
{
  char joined[256];
  unsigned long long crc;
  char *current, *prev;

  /* Heap copy of the token with trailing EOT delimiters stripped */
  current = _ds_truncate_token (token);
  if (current == NULL)
    return EUNKNOWN;

  crc = _ds_getcrc64 (current);
  ds_diction_touch (diction, crc, current, DSD_CONTEXT);

  if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
  {
    prev = _ds_truncate_token (previous_token);
    if (prev == NULL)
    {
      free (current);
      return EUNKNOWN;
    }

    /* Chained (bigram) form: "previous+current" */
    snprintf (joined, sizeof (joined), "%s+%s", prev, current);
    crc = _ds_getcrc64 (joined);
    ds_diction_touch (diction, crc, joined, DSD_CHAINED | DSD_CONTEXT);
    free (prev);
  }

  free (current);
  return 0;
}
/*
 * _ds_map_header_token() - push one header token through the sparse
 * (SBPH/OSB) sliding window and register the resulting sparse keys,
 * each prefixed with "Heading*", in the diction.
 *
 * NOTE(review): the middle of this function is corrupted in the file as
 * received — a span between "for(i=0;i" and "2 && ..." was stripped.
 * The lost span presumably contained the window shift, the active-token
 * count, and the mask loop that builds 'key' (and the declarations of
 * 'k' and 'hkey' used below).  Restore from upstream before compiling;
 * the code below is left byte-identical to the damaged original.
 */
int
_ds_map_header_token (DSPAM_CTX * CTX, char *token,
                      char **previous_tokens, ds_diction_t diction,
                      const char *heading, const char *bitpattern)
{
  int i, t, keylen, breadth;
  u_int32_t mask;
  unsigned long long crc;
  char key[256];
  int active = 0, top, tokenizer = CTX->tokenizer;

  /* Skip headers the administrator asked to ignore */
  if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
    return 0;

  /* Never learn from DSPAM's own headers */
  if (!strncmp(heading, "X-DSPAM-", 8))
    return 0;

  /* Shift all previous tokens up */
  /* NOTE(review): garbled line — see function header comment. */
  for(i=0;i2 && !strcmp((key+keylen)-2, "+#")) {
  /* Trim a trailing skip marker ("+#") off the generated key */
  key[keylen-2] = 0;
  keylen -=2;
  }
  /* Strip leading skip markers ("#+"); a key reduced to markers alone
     is not emitted (top cleared) */
  while(!strncmp(k, "#+", 2)) {
  top = 0;
  k+=2;
  keylen -= 2;
  }
  if (top) {
  /* Register "Heading*sparse-key" */
  snprintf(hkey, sizeof(hkey), "%s*%s", heading, k);
  crc = _ds_getcrc64(hkey);
  ds_diction_touch(diction, crc, hkey, DSD_CONTEXT);
  }
  }
  }
  return 0;
}
/*
 * _ds_map_body_token() - push one body token through the sparse
 * (SBPH/OSB) sliding window and register the resulting sparse keys in
 * the diction (no heading prefix, unlike the header variant above).
 *
 * NOTE(review): like _ds_map_header_token(), the middle of this
 * function was stripped in the file as received ("for(i=0;i" fused with
 * "2 && ...").  The lost span presumably contained the window shift and
 * the mask loop building 'key' (and the declaration of 'k').  Restore
 * from upstream before compiling; the code below is left byte-identical
 * to the damaged original.
 */
int
_ds_map_body_token (
  DSPAM_CTX * CTX,
  char *token,
  char **previous_tokens,
  ds_diction_t diction,
  const char *bitpattern)
{
  int i, t, keylen, breadth;
  int top, tokenizer = CTX->tokenizer;
  unsigned long long crc;
  char key[256];
  int active = 0;
  u_int32_t mask;

  /* Shift all previous tokens up */
  /* NOTE(review): garbled line — see function header comment. */
  for(i=0;i2 && !strcmp((key+keylen)-2, "+#")) {
  /* Trim a trailing skip marker ("+#") off the generated key */
  key[keylen-2] = 0;
  keylen -=2;
  }
  /* Strip leading skip markers ("#+"); marker-only keys not emitted */
  while(!strncmp(k, "#+", 2)) {
  top = 0;
  k+=2;
  keylen -=2;
  }
  if (top) {
  crc = _ds_getcrc64(k);
  ds_diction_touch(diction, crc, k, DSD_CONTEXT);
  }
  }
  }
  return 0;
}
/*
* _ds_degenerate_message()
*
* DESCRIPTION
* Degenerate the message into headers, body and tokenizable pieces
*
* This function is responsible for analyzing the actualized message and
* degenerating it into only the components which are tokenizable. This
* process effectively eliminates much HTML noise, special symbols, or
* other non-tokenizable/non-desirable components. What is left is the
* bulk of the message and only desired tags, URLs, and other data.
*
* INPUT ARGUMENTS
* header pointer to buffer containing headers
* body pointer to buffer containing message body
*/
/*
 * Accumulates all component headers into 'header' and all tokenizable,
 * decoded component bodies into 'body'.  See the block comment above
 * for the full contract.  Returns 0 on success, EUNKNOWN when
 * CTX->message is NULL.
 */
int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body)
{
  char *decode = NULL;
  struct nt_node *node_nt, *node_header;
  struct nt_c c_nt, c_nt2;
  int i = 0;              /* component index; 0 is the outermost part */
  char heading[1024];

  if (! CTX->message)
  {
    LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL");
    return EUNKNOWN;
  }

  /* Iterate through each component and create large header/body buffers */
  node_nt = c_nt_first (CTX->message->components, &c_nt);
  while (node_nt != NULL)
  {
    struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr;

#ifdef VERBOSE
    LOGDEBUG ("Processing component %d", i);
#endif

    if (! block->headers || ! block->headers->items)
    {
      /* A component without headers marks an end-of-message boundary;
         nothing to accumulate for it */
#ifdef VERBOSE
      LOGDEBUG (" : End of Message Identifier");
#endif
    }
    else
    {
      struct _ds_header_field *current_header;

      /* Accumulate the headers */
      node_header = c_nt_first (block->headers, &c_nt2);
      while (node_header != NULL)
      {
        current_header = (struct _ds_header_field *) node_header->ptr;
        snprintf (heading, sizeof (heading),
                  "%s: %s\n", current_header->heading,
                  current_header->data);
        buffer_cat (header, heading);
        node_header = c_nt_next (block->headers, &c_nt2);
      }

      decode = block->body->data;
      /* Only text-like parts — and the outermost multipart wrapper
         (i == 0) — are considered tokenizable */
      if (block->media_type == MT_TEXT ||
          block->media_type == MT_MESSAGE ||
          block->media_type == MT_UNKNOWN ||
          (block->media_type == MT_MULTIPART && !i))
      {

        /* Accumulate the bodies, skip attachments */
        if (
          ( block->encoding == EN_BASE64
         || block->encoding == EN_QUOTED_PRINTABLE)
         && ! block->original_signed_body)
        {
          if (block->content_disposition != PCD_ATTACHMENT)
          {
            LOGDEBUG ("decoding message block from encoding type %d",
                      block->encoding);
            /* _ds_decode_block() allocates; ownership handled below */
            decode = _ds_decode_block (block);
          }
        }

        /* We found a tokenizable body component, add prefilters */
        if (decode)
        {
          char *decode2 = NULL;
          char *decode3 = NULL;

          /* -- PREFILTERS BEGIN -- */

          /* Hexadecimal 8-Bit Encodings */
          if (block->encoding == EN_8BIT) {
            decode2 = _ds_decode_hex8bit(decode);
          } else {
            decode2 = strdup(decode);
          }

          /* HTML-Specific Filters */
          if (decode2) {
            if (block->media_subtype == MST_HTML) {
              decode3 = _ds_strip_html(decode2);
            } else {
              decode3 = strdup(decode2);
            }
            free(decode2);
          }

          /* -- PREFILTERS END -- */

          if (decode3) {
            buffer_cat (body, decode3);
            free(decode3);
          }

          /* If we've decoded the body, save the original copy */
          if (decode != block->body->data)
          {
            block->original_signed_body = block->body;
            block->body = buffer_create (decode);
            free (decode);
          }
        }
      }
    }
#ifdef VERBOSE
    LOGDEBUG ("Getting next message component");
#endif
    node_nt = c_nt_next (CTX->message->components, &c_nt);
    i++;
  } /* while (node_nt != NULL) */

  /* Guarantee non-NULL buffers for the tokenizers downstream */
  if (header->data == NULL)
    buffer_cat (header, " ");

  if (body->data == NULL)
    buffer_cat (body, " ");

  return 0;
}
/*
 * _ds_url_tokenize() - extract "Url*" tokens from the body
 *
 * Scans 'body' (case-insensitively) for occurrences of 'key'
 * ("http://", "www.", "href="), tokenizes each matched URL span, and
 * registers every piece as "Url*piece".  Parsing is done IN PLACE:
 * each consumed span is blanked with spaces so the body tokenizer does
 * not see it again.  Returns EINVAL when body is NULL, else 0.
 */
int _ds_url_tokenize(ds_diction_t diction, char *body, const char *key)
{
  char labeled[256];
  char *hit, *scan, *piece, *state;
  unsigned long long crc;
  int key_len = strlen(key);

#ifdef VERBOSE
  LOGDEBUG("scanning for urls: %s\n", key);
#endif

  if (!body)
    return EINVAL;

  scan = body;
  hit = strcasestr(scan, key);
  while (hit != NULL)
  {
    int n = 0, saved;

    /* Advance to the end of the URL: stop at control/space characters,
       '>' or a quote — quotes only count once past the key itself */
    while (hit[n]
        && hit[n] > 32
        && hit[n] != '>'
        && ((hit[n] != '\"' && hit[n] != '\'') || n <= key_len))
      n++;

    saved = hit[n];
    hit[n] = 0; /* parse in place */

    /* Tokenize URL */
    piece = strtok_r (hit, DELIMITERS, &state);
    while (piece != NULL)
    {
      snprintf (labeled, sizeof (labeled), "Url*%s", piece);
      crc = _ds_getcrc64 (labeled);
      ds_diction_touch(diction, crc, labeled, 0);
      piece = strtok_r (NULL, DELIMITERS, &state);
    }

    /* Blank the consumed span, restore the terminator, move past it */
    memset (hit, 32, n);
    hit[n] = saved;
    scan = hit + n;
    hit = strcasestr(scan, key);
  }
  return 0;
}
/*
 * _ds_truncate_token() - copy a token, trimming trailing EOT delimiters
 *
 * Returns a heap-allocated copy of 'token' with trailing
 * DELIMITERS_EOT characters removed (caller frees), or NULL when
 * 'token' is NULL or allocation fails.
 */
char * _ds_truncate_token(const char *token) {
  char *copy;
  int len;

  if (token == NULL)
    return NULL;

  copy = strdup(token);
  if (copy == NULL)
    return NULL;

  /* While the second-to-last character is an EOT delimiter, chop the
     final character off the copy */
  len = strlen(copy);
  while (len > 1 && strspn(copy + len - 2, DELIMITERS_EOT) != 0) {
    copy[len - 1] = 0;
    len--;
  }

  return copy;
}
/*
 * _ds_sparse_clear
 *
 * DESCRIPTION
 * Clears the SBPH stack
 *
 * Clears and frees all of the tokens in the SBPH stack. Used when a
 * boundary has been crossed (such as a new message header) where
 * tokens from the previous boundary are no longer useful.
 */
void _ds_sparse_clear(char **previous_tokens) {
int i;
for(i=0;i