[c5c522c] | 1 | /* $Id: tokenizer.h,v 1.10 2011/06/28 00:13:48 sbajic Exp $ */ |
---|
| 2 | |
---|
| 3 | /* |
---|
| 4 | DSPAM |
---|
| 5 | COPYRIGHT (C) 2002-2012 DSPAM PROJECT |
---|
| 6 | |
---|
| 7 | This program is free software: you can redistribute it and/or modify |
---|
| 8 | it under the terms of the GNU Affero General Public License as |
---|
| 9 | published by the Free Software Foundation, either version 3 of the |
---|
| 10 | License, or (at your option) any later version. |
---|
| 11 | |
---|
| 12 | This program is distributed in the hope that it will be useful, |
---|
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 15 | GNU Affero General Public License for more details. |
---|
| 16 | |
---|
| 17 | You should have received a copy of the GNU Affero General Public License |
---|
| 18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 19 | |
---|
| 20 | */ |
---|
| 21 | /* Include guard for this header. NOTE(review): identifiers that begin with an underscore followed by an uppercase letter are reserved for the implementation in C; consider a name such as DSPAM_TOKENIZER_H after checking that nothing else tests _TOKENIZER_H. */ |
---|
| 22 | #ifndef _TOKENIZER_H |
---|
| 23 | # define _TOKENIZER_H |
---|
| 24 | |
---|
| 25 | #include "diction.h" |
---|
| 26 | #include "nodetree.h" |
---|
| 27 | #include "error.h" |
---|
| 28 | #include "storage_driver.h" |
---|
| 29 | #include "decode.h" |
---|
| 30 | |
---|
| 31 | #define SPARSE_WINDOW_SIZE 5 /* NOTE(review): presumably the number of tokens kept in the window used by the sparse (_ds_map_*) routines, i.e. the length of their previous_tokens arrays -- confirm in tokenizer.c */ |
---|
| 32 | /* _ds_tokenize: tokenizer taking a DSPAM context, a message's header text, its body text and a target diction. NOTE(review): presumably the top-level entry point that dispatches to the sparse or ngram variant declared below and fills diction; the meaning of the int return value is not visible in this header -- confirm in tokenizer.c. */ |
---|
| 33 | int _ds_tokenize( |
---|
| 34 | DSPAM_CTX * CTX, |
---|
| 35 | char *headers, |
---|
| 36 | char *body, |
---|
| 37 | ds_diction_t diction); |
---|
| 38 | /* _ds_tokenize_sparse: takes the same parameters as _ds_tokenize. NOTE(review): presumably the sparse-token variant, built on the _ds_map_* routines declared later in this header; return-value convention is not visible here -- confirm in tokenizer.c. */ |
---|
| 39 | int _ds_tokenize_sparse( |
---|
| 40 | DSPAM_CTX * CTX, |
---|
| 41 | char *headers, |
---|
| 42 | char *body, |
---|
| 43 | ds_diction_t diction); |
---|
| 44 | /* _ds_tokenize_ngram: takes the same parameters as _ds_tokenize. NOTE(review): presumably the ngram variant, built on the _ds_process_* routines declared later in this header; return-value convention is not visible here -- confirm in tokenizer.c. */ |
---|
| 45 | int _ds_tokenize_ngram( |
---|
| 46 | DSPAM_CTX * CTX, |
---|
| 47 | char *headers, |
---|
| 48 | char *body, |
---|
| 49 | ds_diction_t diction); |
---|
| 50 | |
---|
| 51 | /* _ds_process: ngram token generation routines */ |
---|
| 52 | /* _ds_process_header_token: ngram token generation routine for a header token. NOTE(review): presumably joined_token is the current token, previous_token the token seen just before it, and heading the name of the header being processed; joined_token is non-const so it may be modified -- confirm in tokenizer.c. */ |
---|
| 53 | int _ds_process_header_token( |
---|
| 54 | DSPAM_CTX * CTX, |
---|
| 55 | char *joined_token, |
---|
| 56 | const char *previous_token, |
---|
| 57 | ds_diction_t diction, |
---|
| 58 | const char *heading); |
---|
| 59 | /* _ds_process_body_token: ngram token generation routine for a body token; same parameters as _ds_process_header_token minus heading. NOTE(review): presumably joined_token is the current token and previous_token the one before it -- confirm in tokenizer.c. */ |
---|
| 60 | int _ds_process_body_token( |
---|
| 61 | DSPAM_CTX * CTX, |
---|
| 62 | char *joined_token, |
---|
| 63 | const char *previous_token, |
---|
| 64 | ds_diction_t diction); |
---|
| 65 | |
---|
| 66 | /* _ds_map: sparse token generation routines */ |
---|
| 67 | /* _ds_map_header_token: sparse token generation routine for a header token. NOTE(review): previous_tokens is presumably a window of SPARSE_WINDOW_SIZE earlier tokens that this call updates, heading the header name, and bitpattern the table returned by _ds_generate_bitpattern -- confirm in tokenizer.c. */ |
---|
| 68 | int _ds_map_header_token( |
---|
| 69 | DSPAM_CTX * CTX, |
---|
| 70 | char *token, |
---|
| 71 | char **previous_tokens, |
---|
| 72 | ds_diction_t diction, |
---|
| 73 | const char *heading, |
---|
| 74 | const char *bitpattern); |
---|
| 75 | /* _ds_map_body_token: sparse token generation routine for a body token; same parameters as _ds_map_header_token minus heading. NOTE(review): previous_tokens is presumably a window of SPARSE_WINDOW_SIZE earlier tokens and bitpattern the table returned by _ds_generate_bitpattern -- confirm in tokenizer.c. */ |
---|
| 76 | int _ds_map_body_token( |
---|
| 77 | DSPAM_CTX * CTX, |
---|
| 78 | char *token, |
---|
| 79 | char **previous_tokens, |
---|
| 80 | ds_diction_t diction, |
---|
| 81 | const char *bitpattern); |
---|
| 82 | /* _ds_degenerate_message: takes a DSPAM context and two buffer objects, header and body. NOTE(review): behaviour is not visible in this header; the name and non-const buffers suggest it writes a reduced or normalized form of the message into header and body ahead of tokenizing -- confirm in tokenizer.c, including the meaning of the int return value. */ |
---|
| 83 | int _ds_degenerate_message( |
---|
| 84 | DSPAM_CTX *CTX, |
---|
| 85 | buffer *header, |
---|
| 86 | buffer *body); |
---|
| 87 | /* _ds_url_tokenize: URL-related tokenizer over body with a target diction and a string key. NOTE(review): presumably key is the marker searched for in body (for example a URL scheme prefix) and matching URLs are added to diction; body is non-const so it may be modified in place -- confirm in tokenizer.c. */ |
---|
| 88 | int _ds_url_tokenize( |
---|
| 89 | ds_diction_t diction, |
---|
| 90 | char *body, |
---|
| 91 | const char *key); |
---|
| 92 | /* _ds_sparse_clear: returns nothing and takes the previous_tokens array used by the _ds_map_* routines. NOTE(review): presumably resets that window; whether the entries are freed or only set to NULL is not visible here -- confirm in tokenizer.c. */ |
---|
| 93 | void _ds_sparse_clear |
---|
| 94 | (char **previous_tokens); |
---|
| 95 | /* _ds_truncate_token: takes a read-only token and returns a char pointer. NOTE(review): presumably returns a shortened copy of token; ownership of the returned pointer (heap-allocated for the caller to free, or otherwise) and the NULL-on-failure convention are not visible here -- confirm in tokenizer.c. */ |
---|
| 96 | char * _ds_truncate_token |
---|
| 97 | (const char *token); |
---|
| 98 | /* _ds_generate_bitpattern: takes an int breadth and returns a char pointer. NOTE(review): presumably builds the bitpattern table consumed by the _ds_map_* routines, with breadth likely SPARSE_WINDOW_SIZE; ownership of the returned pointer and the NULL-on-failure convention are not visible here -- confirm in tokenizer.c. */ |
---|
| 99 | char *_ds_generate_bitpattern |
---|
| 100 | (int breadth); |
---|
| 101 | |
---|
| 102 | #endif /* _TOKENIZER_H */ |
---|