source: npl/mailserver/dspam/dspam-3.10.2/src/tokenizer.h @ c5c522c

gcc484, ntop, perl-5.22
Last change on this file since c5c522c was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 8 years ago

initial commit, transferred from cleaned syn3 svn tree

  • Property mode set to 100644
File size: 2.2 KB
Line 
1/* $Id: tokenizer.h,v 1.10 2011/06/28 00:13:48 sbajic Exp $ */
2
3/*
4 DSPAM
5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as
9 published by the Free Software Foundation, either version 3 of the
10 License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
20*/
21
22#ifndef _TOKENIZER_H
23#  define _TOKENIZER_H
24
25#include "diction.h"
26#include "nodetree.h"
27#include "error.h"
28#include "storage_driver.h"
29#include "decode.h"
30
/*
 * SPARSE_WINDOW_SIZE: compile-time constant (5) for the sparse tokenizer.
 * NOTE(review): presumably the number of tokens kept in the sliding window
 * (cf. the previous_tokens arrays taken by the _ds_map_* routines) -- confirm
 * against tokenizer.c.
 */
31#define SPARSE_WINDOW_SIZE       5
32
/*
 * _ds_tokenize: tokenize a message given as separate header and body text.
 *
 *   CTX      DSPAM context
 *   headers  header text (passed as non-const char *)
 *   body     body text (passed as non-const char *)
 *   diction  diction handle supplied by the caller
 *
 * Returns an int status.
 * NOTE(review): presumably this is the generic entry point that selects
 * between _ds_tokenize_sparse() and _ds_tokenize_ngram() and stores the
 * resulting tokens in diction; the return-value convention and whether the
 * input buffers are modified are not visible from this header -- confirm in
 * tokenizer.c.
 */
33int _ds_tokenize(
34  DSPAM_CTX * CTX,
35   char *headers,
36   char *body,
37   ds_diction_t diction);
38
/*
 * _ds_tokenize_sparse: sparse-token variant of the tokenizer; takes the same
 * arguments as _ds_tokenize().
 *
 *   CTX      DSPAM context
 *   headers  header text (non-const char *)
 *   body     body text (non-const char *)
 *   diction  diction handle supplied by the caller
 *
 * Returns an int status.
 * NOTE(review): return-value convention and buffer mutation are not visible
 * here -- confirm in tokenizer.c.
 */
39int _ds_tokenize_sparse(
40  DSPAM_CTX * CTX,
41  char *headers,
42  char *body,
43  ds_diction_t diction);
44
/*
 * _ds_tokenize_ngram: n-gram variant of the tokenizer; takes the same
 * arguments as _ds_tokenize().
 *
 *   CTX      DSPAM context
 *   headers  header text (non-const char *)
 *   body     body text (non-const char *)
 *   diction  diction handle supplied by the caller
 *
 * Returns an int status.
 * NOTE(review): return-value convention and buffer mutation are not visible
 * here -- confirm in tokenizer.c.
 */
45int _ds_tokenize_ngram(
46  DSPAM_CTX * CTX,
47  char *headers,
48  char *body,
49  ds_diction_t diction);
50
51/* _ds_process: ngram token generation routines */
52
/*
 * _ds_process_header_token: n-gram token generation for a header token.
 *
 *   CTX             DSPAM context
 *   joined_token    token to process (non-const char *)
 *   previous_token  preceding token (read-only)
 *   diction         diction handle supplied by the caller
 *   heading         name of the header the token belongs to (read-only)
 *
 * Returns an int status.
 * NOTE(review): NULL handling for previous_token and the return-value
 * convention are not visible here -- confirm in tokenizer.c.
 */
53int _ds_process_header_token(
54  DSPAM_CTX * CTX,
55  char *joined_token,
56  const char *previous_token,
57  ds_diction_t diction,
58  const char *heading);
59
/*
 * _ds_process_body_token: n-gram token generation for a body token; same as
 * _ds_process_header_token() but without a heading argument.
 *
 *   CTX             DSPAM context
 *   joined_token    token to process (non-const char *)
 *   previous_token  preceding token (read-only)
 *   diction         diction handle supplied by the caller
 *
 * Returns an int status.
 * NOTE(review): NULL handling for previous_token and the return-value
 * convention are not visible here -- confirm in tokenizer.c.
 */
60int _ds_process_body_token(
61  DSPAM_CTX * CTX,
62  char *joined_token,
63  const char *previous_token,
64  ds_diction_t diction);
65
66/* _ds_map: sparse token generation routines */
67
/*
 * _ds_map_header_token: sparse token generation for a header token.
 *
 *   CTX              DSPAM context
 *   token            current token (non-const char *)
 *   previous_tokens  array of token pointers maintained across calls
 *   diction          diction handle supplied by the caller
 *   heading          name of the header the token belongs to (read-only)
 *   bitpattern       pattern string (read-only)
 *
 * Returns an int status.
 * NOTE(review): previous_tokens presumably holds SPARSE_WINDOW_SIZE entries
 * and bitpattern presumably comes from _ds_generate_bitpattern(); neither is
 * established by this header -- confirm in tokenizer.c.
 */
68int _ds_map_header_token(
69  DSPAM_CTX * CTX,
70  char *token,
71  char **previous_tokens,
72  ds_diction_t diction,
73  const char *heading,
74  const char *bitpattern);
75
/*
 * _ds_map_body_token: sparse token generation for a body token; same as
 * _ds_map_header_token() but without a heading argument.
 *
 *   CTX              DSPAM context
 *   token            current token (non-const char *)
 *   previous_tokens  array of token pointers maintained across calls
 *   diction          diction handle supplied by the caller
 *   bitpattern       pattern string (read-only)
 *
 * Returns an int status.
 * NOTE(review): previous_tokens presumably holds SPARSE_WINDOW_SIZE entries
 * and bitpattern presumably comes from _ds_generate_bitpattern(); neither is
 * established by this header -- confirm in tokenizer.c.
 */
76int _ds_map_body_token(
77  DSPAM_CTX * CTX,
78  char *token,
79  char **previous_tokens,
80  ds_diction_t diction,
81  const char *bitpattern);
82
/*
 * _ds_degenerate_message: takes the DSPAM context plus two buffer objects,
 * one for the header and one for the body.
 *
 * Returns an int status.
 * NOTE(review): presumably reduces the message held by CTX to plain header
 * and body text and writes it into the two buffers (i.e. they are outputs);
 * direction of data flow and return-value convention are not visible here --
 * confirm in tokenizer.c.
 */
83int _ds_degenerate_message(
84  DSPAM_CTX *CTX,
85  buffer *header,
86  buffer *body);
87
/*
 * _ds_url_tokenize: URL-specific tokenizing pass over the body text.
 *
 *   diction  diction handle supplied by the caller
 *   body     body text (non-const char *)
 *   key      read-only string
 *
 * Unlike the other tokenizer routines this one takes no DSPAM_CTX.
 * Returns an int status.
 * NOTE(review): key is presumably the URL prefix/scheme to search for in
 * body; whether body is modified is not visible here -- confirm in
 * tokenizer.c.
 */
88int _ds_url_tokenize(
89  ds_diction_t diction,
90  char *body,
91  const char *key);
92
/*
 * _ds_sparse_clear: operates on the previous_tokens array used by the sparse
 * (_ds_map_*) routines; returns nothing.
 * NOTE(review): presumably resets every slot of the window; whether entries
 * are freed or merely set to NULL is not visible here -- confirm in
 * tokenizer.c.
 */
93void _ds_sparse_clear
94  (char **previous_tokens);
95
/*
 * _ds_truncate_token: takes a read-only token and returns a char pointer.
 * NOTE(review): the non-const return alongside a const input suggests a
 * newly allocated string owned by the caller, but ownership, the truncation
 * rule and the failure value (NULL?) are not visible here -- confirm in
 * tokenizer.c.
 */
96char * _ds_truncate_token
97  (const char *token);
98
/*
 * _ds_generate_bitpattern: takes an integer breadth and returns a char
 * pointer.
 * NOTE(review): the result is presumably the bitpattern argument expected by
 * _ds_map_header_token()/_ds_map_body_token(), with breadth presumably being
 * the window size; ownership of the returned memory and the failure value
 * are not visible here -- confirm in tokenizer.c.
 */
99char *_ds_generate_bitpattern
100  (int breadth);
101
102#endif
Note: See TracBrowser for help on using the repository browser.