wasBayesSharp – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | using System; |
2 | using System.Collections.Generic; |
||
3 | using System.Linq; |
||
4 | using System.Text.RegularExpressions; |
||
5 | |||
6 | namespace BayesSharp.Tokenizers |
||
7 | { |
||
/// <summary>
/// A simple regex-based word tokenizer: every maximal run of word characters
/// (<c>\w+</c>) found in the input becomes one token, so whitespace and
/// punctuation both act as separators.
/// </summary>
public class SimpleTextTokenizer : ITokenizer<string>
{
    // The pattern is constant and Regex matching is thread safe, so a single
    // shared instance is enough for every tokenizer.
    private static readonly Regex WordRe = new Regex(@"\w+");

    private readonly bool _convertToLower;
    private readonly List<string> _ignoreList;

    /// <summary>
    /// Creates a tokenizer that converts tokens to lower case and ignores no tokens.
    /// </summary>
    public SimpleTextTokenizer() : this(true, null)
    {
    }

    /// <summary>
    /// Creates a tokenizer with explicit case handling and an optional ignore list.
    /// </summary>
    /// <param name="convertToLower">Tokens must be converted to lower case</param>
    /// <param name="ignoreList">
    /// Tokens that will be ignored; may be <c>null</c> to disable filtering.
    /// Entries are compared exactly (case-sensitively) against the tokens after
    /// any lower-casing, so when <paramref name="convertToLower"/> is true the
    /// entries should be lower case too. The list is held by reference, not copied.
    /// </param>
    public SimpleTextTokenizer(bool convertToLower, List<string> ignoreList)
    {
        _ignoreList = ignoreList;
        _convertToLower = convertToLower;
    }

    /// <summary>
    /// Breaks a string into word tokens, dropping any token present in the ignore list.
    /// </summary>
    /// <param name="input">String to be broken</param>
    /// <returns>
    /// A lazily evaluated sequence of tokens in order of appearance; the input
    /// is validated immediately, but matching happens during enumeration.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    /// <exception cref="FormatException"><paramref name="input"/> is not a string.</exception>
    public IEnumerable<string> Tokenize(object input)
    {
        // Reject null explicitly; otherwise the type check below would fail
        // with an uninformative NullReferenceException.
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        // string is sealed, so a failed 'as' cast means exactly "not a string".
        var text = input as string;
        if (text == null)
        {
            throw new FormatException(string.Format("Expected string, given {0}", input.GetType()));
        }

        var tokens = MatchTokens(text);
        if (_ignoreList == null)
        {
            return tokens;
        }
        return tokens.Where(token => !_ignoreList.Contains(token));
    }

    /// <summary>
    /// Lazily yields every <c>\w+</c> match in <paramref name="text"/>,
    /// lower-cased when the tokenizer was configured to do so.
    /// </summary>
    private IEnumerable<string> MatchTokens(string text)
    {
        foreach (Match match in WordRe.Matches(text))
        {
            // Invariant lower-casing keeps tokens identical regardless of the
            // current thread culture (e.g. Turkish dotted/dotless 'i').
            yield return _convertToLower ? match.Value.ToLowerInvariant() : match.Value;
        }
    }
}
||
59 | } |