wasBayesSharp – Blame information for rev 1

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Text.RegularExpressions;
5  
6 namespace BayesSharp.Tokenizers
7 {
/// <summary>
/// A simple regex-based tokenizer. Every maximal run of word characters
/// (the regex <c>\w+</c>) in the input becomes one token, so both whitespace
/// and punctuation act as separators.
/// </summary>
public class SimpleTextTokenizer : ITokenizer<string>
{
    // The pattern never varies between instances, so a single shared Regex
    // is built once instead of once per tokenizer.
    private static readonly Regex WordRe = new Regex(@"\w+");
    private readonly bool _convertToLower;
    private readonly List<string> _ignoreList;

    /// <summary>
    /// Creates a tokenizer that lower-cases tokens and has no ignore list.
    /// </summary>
    public SimpleTextTokenizer() : this(true, null)
    {
    }

    /// <summary>
    /// Creates a tokenizer with explicit case handling and an optional ignore list.
    /// </summary>
    /// <param name="convertToLower">Tokens must be converted to lower case</param>
    /// <param name="ignoreList">
    /// Tokens that will be ignored, or <c>null</c> to keep every token.
    /// Entries are compared exactly (case-sensitive) against the tokens after
    /// any lower-casing, so when <paramref name="convertToLower"/> is true the
    /// entries need to be lower case to match.
    /// </param>
    public SimpleTextTokenizer(bool convertToLower, List<string> ignoreList)
    {
        _ignoreList = ignoreList;
        _convertToLower = convertToLower;
    }

    /// <summary>
    /// Breaks a string into tokens, dropping any token found in the ignore list.
    /// The arguments are validated immediately; the returned sequence itself is
    /// evaluated lazily as it is enumerated.
    /// </summary>
    /// <param name="input">String to be broken</param>
    /// <returns>The tokens in the order they occur in <paramref name="input"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <exception cref="FormatException"><paramref name="input"/> is not a string.</exception>
    public IEnumerable<string> Tokenize(object input)
    {
        // Fail with a descriptive argument error rather than the accidental
        // NullReferenceException that calling input.GetType() on null produces.
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        // string is sealed, so a failed 'as' conversion is equivalent to the
        // runtime type not being exactly System.String.
        var text = input as string;
        if (text == null)
        {
            throw new FormatException(string.Format("Expected string, given {0}", input.GetType()));
        }

        var tokens = MatchTokens(text);
        if (_ignoreList == null)
        {
            return tokens;
        }
        return tokens.Where(token => !_ignoreList.Contains(token));
    }

    /// <summary>
    /// Lazily yields each regex match of <paramref name="text"/>, lower-cased
    /// when the tokenizer was configured to do so.
    /// </summary>
    private IEnumerable<string> MatchTokens(string text)
    {
        foreach (Match match in WordRe.Matches(text))
        {
            // Invariant-culture casing keeps tokens identical regardless of the
            // thread's current culture (e.g. "I" must not become a dotless i
            // under tr-TR), so the same text always produces the same tokens.
            yield return _convertToLower ? match.Value.ToLowerInvariant() : match.Value;
        }
    }
}
59 }