由于还没有实现一套完整的字典机制,目前使用的只是普通的文本字典,所以暂不提供完整源码下载,只贴出核心源码。从完整度上来说,只能算是0.6版。
另外,本分词系统使用的词库是ShootAnalyzer的词库。
使用方法:
参考以下代码
/// <summary>
/// Usage sample: segments one sentence with <c>Participle</c> directly,
/// then runs the same sentence through <c>YurowAnalyzer</c> and prints
/// every token together with its start and end offsets.
/// </summary>
/// <remarks>
/// Nothing is asserted; all results are written to the console only.
/// The dictionary path is a placeholder and must point at a real
/// dictionary location on the machine running the sample.
/// </remarks>
[TestMethod]
public void TestMethod1()
{
    const string dictPath = @"D:\labs\xxxx";
    string txt = @"天下真的有神吗?我不是呀";

    Participle p = new Participle();
    p.Init(dictPath);

    // Time the string-returning segmentation.
    Stopwatch st = Stopwatch.StartNew();
    string outstr = p.TextSpliter(txt);
    st.Stop();

    // Time the list-returning segmentation; only the elapsed time is of
    // interest here, so the returned list is discarded.
    Stopwatch st2 = Stopwatch.StartNew();
    p.TextArray(txt);
    st2.Stop();

    Console.WriteLine(outstr);
    // ElapsedMilliseconds is a whole number, so "f2" would always print
    // ".00"; TotalMilliseconds keeps the fractional part.
    Console.WriteLine(st.Elapsed.TotalMilliseconds.ToString("f2"));
    Console.WriteLine(st2.Elapsed.TotalMilliseconds.ToString("f2"));

    YurowAnalyzer.YurowAnalyzer y = new YurowAnalyzer.YurowAnalyzer(dictPath);
    TokenStream t = y.TokenStream(null, new StringReader(txt));
    try
    {
        for (Token token = t.Next(); token != null; token = t.Next())
        {
            Console.WriteLine(token.TermText() + "\t" + token.StartOffset() + "\t" + token.EndOffset());
        }
    }
    finally
    {
        // Release the stream even when printing a token throws.
        t.Close();
    }
}
在Lucene.Net 索引或者搜索中直接使用YurowAnalyzer.YurowAnalyzer 分析器。
下载地址:
http://files.cnblogs.com/birdshover/YurowAnalyzer.rar
下面贴上些关键源码:
Participle类(分词类)
/// <summary>
/// Start offsets of the tokens produced by the most recent
/// <see cref="TextArray"/> call: entry <c>k</c> is the index in the input
/// text at which token <c>k</c> of the returned list begins.
/// A new list is assigned on every call.
/// </summary>
public List<int> StartArr;

/// <summary>
/// Splits <paramref name="text"/> into tokens and records each token's
/// start offset in <see cref="StartArr"/>.
/// </summary>
/// <remarks>
/// Rules, applied at each position in this order:
/// 1. A run of characters contained in <c>DataCatch.EnglishChar</c> is one token.
/// 2. A run of characters contained in <c>DataCatch.Num</c> is one token.
/// 3. A space character (' ') is skipped.
/// 4. A character followed by a space, by '\0', or by the end of the text
///    is a single-character token, and the following character is skipped.
/// 5. If the current and next character form a key pair in
///    <c>DataCatch.GetDict()</c>, the longest suffix stored for that pair
///    that matches the text after the pair is appended; the pair itself is
///    emitted as one token when no suffix matches.
/// 6. Otherwise the character is a single-character token.
/// NOTE(review): in rule 5 the pair is emitted even when it is only a prefix
/// of longer entries; whether the pair alone is a dictionary word cannot be
/// told from the lookup structure used here.
/// </remarks>
/// <param name="text">Text to segment; null or empty yields no tokens.</param>
/// <returns>The tokens in order of appearance.</returns>
public List<string> TextArray(string text)
{
    List<string> hs = new List<string>();
    StartArr = new List<int>();
    if (string.IsNullOrEmpty(text))
    {
        return hs;
    }

    for (int i = 0; i < text.Length; i++)
    {
        char nowchar = text[i];

        // Rule 1: consume the whole run so that no character is skipped
        // and the token always covers [i, end).
        if (DataCatch.EnglishChar.Contains(nowchar))
        {
            int end = i + 1;
            while (end < text.Length && DataCatch.EnglishChar.Contains(text[end]))
            {
                end++;
            }
            hs.Add(text.Substring(i, end - i));
            StartArr.Add(i);
            i = end - 1; // the loop increment moves to the first char after the run
            continue;
        }

        // Rule 2: same handling for digit runs.
        if (DataCatch.Num.Contains(nowchar))
        {
            int end = i + 1;
            while (end < text.Length && DataCatch.Num.Contains(text[end]))
            {
                end++;
            }
            hs.Add(text.Substring(i, end - i));
            StartArr.Add(i);
            i = end - 1;
            continue;
        }

        // Rule 3.
        if (nowchar == ' ')
        {
            continue;
        }

        // '\0' doubles as the "no next character" marker at the end of the text.
        char nextchar = (i == text.Length - 1) ? '\0' : text[i + 1];

        // Rule 4: the extra increment steps over the separator.
        if (nextchar == ' ' || nextchar == '\0')
        {
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
            i++;
            continue;
        }

        // Fetch the dictionary once per position instead of once per lookup.
        var dict = DataCatch.GetDict();

        // Rule 6: the pair is unknown, emit the single character.
        if (!dict.ContainsKey(nowchar) || !dict[nowchar].ContainsKey(nextchar))
        {
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
            continue;
        }

        // Rule 5: longest stored suffix that matches the text right after the pair.
        HashSet<string> list = dict[nowchar][nextchar];
        string best = string.Empty;
        foreach (string item in list)
        {
            bool longer = item.Length > best.Length;
            bool fits = text.Length - i > item.Length + 1; // i + 2 + item.Length <= text.Length
            if (longer && fits && string.CompareOrdinal(text, i + 2, item, 0, item.Length) == 0)
            {
                best = item;
            }
        }

        hs.Add(nowchar.ToString() + nextchar.ToString() + best);
        StartArr.Add(i);
        // Skip the second character of the pair plus the matched suffix;
        // the loop increment then lands on the first unconsumed character.
        i += best.Length + 1;
    }

    return hs;
}
DefaultDict类(加载分词具体实现)
private Dictionary<char, Dictionary<char, HashSet<string>>> dictMemory = new Dictionary<char, Dictionary<char, HashSet<string>>>(DataCatch.InitPage);
/// <summary>
/// Reads the plain-text dictionary file at <c>dictSourcePath</c> line by
/// line and fills <c>dictMemory</c> with its entries.
/// </summary>
/// <remarks>
/// Every line with at least two characters is one entry. Its first and
/// second characters become the keys of the two nested dictionary levels,
/// and the remainder of the line (if any) is added to the set stored under
/// that key pair. A line of exactly two characters only registers the pair,
/// leaving its set untouched. Shorter lines are ignored.
/// </remarks>
protected virtual void DoFormat()
{
    // Encoding.Default is the platform default, so decoding depends on the
    // machine. NOTE(review): presumably the dictionary is stored in the
    // local ANSI code page - confirm before changing the encoding.
    using (Stream stream = new FileStream(dictSourcePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (StreamReader sr = new StreamReader(stream, Encoding.Default))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length < 2)
            {
                continue;
            }

            char charfirst = line[0];
            char charseconde = line[1];

            // First level: keyed by the first character of the entry.
            Dictionary<char, HashSet<string>> second;
            if (!dictMemory.TryGetValue(charfirst, out second))
            {
                second = new Dictionary<char, HashSet<string>>();
                dictMemory.Add(charfirst, second);
            }

            // Second level: keyed by the second character of the entry.
            HashSet<string> list;
            if (!second.TryGetValue(charseconde, out list))
            {
                list = new HashSet<string>();
                second.Add(charseconde, list);
            }

            // Only entries longer than two characters contribute a suffix;
            // HashSet.Add already ignores duplicates.
            if (line.Length > 2)
            {
                list.Add(line.Substring(2));
            }
        }
    }
}
转换到Lucene接口
/// <summary>
/// Lucene.Net tokenizer that reads the whole input up front, segments it
/// with a shared <c>Participle</c> instance on the first call to
/// <see cref="Next"/>, and then hands out one token per call.
/// </summary>
/// <remarks>
/// The <c>Participle</c> instance is static: it is created and initialised
/// with the dictionary path of the first tokenizer constructed, and the
/// path passed to later instances does not re-initialise it.
/// </remarks>
public class YurowTokenizer : Tokenizer
{
    // Guards creation of the shared segmenter and every use of it, because
    // TextArray publishes its offsets through the shared StartArr field.
    private static readonly object sync = new object();
    private static Participle p;

    private readonly string text;
    private readonly string path;
    private List<string> list;
    private List<int> starts;
    private int current = 0;
    private bool isfirstrun = true;

    /// <summary>
    /// Reads <paramref name="textreader"/> to its end and makes sure the
    /// shared segmenter exists.
    /// </summary>
    /// <param name="textreader">Source of the text to tokenize.</param>
    /// <param name="path">Dictionary location handed to <c>Participle.Init</c>
    /// when this call is the one that creates the shared segmenter.</param>
    public YurowTokenizer(TextReader textreader, string path)
        : base(textreader) // lets the inherited Close() release the reader
    {
        text = textreader.ReadToEnd();
        this.path = path;
        lock (sync)
        {
            if (p == null)
            {
                // Publish the instance only after Init has completed.
                Participle created = new Participle();
                created.Init(path);
                p = created;
            }
        }
    }

    /// <summary>
    /// Returns the next token of the text, or null when the text is
    /// null/empty or all tokens have been returned.
    /// </summary>
    /// <returns>A token carrying the term text, its start offset and
    /// start offset plus term length as end offset; null at the end.</returns>
    public override Token Next()
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        if (isfirstrun)
        {
            lock (sync)
            {
                list = p.TextArray(text);
                // Keep this text's offsets: the shared StartArr field is
                // replaced by the next TextArray call from any tokenizer.
                starts = p.StartArr;
            }
            isfirstrun = false;
        }

        if (current >= list.Count)
        {
            return null;
        }

        int start = starts[current];
        string currentstr = list[current];
        current++;
        return new Token(currentstr, start, start + currentstr.Length);
    }
}
有兴趣的朋友可以自己反编译查看源码。暂时不提供完整源码。
http://www.cnblogs.com/birdshover/ by yurow