C#使用Lucene中文分词
准备工作
通过 NuGet 引入 Lucene.Net 包。
开始搭建
中文分词需要用到以下几个类:ChineseAnalyzer、ChineseTokenizer、WordTree。
下面依次给出各个类的代码。
ChineseAnalyzer 类
public class ChineseAnalyzer : Analyzer
{
/// <summary>
/// Location of the noise-word list: <c>data\sNoise.txt</c> under the process's
/// current directory at the time this type is initialized.
/// </summary>
private static string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

/// <summary>
/// Keyword text supplied by the caller; handed to the tokenizer when a
/// token stream is built.
/// </summary>
private string keywords = string.Empty;

/// <summary>
/// Creates an analyzer that remembers the given keyword text.
/// </summary>
/// <param name="keywords">Keyword text stored as-is (no validation is performed).</param>
public ChineseAnalyzer(string keywords)
{
    this.keywords = keywords;
}
/// <summary>
/// Root of a character trie built from the noise-word file. Each key is a
/// one-character string and each value is a nested <see cref="Hashtable"/>
/// holding the characters that may follow it. Shared by all instances.
/// </summary>
public static Hashtable chartable = new Hashtable();

/// <summary>
/// Reads the noise-word file (UTF-8, one word per line) and merges every word
/// into <see cref="chartable"/>, one trie level per character.
/// </summary>
/// <remarks>
/// Reading stops at end of file or at the first blank/whitespace-only line.
/// Characters already present in the trie are reused, so calling this method
/// repeatedly does not duplicate entries. No end-of-word marker is stored.
/// NOTE(review): "noise" presumably means stop words — confirm against the tokenizer.
/// </remarks>
private void GetNoise()
{
    // The using block guarantees the file handle is released even when
    // reading or trie insertion throws part-way through.
    using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
    {
        string word = reader.ReadLine();
        while (word != null && word.Trim() != "")
        {
            // Walk down from the root, creating child tables on demand.
            Hashtable node = chartable;
            for (int i = 0; i < word.Length; i++)
            {
                string key = word.Substring(i, 1);
                if (!node.Contains(key))
                {
                    node.Add(key, new Hashtable());
                }
                node = (Hashtable)node[key];
            }
            word = reader.ReadLine();
        }
    }
}
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
GetNoise();
TokenStream result = new ChineseTokenizer(reader, keywords);
result = new StandardFilter(result);
result = new