准备工作
NuGet引入Lucene.Net包。
开始搭建
几个中文分词相关的类:ChineseAnalyzer、ChineseTokenizer、WordTree
使用的类
- ChineseAnalyzer 类
public class ChineseAnalyzer : Analyzer
{
    // Path of the noise-word (stop-word) file, relative to the working directory.
    private static string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";
    private string keywords = "";

    // Shared noise-word character tree: each key is a single character, each
    // value is the Hashtable of characters that may follow it.
    public static Hashtable chartable = new Hashtable();

    public ChineseAnalyzer(string keywords)
    {
        this.keywords = keywords;
    }

    /// <summary>
    /// Loads the noise-word file into <see cref="chartable"/> as a character tree.
    /// Already-present characters are skipped, so repeated calls are safe
    /// (though they re-read the file each time).
    /// </summary>
    private void GetNoise()
    {
        // 'using' guarantees the reader is closed even when reading throws;
        // the original leaked the stream on any exception.
        using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
        {
            string word = reader.ReadLine();
            while (word != null && word.Trim() != "")
            {
                Hashtable t_chartable = chartable;
                for (int i = 0; i < word.Length; i++)
                {
                    string char_s = word.Substring(i, 1);
                    if (!t_chartable.Contains(char_s))
                    {
                        t_chartable.Add(char_s, new Hashtable());
                    }
                    // Descend into the subtree for this character.
                    t_chartable = (Hashtable)t_chartable[char_s];
                }
                word = reader.ReadLine();
            }
        }
    }

    /// <summary>
    /// Builds the analysis chain: Chinese tokenizer -> standard filter ->
    /// lower-case filter -> stop filter driven by the noise table.
    /// </summary>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        GetNoise();
        TokenStream result = new ChineseTokenizer(reader, keywords);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, chartable);
        return result;
    }
}
- WordTree 类
public class WordTree
{
    // Path of the dictionary file, relative to the working directory.
    private static string DictPath = Environment.CurrentDirectory + "\\data\\sDict.txt";

    // Dictionary character tree: key = single character, value = Hashtable of
    // possible following characters.
    public static Hashtable chartable = new Hashtable();

    // Seconds spent loading the dictionary file (diagnostic only).
    public static double DictLoad_Span = 0;

    // Matches a single CJK unified ideograph.
    public string strChinese = "[\u4e00-\u9fa5]";

    // Keywords the current chartable was built from; used to skip rebuilds.
    private static string IsKw = "";

    /// <summary>
    /// Returns 0 when <paramref name="Char"/> is a Chinese character, otherwise -1.
    /// </summary>
    public int GetCharType(string Char)
    {
        if (new Regex(strChinese).IsMatch(Char))
            return 0;
        return -1;
    }

    /// <summary>
    /// (Re)builds the dictionary tree for <paramref name="keywords"/>.
    /// No-op when the tree was already built for the same keywords.
    /// </summary>
    public void LoadDict(string keywords)
    {
        if (IsKw == keywords) return;
        chartable = new Hashtable();
        BuidDictTree(keywords);
        IsKw = keywords;
    }

    // Builds chartable either from the dictionary file (no keywords supplied)
    // or from the space-separated keyword list.
    private void BuidDictTree(string keywords)
    {
        if (keywords.IsNull())
        {
            long dt_s = DateTime.Now.Ticks;
            // 'using' guarantees the reader is closed even on exceptions;
            // the original leaked the stream on any read failure.
            using (StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.UTF8))
            {
                string word = reader.ReadLine();
                while (word != null && word.Trim() != "")
                {
                    AddWord(word);
                    word = reader.ReadLine();
                }
            }
            DictLoad_Span = (double)(DateTime.Now.Ticks - dt_s) / (1000 * 10000);
            System.Console.Out.WriteLine("读取字典文件所用的时间: " + DictLoad_Span + "s");
        }
        else
        {
            try
            {
                // Split(' ') handles the no-space single-keyword case directly;
                // the original appended a trailing space to force a splittable string.
                foreach (var itemkw in keywords.Split(' '))
                {
                    if (!itemkw.IsNull())
                    {
                        AddWord(itemkw);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep the original swallow-on-error behavior, but
                // surface the problem instead of hiding it in an empty catch.
                System.Console.Error.WriteLine("BuidDictTree failed: " + ex);
            }
        }
    }

    // Inserts one word into chartable, one character level per tree depth.
    private void AddWord(string word)
    {
        Hashtable t_chartable = chartable;
        for (int i = 0; i < word.Length; i++)
        {
            string char_s = word.Substring(i, 1);
            if (!t_chartable.Contains(char_s))
            {
                t_chartable.Add(char_s, new Hashtable());
            }
            t_chartable = (Hashtable)t_chartable[char_s];
        }
    }
}
- ChineseTokenizer 类
class ChineseTokenizer : Tokenizer
{
    // offset: where the NEXT token scan starts; bufferIndex: start position of
    // the token currently being built; dataLen: total length of the input text.
    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private int start;
    private string text;
    public double TextSeg_Span = 0;
    private string keywords = "";

    public ChineseTokenizer(System.IO.TextReader reader, string keywords)
    {
        this.input = reader;
        text = input.ReadToEnd();   // whole input is buffered up front
        dataLen = text.Length;
        this.keywords = keywords;
    }

    /// <summary>
    /// Returns the next token by greedily walking the dictionary tree in
    /// <c>WordTree.chartable</c>, or null when the input is exhausted.
    /// A character absent from the tree root is emitted as a single-character token.
    /// </summary>
    public override Token Next()
    {
        WordTree tree = new WordTree();
        tree.LoadDict(keywords);   // no-op if already loaded for these keywords
        Hashtable t_chartable = WordTree.chartable;
        string ReWord = "";
        string char_s;
        start = offset;
        bufferIndex = start;
        // NOTE(review): bufferIndex is fixed before whitespace is skipped, so a
        // token preceded by whitespace reports the whitespace position — confirm
        // whether callers rely on offsets.
        while (true)
        {
            if (start >= dataLen)
            {
                break;   // input exhausted
            }
            char_s = text.Substring(start, 1);
            if (string.IsNullOrEmpty(char_s.Trim()))
            {
                start++;   // skip whitespace
                continue;
            }
            if (!t_chartable.Contains(char_s))
            {
                // Character cannot extend the current dictionary match.
                if (ReWord == "")
                {
                    // Nothing matched yet: emit this character alone. The original
                    // switched on GetCharType here, but both branches appended the
                    // character identically, so the switch was a no-op.
                    ReWord += char_s;
                    offset = start + 1;   // next scan starts after this character
                }
                else
                {
                    offset = start;       // re-scan this character on the next call
                }
                // NOTE(review): end offset is inclusive here; Lucene conventionally
                // uses an exclusive end offset — confirm downstream expectations.
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
            // Character is in the tree: extend the match and descend one level.
            ReWord += char_s;
            t_chartable = (Hashtable)t_chartable[char_s];
            start++;
            if (start == dataLen)
            {
                offset = dataLen;
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
        }
        return null;
    }
}
代码调用
/// <summary>
/// Segments the field <c>docstr</c> using <see cref="TextFc.Tool.ChineseAnalyzer"/>
/// and returns the resulting terms.
/// </summary>
/// <param name="kws">Space-separated keywords used to build the dictionary tree.</param>
/// <returns>The list of segmented terms (empty when the text produces no tokens).</returns>
private List<string> CutWord(string kws)
{
    List<string> ListKws = new List<string>();
    Analyzer analyzer = new TextFc.Tool.ChineseAnalyzer(kws);
    // NOTE(review): docstr is a field defined elsewhere in this class — confirm
    // it is populated before CutWord is called.
    StringReader sr = new StringReader(docstr);
    TokenStream stream = analyzer.TokenStream(null, sr);
    Lucene.Net.Analysis.Token t = stream.Next();
    while (t != null)
    {
        // Token.ToString() looks like "(term,start,end)": strip the opening
        // paren and take the term before the first comma.
        string tstr = t.ToString().Replace("(", "").Split(',')[0];
        // BUG FIX: the original line was the incomplete statement "tstr" and
        // never added the term, so the method always returned an empty list.
        ListKws.Add(tstr);
        t = stream.Next();
    }
    return ListKws;
}