本文使用的中科院分词系统 ICTCLAS 的 .NET 版本,基于吕震宇开发的 1.0 版本。
Lucene.Net2.9
接口
ICTCLASAnalyzer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
namespace Demo
{
/// <summary>
/// Lucene analyzer whose token streams are produced by <c>ICTCLASTokenizer</c>.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    // Dictionary path forwarded unchanged to every tokenizer this analyzer creates.
    private readonly string _dictionaryPath;

    /// <summary>Creates an analyzer bound to the given dictionary path.</summary>
    /// <param name="dictPath">Path passed on to each <c>ICTCLASTokenizer</c>.</param>
    public ICTCLASAnalyzer(string dictPath)
    {
        _dictionaryPath = dictPath;
    }

    /// <summary>Builds a new tokenizer over <paramref name="reader"/>.</summary>
    /// <param name="fieldName">Name of the field being analyzed; not used here.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>A fresh <c>ICTCLASTokenizer</c> reading from <paramref name="reader"/>.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new ICTCLASTokenizer(reader, _dictionaryPath);
    }
}
}
ICTCLASTokenizer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using SharpICTCLAS;
namespace Demo
{
/// <summary>
/// Lucene tokenizer that segments text with SharpICTCLAS and returns each
/// segmented word as a <c>Token</c>.
/// </summary>
/// <remarks>
/// The reader is consumed and segmented entirely in the constructor; <c>Next</c>
/// then walks the pre-computed result. Token offsets are relative to the text
/// after every "\r\n" has been removed, not to the raw reader content.
/// </remarks>
class ICTCLASTokenizer : Tokenizer
{
    // Second argument to WordSegment.Segment.
    // NOTE(review): presumably the number of candidate segmentations requested - confirm
    // against SharpICTCLAS. Only the first candidate (result[0]) is consumed below.
    int nKind = 2;

    // Segmentation output; result[0] is the word sequence that Next() iterates.
    List<WordResult[]> result;

    // Start offset (inclusive) of the next token to emit.
    int startIndex = 0;

    // End offset (exclusive) of the most recently emitted token.
    int endIndex = 0;

    // Index of the next entry of result[0] to emit. Iteration starts at 1 and stops before
    // the last entry, so the first and last entries are never returned.
    // NOTE(review): presumably those are sentence begin/end markers - confirm.
    int i = 1;

    /// <summary>
    /// The sentence to be segmented.
    /// </summary>
    private string sentence;

    /// <summary>Constructs a tokenizer for this reader and segments its whole content.</summary>
    /// <param name="reader">Source of the text; it is read to the end immediately.</param>
    /// <param name="DictPath">Dictionary path handed to <c>WordSegment.InitWordSegment</c>.</param>
    public ICTCLASTokenizer(System.IO.TextReader reader, string DictPath)
    {
        this.input = reader;
        sentence = input.ReadToEnd();
        sentence = sentence.Replace("\r\n", "");

        WordSegment wordSegment = new WordSegment();
        wordSegment.InitWordSegment(DictPath);
        result = wordSegment.Segment(sentence, nKind);
    }

    /// <summary>
    /// Returns the next token in the stream, or null once all words have been emitted.
    /// </summary>
    /// <returns>The next segmented word as a token, or null at end of stream.</returns>
    public override Token Next()
    {
        // Nothing was segmented (e.g. empty input): report end of stream instead of
        // failing on result[0].
        if (result == null || result.Count == 0)
        {
            return null;
        }

        WordResult[] words = result[0];
        if (i >= words.Length - 1)
        {
            return null;
        }

        string word = words[i].sWord;

        // Lucene end offsets are exclusive: one past the last character of the token.
        endIndex = startIndex + word.Length;
        Token token = new Token(word, startIndex, endIndex);

        // The next token begins where this one ended.
        startIndex = endIndex;
        i++;
        return token;
    }
}
}
DEMO地址: