在了解了lucene的工作原理和流程后,就可以更进一步对原有代码进行改进了。在原有项目中使用的是默认的StandardAnalyzer,只能将文本分割成单个词,对于中文并不是很友好,所以本次将替换使用自定义分词器。
1.使用JieBa分词
根据Lucene的源码,只需要继承基类Analyzer即可实现自定义的分词器。此外在原有的SearchManager中已经预留了泛型接口,也可以很方便地接入。
使用NuGet管理器安装jieba.NET。GitHub上有一堆针对Lucene.NET的JieBa分词程序包,这里选择了下载量较高的那一个。
1.1.自定义Analyzer实现
/// <summary>
/// Custom Lucene.NET Analyzer backed by the jieba.NET Chinese segmenter.
/// Produces a <see cref="JieBaTokenizer"/> stream followed by a lower-case filter.
/// </summary>
public class JieBaAnalyzer : Analyzer
{
    private readonly TokenizerMode _mode;
    private readonly string _stopUrl;

    /// <summary>
    /// Creates the analyzer.
    /// </summary>
    /// <param name="mode">jieba tokenization mode (e.g. Search).</param>
    /// <param name="stopUrl">Relative path of the stop-word file, resolved by <see cref="JieBaTokenizer"/>.</param>
    public JieBaAnalyzer(TokenizerMode mode, string stopUrl = "./Resources/stopwords.txt") : base()
    {
        this._mode = mode;
        _stopUrl = stopUrl;
    }

    /// <summary>
    /// Builds the per-field token stream: jieba tokenizer + lower-case filter
    /// (the filter normalizes Latin terms; CJK terms are unaffected).
    /// </summary>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        var tokenizer = new JieBaTokenizer(reader, _mode, _stopUrl);
        // FIX: removed the redundant casts and the AddAttribute<ICharTermAttribute>/
        // AddAttribute<IOffsetAttribute> calls — the tokenizer already registers
        // those attributes in its constructor (Init()), and AddAttribute on a
        // filter would be a no-op duplicate registration anyway.
        TokenStream tokenstream = new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, tokenizer);
        return new TokenStreamComponents(tokenizer, tokenstream);
    }
}
1.2.自定义Tokenizer实现
/// <summary>
/// Lucene.NET Tokenizer that segments Chinese text with jieba.NET
/// and drops tokens found in a stop-word file.
/// </summary>
public class JieBaTokenizer : Tokenizer
{
    private readonly List<JiebaNet.Segmenter.Token> _wordList = new List<JiebaNet.Segmenter.Token>();
    private string _inputText;
    private ICharTermAttribute _termAtt;
    private IOffsetAttribute _offsetAtt;
    private IPositionIncrementAttribute _posIncrAtt;
    private ITypeAttribute _typeAtt;
    // Membership set of stop words; the int value is unused.
    private readonly Dictionary<string, int> _stopWords = new Dictionary<string, int>();
    private IEnumerator<JiebaNet.Segmenter.Token> _iter;
    private readonly JiebaSegmenter _segmenter;
    private readonly TokenizerMode _mode;

    /// <summary>
    /// Creates the tokenizer and eagerly loads the stop-word file.
    /// </summary>
    /// <param name="input">Reader over the text to tokenize.</param>
    /// <param name="mode">jieba tokenization mode.</param>
    /// <param name="stopUrl">Stop-word file path, relative to the application base directory.</param>
    public JieBaTokenizer(TextReader input, TokenizerMode mode, string stopUrl = "./Resources/stopwords.txt")
        : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
    {
        _segmenter = new JiebaSegmenter();
        _mode = mode;
        LoadStopWords(stopUrl);
        Init();
    }

    /// <summary>
    /// Loads stop words, one per line, skipping blanks and duplicates.
    /// </summary>
    /// <param name="filePath">Path relative to <see cref="AppDomain.CurrentDomain"/>'s base directory.</param>
    private void LoadStopWords(string filePath)
    {
        // FIX: Path.Combine instead of raw string concatenation — safer across
        // platforms and independent of a trailing separator on BaseDirectory.
        var fullPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, filePath);
        using (StreamReader reader = File.OpenText(fullPath))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }
                if (!_stopWords.ContainsKey(line))
                {
                    _stopWords.Add(line, 1);
                }
            }
        }
    }

    /// <summary>
    /// Registers the token attributes this stream populates.
    /// </summary>
    private void Init()
    {
        _termAtt = AddAttribute<ICharTermAttribute>();
        _offsetAtt = AddAttribute<IOffsetAttribute>();
        _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        _typeAtt = AddAttribute<ITypeAttribute>();
    }

    private string ReadToEnd(TextReader input)
    {
        return input.ReadToEnd();
    }

    /// <summary>
    /// Advances to the next token, populating term/offset/type/position attributes.
    /// Returns false when the stream is exhausted.
    /// </summary>
    public sealed override Boolean IncrementToken()
    {
        ClearAttributes();
        Lucene.Net.Analysis.Token token = Next();
        if (token != null)
        {
            _termAtt.SetEmpty().Append(token.ToString());
            _offsetAtt.SetOffset(CorrectOffset(token.StartOffset), CorrectOffset(token.EndOffset));
            _typeAtt.Type = token.Type;
            // FIX: position increment was registered but never set; each emitted
            // token advances one position (stop words are removed before this
            // stage, so no gap accounting is attempted here).
            _posIncrAtt.PositionIncrement = 1;
            return true;
        }
        // BUG FIX: the original called End() and this.Dispose() here. Per the
        // Lucene TokenStream contract the CONSUMER calls End()/Dispose();
        // self-disposing inside IncrementToken breaks analyzer/stream reuse.
        return false;
    }

    /// <summary>
    /// Returns the next jieba token wrapped as a Lucene token, or null when done.
    /// </summary>
    public Lucene.Net.Analysis.Token Next()
    {
        // FIX: guard against calls before Reset() populated the enumerator.
        if (_iter == null || !_iter.MoveNext())
        {
            return null;
        }
        JiebaNet.Segmenter.Token current = _iter.Current;
        if (current == null)
        {
            return null;
        }
        return new Lucene.Net.Analysis.Token(current.Word, current.StartIndex, current.EndIndex);
    }

    /// <summary>
    /// Reads the whole input, segments it with jieba, removes stop words,
    /// and resets the iteration state.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        _inputText = ReadToEnd(base.m_input);
        // Segment the full text with jieba.
        IEnumerable<JiebaNet.Segmenter.Token> tokens = _segmenter.Tokenize(_inputText, _mode);
        _wordList.Clear();
        foreach (var token in tokens)
        {
            // Drop stop words before they ever reach IncrementToken.
            if (!_stopWords.ContainsKey(token.Word))
            {
                _wordList.Add(token);
            }
        }
        _iter = _wordList.GetEnumerator();
    }
}
1.3.添加字典资源
JieBa分词提供了词库资源,能够更好地根据中文习惯分词。词库资源包位于NuGet包里的Resources文件夹。
包含了字典、专有名词、停用词等,也可以根据自己的需要添加自己想要的分词。
由于在自定义的JieBaTokenizer里包含了读取停用词词库的方法,所以需要将Resources文件夹复制到程序运行目录,以便于初始化时加载停用词,并从分词列表中移除停用词。
1.4.替换原有实现
// Index storage: a file-system directory whose path comes from configuration ("Search:DefaultPath").
services.AddSingleton<Lucene.Net.Store.Directory>(Lucene.Net.Store.FSDirectory.Open(configuration["Search:DefaultPath"]));
// Replace the default analyzer with the jieba-based one; stop-word file path comes from "Search:StopWords".
services.AddSingleton<Lucene.Net.Analysis.Analyzer>(new JieBaAnalyzer(TokenizerMode.Search, configuration["Search:StopWords"]));
//services.AddSingleton<Lucene.Net.Analysis.Analyzer>(new StandardAnalyzer(LuceneVersion.LUCENE_48));
// SearchManager itself is stateless per request, hence transient.
services.AddTransient<ISearchManager, SearchManager> ();
在.NET Core应用的Startup启动类中通过依赖注入的形式替换默认的Analyzer。
原有方法逻辑不受影响,替换完毕。
今天再收获一点,明天再学习一点。