实现效果:
上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图:
基本风格是模仿的百度搜索结果,绿色的分页略显小清新。
目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度非常棒。
刀不磨要生锈,人不学要落后。每天都要学一些新东西。
基本技术介绍:
还记得上一次做全文搜索是在2013年,主要核心设计与代码均是当时的架构师写的,自己只能算是全程参与。
当时使用的是经典搭配:盘古分词+Lucene.net。
前几篇文章有说到,盘古分词已经很多年不更新了,我在SupportYun系统中一直是引用JieBaNet来做分词的。
那么是否也有成型的JieBaNet+Lucene.Net的全文搜索方案呢?
经过多番寻找,在GitHub上面找到一个简易的例子:https://github.com/anderscui/jiebaForLuceneNet
博主下面要讲的实现方案就是从这个demo得到的启发,大家有兴趣可以去看看这个demo。
博主使用的具体版本:Lucene.net 3.0.3.0,JieBaNet 0.38.3.0(做过简易的调整与扩展,前面文章有讲到)
首先我们对Lucene.Net的分词器Tokenizer、分析器Analyzer做一个基于JieBaNet的扩展。
1.基于LuceneNet扩展的JieBa分析器JiebaForLuceneAnalyzer
/// <summary>
/// JieBa-based <see cref="Analyzer"/> extension for Lucene.Net: segments text with
/// jieba.NET, lower-cases the terms, then removes stop words.
/// </summary>
public class JiebaForLuceneAnalyzer : Analyzer
{
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    // Loaded once from the jieba.NET stop-words file; falls back to the English
    // defaults when that file is missing. Only assigned in the static constructor.
    private static readonly ISet<string> StopWords;

    static JiebaForLuceneAnalyzer()
    {
        var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
        if (File.Exists(stopWordsFile))
        {
            var words = new HashSet<string>();
            foreach (var line in File.ReadAllLines(stopWordsFile))
            {
                var word = line.Trim();
                // Fix: skip blank lines so the empty string never becomes a stop word
                // (the original added "" to the set for every blank line in the file).
                if (word.Length > 0)
                {
                    words.Add(word);
                }
            }
            StopWords = words;
        }
        else
        {
            StopWords = DefaultStopWords;
        }
    }

    /// <summary>
    /// Builds the token stream chain: jieba tokenizer -> lower-case filter -> stop filter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (unused by the tokenizer).</param>
    /// <param name="reader">Source text to tokenize.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var segmenter = new JiebaSegmenter();
        TokenStream result = new JiebaForLuceneTokenizer(segmenter, reader);
        result = new LowerCaseFilter(result);
        result = new StopFilter(true, result, StopWords);
        return result;
    }
}
2.基于LuceneNet扩展的JieBa分词器:JiebaForLuceneTokenizer
/// <summary>
/// Lucene.Net <see cref="Tokenizer"/> backed by jieba.NET. The input is read and
/// segmented eagerly in the constructor; <see cref="IncrementToken"/> then replays
/// the pre-computed token list one entry at a time.
/// </summary>
public class JiebaForLuceneTokenizer : Tokenizer
{
    private readonly JiebaSegmenter _segmenter;
    private readonly ITermAttribute _termAttribute;
    private readonly IOffsetAttribute _offsetAttribute;
    private readonly ITypeAttribute _typeAttribute;

    private readonly List<Token> _bufferedTokens;
    private int _cursor = -1;

    public JiebaForLuceneTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }

    public JiebaForLuceneTokenizer(JiebaSegmenter seg, string input)
    {
        _segmenter = seg;
        _termAttribute = AddAttribute<ITermAttribute>();
        _offsetAttribute = AddAttribute<IOffsetAttribute>();
        _typeAttribute = AddAttribute<ITypeAttribute>();

        // Segment the whole input up front (search mode) and buffer the result.
        _bufferedTokens = _segmenter.Tokenize(input, TokenizerMode.Search).ToList();
    }

    /// <summary>
    /// Advances to the next buffered token; returns false (after calling End)
    /// once the buffer is exhausted.
    /// </summary>
    public override bool IncrementToken()
    {
        ClearAttributes();
        _cursor++;
        if (_cursor >= _bufferedTokens.Count)
        {
            End();
            return false;
        }

        var current = _bufferedTokens[_cursor];
        _termAttribute.SetTermBuffer(current.Word);
        _offsetAttribute.SetOffset(current.StartIndex, current.EndIndex);
        _typeAttribute.Type = "Jieba";
        return true;
    }

    /// <summary>
    /// Convenience passthrough to the underlying segmenter.
    /// </summary>
    public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return _segmenter.Tokenize(text, mode);
    }
}
理想如果不向现实做一点点屈服,那么理想也将归于尘土。
实现方案设计:
我们做全文搜索的设计时一定会考虑的一个问题就是:我们系统是分很多模块的,不同模块的字段差异很大,怎么才能实现同一个索引,既可以单个模块搜索又可以全站搜索,甚至按一些字段做条件来搜索呢?
这些也是SupportYun系统需要考虑的问题,因为目前的数据就天然的拆分成了活动、文章两个类别,字段也大有不同。博主想实现的是一个可以全站搜索(结果包括活动、文章),也可以在文章栏目/活动栏目分别搜索,并且可以按几个指定字段来做搜索条件。
要做一个这样的全文搜索功能,我们需要从程序设计上来下功夫。下面就介绍一下博主的设计方案:
一、索引创建
1.我们设计一个IndexManager来处理最基本的索引创建、更新、删除操作。
public class IndexManager
{
    /// <summary>
    /// Index storage directory, read from the "IndexStorePath" app setting.
    /// </summary>
    public static readonly string IndexStorePath = ConfigurationManager.AppSettings["IndexStorePath"];
    private IndexWriter indexWriter;
    private FSDirectory entityDirectory;

    // NOTE(review): a finalizer that disposes managed objects is unsafe — by finalization
    // time these fields may already be finalized. Prefer implementing IDisposable; verify.
    ~IndexManager()
    {
        if (entityDirectory != null)
        {
            entityDirectory.Dispose();
        }
        if (indexWriter != null)
        {
            indexWriter.Dispose();
        }
    }

    /// <summary>
    /// Adds one Lucene document per entry of <paramref name="indexContents"/> to the index,
    /// then commits and optimizes. Exceptions are logged and swallowed.
    /// </summary>
    public void BuildIndex(List<IndexContent> indexContents)
    {
        try
        {
            if (entityDirectory == null)
            {
                entityDirectory = FSDirectory.Open(new DirectoryInfo(IndexStorePath));
            }
            if (indexWriter == null)
            {
                Analyzer analyzer = new JiebaForLuceneAnalyzer();
                indexWriter = new IndexWriter(entityDirectory, analyzer, IndexWriter.MaxFieldLength.LIMITED);
            }
            // NOTE(review): locking on a string is an anti-pattern — strings can be interned
            // and shared process-wide; use a private readonly object gate instead. Verify.
            lock (IndexStorePath)
            {
                foreach (var indexContent in indexContents)
                {
                    var doc = GetDocument(indexContent);
                    indexWriter.AddDocument(doc);
                }
                indexWriter.Commit();
                indexWriter.Optimize();
                // NOTE(review): the writer is disposed here AND again in finally, and the
                // fields are never reset to null — a later call on this same instance will
                // reuse a disposed writer/directory. Confirm and reset fields to null.
                indexWriter.Dispose();
            }
        }
        catch (Exception exception)
        {
            LogUtils.ErrorLog(exception);
        }
        finally
        {
            if (entityDirectory != null)
            {
                entityDirectory.Dispose();
            }
            if (indexWriter != null)
            {
                indexWriter.Dispose();
            }
        }
    }

    /// <summary>
    /// Deletes indexed documents matching ModuleType + RowId (and TableName when provided),
    /// then commits and optimizes. Exceptions are logged and swallowed.
    /// </summary>
    /// <param name="moduleType">Module type term the documents were indexed under.</param>
    /// <param name="tableName">Optional; may be null or empty to match any table.</param>
    /// <param name="rowID">Row identifier term of the documents to delete.</param>
    public void DeleteIndex(string moduleType, string tableName, string rowID)
    {
        try
        {
            if (entityDirectory == null)
            {
                entityDirectory = FSDirectory.Open(new DirectoryInfo(IndexStorePath));
            }
            if (indexWriter == null)
            {
                Analyzer analyzer = new JiebaForLuceneAnalyzer();
                indexWriter = new IndexWriter(entityDirectory, analyzer, IndexWriter.MaxFieldLength.LIMITED);
            }
            // NOTE(review): same string-lock concern as BuildIndex — see note there.
            lock (IndexStorePath)
            {
                // All clauses are MUST: documents must match every supplied term.
                var query = new BooleanQuery
                {
                    { new TermQuery(new Term("ModuleType", moduleType)), Occur.MUST},
                    { new TermQuery(new Term("RowId", rowID)), Occur.MUST}
                };
                if (!string.IsNullOrEmpty(tableName))
                {
                    query.Add(new TermQuery(new Term("TableName", tableName)), Occur.MUST);
                }

                indexWriter.DeleteDocuments(query);
                indexWriter.Commit();
                indexWriter.Optimize();
                // NOTE(review): disposed here and again in finally without nulling the
                // field — same stale-writer issue as BuildIndex.
                indexWriter.Dispose();
            }
        }
        catch (Exception exception)
        {
            LogUtils.ErrorLog(exception);
        }
        finally
        {
            if (entityDirectory != null)
            {
                entityDirectory.Dispose();
            }
            if (indexWriter != null)
            {
                indexWriter.Dispose();
            }
        }
    }

    /// <summary>
    /// Updates the index for a single content item. (Body continues beyond this excerpt.)
    /// </summary>
    /// <param name="indexContent">Content whose index entry should be refreshed.</param>
    public void UpdateIndex(IndexContent indexContent)
    {
        try
        {
            if (entityDirectory == null)
            {
                entityDirectory = FSDirectory.Open(new DirectoryInfo(IndexStorePath));
            }
            if (indexWriter == null)
            {
                Analyzer analyzer = new JiebaForLuceneAnalyzer();
                indexWriter = new IndexWriter(entityDirectory, analyzer, IndexWriter.MaxFieldLength.LIMITED);
            }
            lock (IndexStorePath)