Lucene.Net只是一个全文检索开发包,可以把它看作一个提供全文检索功能的数据库。Lucene.Net不关心文本数据是怎么来的,用户可以基于Lucene.Net开发满足自己需求的搜索引擎。Lucene.Net只能对
文本信息进行检索。如果不是文本信息,就要先转换为文本信息,比如检索Excel文件,就要先用NPOI把Excel读取成字符串,然后把字符串交给Lucene.Net。Lucene.Net会把交给它的文本切词后保存,以加快检索
速度。
Lucene.Net中不同的分词算法就是不同的类,所有分词算法类都从Analyzer类继承。
庖丁解牛、盘古分词,IKAnalyzer分词(Java)等是基于词库的分词算法,可以提高分词成功率,但是效率低。
盘古分词使用方法:
将Dict文件夹放置项目根目录,并将其下文件"复制到输出目录"属性设置为"如果较新则复制"
引用PanGu.dll与PanGu.Lucene.Analyzer.dll类库即可使用盘古分词算法
添加引用:PanGu.HighLight.dll
我的代码示例使用的是Lucene.Net的2.9.2版本。我试过3.0以上版本的Lucene.Net,但无法配合盘古分词使用,这个问题我也还在思考,欢迎大家给我意见,也期待.NET平台下出现更好的分词组件。
// Path of the index directory, obtained from LuceneAreaProvider.GetIndexPath().
string indexPath = new LuceneAreaProvider().GetIndexPath();
#region 分析器
private Analyzer _analyzer = null;
/// <summary>
/// Gets the PanGu analyzer used by this class.
/// The instance is created on first access and then reused, instead of
/// allocating a new analyzer every time the property is read.
/// </summary>
public Analyzer analyzer
{
    get
    {
        if (_analyzer == null)
        {
            _analyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer();
        }
        return _analyzer;
    }
}
#endregion
#region FSDirectory directory_luce
private Lucene.Net.Store.FSDirectory _directory_luce = null;
/// <summary>
/// Gets the file-system directory opened on <c>indexPath</c> with a
/// <c>NativeFSLockFactory</c>. It is opened on first access and cached.
/// </summary>
public Lucene.Net.Store.FSDirectory directory_luce
{
    get
    {
        // Already opened on an earlier access: hand back the cached instance.
        if (_directory_luce != null)
        {
            return _directory_luce;
        }

        DirectoryInfo indexFolder = new DirectoryInfo(indexPath);
        _directory_luce = Lucene.Net.Store.FSDirectory.Open(indexFolder, new NativeFSLockFactory());
        return _directory_luce;
    }
}
#endregion
#region 获取分词
/// <summary>
/// Splits the search text into terms using the configured analyzer.
/// </summary>
/// <param name="searchText">Raw text to tokenize. Null or empty yields an empty list.</param>
/// <returns>The term text of every token the analyzer produces, in stream order.</returns>
private List<string> GetSplitString(string searchText)
{
    List<string> listResult = new List<string>();
    if (string.IsNullOrEmpty(searchText))
    {
        // Nothing to tokenize; this also avoids StringReader throwing on null.
        return listResult;
    }

    // The first argument is kept exactly as the original passed it.
    // NOTE(review): TokenStream's first parameter is normally a field name;
    // confirm the analyzer ignores it, otherwise pass the real field name.
    TokenStream tokenStream = analyzer.TokenStream(searchText, new StringReader(searchText));
    try
    {
        Token token = tokenStream.Next();
        while (token != null)
        {
            listResult.Add(token.TermText());
            token = tokenStream.Next();
        }
    }
    finally
    {
        // Release the stream (and the reader it wraps) even if tokenizing fails.
        tokenStream.Close();
    }
    return listResult;
}
#endregion
#region 创建索引
/// <summary>
/// Creates the index, or appends to it when one already exists, from the
/// region mapping data obtained by the joined-table query.
/// </summary>
public void CreateIndex()
{
    // The index is stored on disk here; keeping it in memory is the alternative,
    // and which one to use depends on the situation.
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        // If an index already exists, append to it; otherwise index everything from scratch.
        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            // Force-release an existing write lock, as the original code did.
            // NOTE(review): this can break a writer that is still active elsewhere;
            // confirm that only one writer is ever used at a time.
            IndexWriter.Unlock(directory);
        }

        // IndexWriter writes content into the index.
        // Third constructor argument: true = discard the old index and rewrite, false = append.
        // Opening an IndexWriter on the directory locks the index files automatically,
        // which guards against concurrent writers.
        IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            // getRegionData is the data source; adapt it to your own situation.
            foreach (ApiRegionMappingEntity item in getRegionData)
            {
                Document document = new Document();
                // NOT_ANALYZED: the value is stored as a single term, without segmentation.
                document.Add(new Field("id", item.ApiRegionMapId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("regionID", item.JxRegionId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("ApiRegionId", item.ApiRegionId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("Platform", item.Platform.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                // Only the name is analyzed (segmented); term vectors keep positions and offsets.
                document.Add(new Field("name", item.ApiRegionName.ToString(), Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document); // write the document into the index
            }
            writer.Optimize(); // merge segments once everything has been added
            writer.Close();    // commits the changes and releases the write lock
        }
        catch
        {
            // On failure, discard the uncommitted documents and release the write lock
            // instead of leaving the writer open.
            writer.Rollback();
            throw;
        }
    }
    finally
    {
        // Always close the directory; without closing, the indexed data cannot be searched.
        directory.Close();
    }
}
#endregion
#region 关键字搜索 集合
/// <summary>
/// Searches the "name" field of the index for the terms contained in the search text.
/// </summary>
/// <param name="searchText">Text to search for; it is split into terms by <c>GetSplitString</c>.</param>
/// <returns>
/// The matching region mappings rebuilt from the stored fields (at most 1000 hits).
/// </returns>
public List<ApiRegionMappingEntity> Search(string searchText)
{
    // FS stands for FileSystem. Its parent class is Directory, which represents where the
    // index files are kept and has two subclasses, FSDirectory and RAMDirectory.
    // Do not confuse it with System.IO.Directory.
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory()); // open the index
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(directory, true);  // IndexReader reads content from the index (read-only here)
        searcher = new IndexSearcher(reader);        // IndexSearcher runs queries against the index

        // Search condition: a document matches when its name contains any of the terms.
        BooleanQuery query = new BooleanQuery();
        foreach (string word in GetSplitString(searchText))
        {
            query.Add(new TermQuery(new Term("name", word)), BooleanClause.Occur.SHOULD);
        }

        // Container that collects the query results.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        // Run the query and put the results into the container.
        searcher.Search(query, collector);

        // Turn every hit back into an entity.
        List<ApiRegionMappingEntity> regionMapingList = new List<ApiRegionMappingEntity>();
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            int docId = docs[i].doc;            // id of the hit (assigned internally by Lucene)
            Document doc = searcher.Doc(docId); // load the stored fields of that document
            ApiRegionMappingEntity mappingEntity = new ApiRegionMappingEntity();
            mappingEntity.ApiRegionMapId = Convert.ToInt32(doc.Get("id"));
            mappingEntity.JxRegionId = Convert.ToInt32(doc.Get("regionID"));
            mappingEntity.ApiRegionId = Convert.ToInt32(doc.Get("ApiRegionId"));
            mappingEntity.Platform = Convert.ToInt32(doc.Get("Platform"));
            mappingEntity.ApiRegionName = doc.Get("name");
            regionMapingList.Add(mappingEntity);
        }
        return regionMapingList;
    }
    finally
    {
        // Release the index resources on every path; the searcher was built from an
        // existing reader, so the reader is closed explicitly as well.
        if (searcher != null)
        {
            searcher.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        directory.Close();
    }
}
#endregion
#region 关键词高亮显示
/// <summary>
/// Returns the fragment of <paramref name="content"/> that best matches
/// <paramref name="keyword"/>, with the matched words wrapped in red bold HTML tags.
/// </summary>
/// <param name="keyword">Keyword to highlight.</param>
/// <param name="content">Text in which to look for the keyword.</param>
/// <returns>The result of <c>Highlighter.GetBestFragment</c> for the given arguments.</returns>
private string Highlight(string keyword, string content)
{
    // The formatter takes the prefix and suffix placed around each highlighted word.
    PanGu.HighLight.SimpleHTMLFormatter formatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\"><b>", "</b></font>");

    // The highlighter is built from the formatter and a PanGu Segment object.
    PanGu.HighLight.Highlighter fragmentPicker = new PanGu.HighLight.Highlighter(formatter, new Segment());

    // Number of characters in each summary fragment.
    fragmentPicker.FragmentSize = 50;

    // Pick the best-matching summary fragment.
    string bestFragment = fragmentPicker.GetBestFragment(keyword, content);
    return bestFragment;
}
#endregion
#region 删除索引
/// <summary>
/// Deletes every indexed document whose <paramref name="field"/> contains the
/// term <paramref name="value"/>.
/// </summary>
/// <param name="field">Name of the field to match.</param>
/// <param name="value">Term value to match in that field.</param>
public void DeleteIndex(string field, string value)
{
    FSDirectory directory = directory_luce;
    bool isUpdate = IndexReader.IndexExists(directory);
    if (isUpdate && IndexWriter.IsLocked(directory))
    {
        // Force-release an existing write lock, as the original code did.
        // NOTE(review): this can break a writer that is still active elsewhere;
        // confirm that only one writer is ever used at a time.
        IndexWriter.Unlock(directory);
    }

    // Third argument: create a new index only when none exists yet.
    IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    try
    {
        writer.DeleteDocuments(new Term(field, value));
    }
    finally
    {
        // Close the writer even when the delete fails, so the write lock is released.
        writer.Close();
    }
}
/// <summary>
/// Deletes all documents from the index.
/// </summary>
public void DelelteAll()
{
    FSDirectory directory = directory_luce;
    bool isUpdate = IndexReader.IndexExists(directory);
    if (isUpdate && IndexWriter.IsLocked(directory))
    {
        // Force-release an existing write lock, as the original code did.
        // NOTE(review): this can break a writer that is still active elsewhere;
        // confirm that only one writer is ever used at a time.
        IndexWriter.Unlock(directory);
    }

    // Third argument: create a new index only when none exists yet.
    IndexWriter writer = new IndexWriter(directory, analyzer, !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    try
    {
        writer.DeleteAll();
    }
    finally
    {
        // Close the writer even when the delete fails, so the write lock is released.
        writer.Close();
    }
}
#endregion
常用的Query类型:
Query 所有查询类的抽象基类
TermQuery 单个词条的搜索
BooleanQuery 组合多个子查询的搜索
RangeQuery 范围搜索
PrefixQuery 前缀搜索
PhraseQuery 多关键字(短语)的搜索
FuzzyQuery 相近词语的搜索