目前的查询方法过于简单,而且无法与实际业务中的实体建立关系,因此本篇文章就来描述对查询方法的扩展。
1.查询多个字段的检索方法
1.1.定义接口及输入输出项
查询输入项SingleSearchOption:
/// <summary>
/// Input for a keyword search across one or more index fields.
/// </summary>
public class SingleSearchOption : SearchOptionBase
{
    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Names of the index fields the search is restricted to.
    /// Starts out as an empty list (never null) so that consumers can enumerate it safely.
    /// </summary>
    public List<string> Fields { get; set; } = new List<string>();

    /// <summary>
    /// Creates a fully populated option object.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <exception cref="ArgumentException"><paramref name="keyword"/> is null, empty or whitespace.</exception>
    public SingleSearchOption(string keyword, List<string> fields, int maxHits = 100)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        Keyword = keyword;
        // A null list would otherwise only surface later, as a NullReferenceException
        // at the point where the field names are copied into an array for the query parser.
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;
    }

    /// <summary>
    /// Creates an empty option object whose properties are filled in afterwards.
    /// </summary>
    public SingleSearchOption()
    {
    }
}
其中SearchOptionBase:
/// <summary>
/// Base class for search options; holds the settings shared by every kind of search.
/// </summary>
public class SearchOptionBase : ISearchOption
{
    /// <summary>
    /// Maximum number of hits to retrieve.
    /// Defaults to 100, the same default the option constructors use for their maxHits
    /// parameter, so that an option built through a parameterless constructor or an
    /// object initializer does not reach the searcher with a hit limit of 0.
    /// </summary>
    public int MaxHits { get; set; } = 100;
}
输出结果SingleSearchResult:
/// <summary>
/// Output of a simple multi-field search.
/// </summary>
public class SingleSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; a new instance starts with an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
其中查询结果项SearchResultItem:
/// <summary>
/// One hit of a search: the score plus the information needed to locate the matching entity.
/// </summary>
public class SearchResultItem : ISearchResultItem
{
    /// <summary>Score assigned to this hit.</summary>
    public float Score { get; set; }

    /// <summary>Id of the entity this hit refers to.</summary>
    public string EntityId { get; set; }

    /// <summary>Class name of the entity this hit refers to.</summary>
    public string EntityName { get; set; }
}
1.2.方法实现
/// <summary>
/// Simple search: parses the keyword against every field named in the option and
/// returns the id and type name stored in each matching document.
/// </summary>
/// <param name="option">Keyword, fields to search and maximum number of hits.</param>
/// <returns>
/// The matched items (score, entity id, entity type name), the number of items returned
/// and the elapsed time in milliseconds.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="option"/> is null.</exception>
/// <exception cref="ArgumentException">The option's field list is null.</exception>
public SingleSearchResult SingleSearch(SingleSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }
    if (option.Fields == null)
    {
        throw new ArgumentException("检索域不能为空", nameof(option));
    }

    SingleSearchResult result = new SingleSearchResult();
    Stopwatch watch = Stopwatch.StartNew();
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        // Create the index searcher.
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer);
        Query query = queryParser.Parse(option.Keyword);
        var matches = searcher.Search(query, option.MaxHits).ScoreDocs;
        result.TotalHits = matches.Length;
        foreach (var match in matches)
        {
            var doc = searcher.Doc(match.Doc);
            // doc.Get yields null for a missing field instead of throwing, and is the
            // same accessor ScoredSearch uses for these two fields.
            result.Items.Add(new SearchResultItem
            {
                Score = match.Score,
                EntityId = doc.Get(CoreConstant.EntityId),
                EntityName = doc.Get(CoreConstant.EntityType)
            });
        }
    }
    watch.Stop();
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
其中,实体标识(CoreConstant.EntityId,对应结果项的EntityId)和实体类名(CoreConstant.EntityType,对应结果项的EntityName)这两个域是在创建索引时写入每个Document的,这样索引中的每个Document与数据库中的每条记录就可以通过Id互相定位。
2.可设置权重的检索方法
2.1.定义接口及输入输出项
输入项为:
/// <summary>
/// Input for a search in which each field can carry its own weight (boost).
/// </summary>
public class ScoredSearchOption : SearchOptionBase
{
    /// <summary>
    /// Boost given to a search field that has no explicitly configured weight.
    /// </summary>
    private const float DefaultBoost = 2.0f;

    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Names of the index fields the search is restricted to.
    /// </summary>
    public List<string> Fields { get; set; }

    /// <summary>
    /// Per-field search weights used for multi-field searches.
    /// </summary>
    private readonly Dictionary<string, float> _boosts;

    /// <summary>
    /// Per-field search weights used for multi-field searches.
    /// Before returning, every entry of <see cref="Fields"/> that has no weight yet
    /// (compared case-insensitively) is given the default weight.
    /// </summary>
    internal Dictionary<string, float> Boosts
    {
        get
        {
            if (Fields != null)
            {
                foreach (var field in Fields)
                {
                    if (field == null)
                    {
                        continue;
                    }

                    // Ordinal comparison: field names are identifiers, so the match must
                    // not depend on the current culture's casing rules.
                    bool hasBoost = _boosts.Keys.Any(key => string.Equals(key, field, StringComparison.OrdinalIgnoreCase));
                    if (!hasBoost)
                    {
                        _boosts.Add(field, DefaultBoost);
                    }
                }
            }
            return _boosts;
        }
    }

    /// <summary>
    /// Match threshold, 0-1; the larger the value, the more precise the results.
    /// </summary>
    public float Score { get; set; } = 0.5f;

    /// <summary>
    /// Filter condition applied to the search.
    /// </summary>
    public Filter Filter { get; set; }

    /// <summary>
    /// Creates a populated option object.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <param name="boosts">Initial per-field weights; copied, so the caller's dictionary is never modified.</param>
    /// <exception cref="ArgumentException"><paramref name="keyword"/> is null, empty or whitespace.</exception>
    public ScoredSearchOption(string keyword, List<string> fields, int maxHits = 100, Dictionary<string, float> boosts = null)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        Keyword = keyword;
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;
        // Copy instead of aliasing: the Boosts getter adds default entries, which would
        // otherwise silently change the dictionary the caller handed in.
        _boosts = boosts == null
            ? new Dictionary<string, float>()
            : new Dictionary<string, float>(boosts, boosts.Comparer);
    }

    /// <summary>
    /// Sets (or overwrites) the weight of a single field.
    /// </summary>
    /// <param name="field">Field name.</param>
    /// <param name="boost">Weight to assign to the field.</param>
    public void SetBoosts(string field, float boost)
    {
        _boosts[field] = boost;
    }
}
输出项为:
/// <summary>
/// Output of a weighted (scored) search.
/// </summary>
public class ScoredSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; a new instance starts with an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
2.2.方法实现
/// <summary>
/// Weighted search: splits the keyword into terms, combines them into one boolean
/// query with per-field boosts, and keeps only hits whose score reaches the option's threshold.
/// </summary>
/// <param name="option">Keyword, fields, boosts, filter, score threshold and maximum number of hits.</param>
/// <returns>
/// The hits that passed the score threshold, their count and the elapsed time in milliseconds.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="option"/> is null.</exception>
/// <exception cref="ArgumentException">The option's field list is null.</exception>
public ScoredSearchResult ScoredSearch(ScoredSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }
    if (option.Fields == null)
    {
        throw new ArgumentException("检索域不能为空", nameof(option));
    }

    ScoredSearchResult result = new ScoredSearchResult();
    Stopwatch watch = Stopwatch.StartNew(); // start the timer
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer, option.Boosts);
        var terms = Cut(option.Keyword); // split the keyword into terms
        Query query = QueryExpression(queryParser, terms); // assemble the combined query
        Sort sort = new Sort(SortField.FIELD_SCORE); // sort by score by default
        float minScore = option.Score;
        var hits = searcher.Search(query, option.Filter, option.MaxHits, sort, true, true).ScoreDocs;
        foreach (var match in hits)
        {
            // Plain comparison instead of compiling an expression tree on every call.
            // Written as ">=" so that a NaN score is rejected, exactly as before.
            if (match.Score >= minScore)
            {
                var doc = searcher.Doc(match.Doc);
                result.Items.Add(new SearchResultItem
                {
                    Score = match.Score,
                    EntityId = doc.Get(CoreConstant.EntityId),
                    EntityName = doc.Get(CoreConstant.EntityType)
                });
            }
        }
        // Every hit that passed the threshold was added above, so the list size is the
        // hit count; no second pass over the filtered sequence is needed.
        result.TotalHits = result.Items.Count;
    }
    watch.Stop(); // stop the timer
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
其中私有方法Cut用于关键词的分割:
/// <summary>
/// Splits a search keyword into at most ten distinct terms, longest first.
/// The keyword itself is always a candidate term. Quoted phrases and "-"-prefixed
/// exclusions are extracted first; Chinese runs, English words and letter+digit
/// groups are then extracted from what is left.
/// </summary>
/// <param name="keyword">Raw keyword as entered by the user.</param>
/// <returns>
/// The terms ordered by descending length; an empty list when <paramref name="keyword"/>
/// is null, empty or whitespace.
/// </returns>
private List<string> Cut(string keyword)
{
    if (string.IsNullOrWhiteSpace(keyword))
    {
        return new List<string>();
    }

    // The unmodified keyword is always the first candidate.
    List<string> result = new List<string> { keyword };
    if (keyword.Length <= 2)
    {
        // Too short to be worth splitting.
        return result;
    }

    // Common operator words: AND becomes '+', NOT becomes '-', OR becomes a space.
    string remaining = keyword.Replace("AND ", "+").Replace("NOT ", "-").Replace("OR ", " ");

    // Quoted phrases (must be contained). The lazy quantifier keeps each phrase separate;
    // a greedy ".+" would run from the first quote to the last one on the line.
    List<string> phrases = Regex.Matches(remaining, "\".+?\"").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string phrase in phrases)
    {
        remaining = remaining.Replace(phrase, "");
    }
    result.AddRange(phrases);

    // Exclusions (must not be contained). \S+ stops at the next whitespace so that the
    // words following an excluded term are not pulled into the exclusion.
    List<string> exclusions = Regex.Matches(remaining, @"\s-\S+").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string exclusion in exclusions)
    {
        remaining = remaining.Replace(exclusion, "");
    }
    result.AddRange(exclusions.Select(e => e.Trim()));

    // Runs of Chinese characters.
    result.AddRange(Regex.Matches(remaining, @"[\u4e00-\u9fa5]+").Cast<Match>().Select(m => m.Value));
    // English words, optionally with surrounding punctuation/symbols.
    result.AddRange(Regex.Matches(remaining, @"\p{P}?[A-Z]*[a-z]*[\p{P}|\p{S}]*").Cast<Match>().Select(m => m.Value));
    // Letters directly followed by digits: the whole match plus each part.
    // [A-Za-z] rather than [A-z], which also covers the six ASCII symbols between 'Z' and 'a'.
    result.AddRange(Regex.Matches(remaining, "([A-Za-z]+)([0-9.]+)").Cast<Match>().SelectMany(m => m.Groups.Cast<Group>().Select(g => g.Value)));

    // Drop empty and single-character fragments, then keep the ten longest distinct terms.
    result.RemoveAll(s => s.Length < 2);
    return result.Distinct().OrderByDescending(s => s.Length).Take(10).ToList();
}
私有方法QueryExpression用于查询语句的拼接:
/// <summary>
/// Combines the given terms into a single boolean query.
/// A term starting with a double quote must match, a term starting with '-' must not
/// match, and any other term is an optional fuzzy match.
/// </summary>
/// <param name="queryParser">Parser used to turn each term into a query.</param>
/// <param name="terms">Terms to combine.</param>
/// <returns>The combined boolean query.</returns>
private BooleanQuery QueryExpression(MultiFieldQueryParser queryParser, List<string> terms)
{
    BooleanQuery query = new BooleanQuery();
    foreach (var term in terms)
    {
        if (term.StartsWith("\"", StringComparison.Ordinal))
        {
            // Must match.
            query.Add(queryParser.Parse(term.Trim('"')), Occur.MUST);
        }
        else if (term.StartsWith("-", StringComparison.Ordinal))
        {
            // Must not match. The leading '-' is removed before parsing: parsed with it,
            // the term becomes a purely negative sub-query, which matches no documents,
            // so adding that as MUST_NOT would exclude nothing.
            string excluded = term.TrimStart('-');
            if (string.IsNullOrWhiteSpace(excluded))
            {
                continue;
            }
            query.Add(queryParser.Parse(excluded), Occur.MUST_NOT);
        }
        else
        {
            // May match; a single trailing '~' makes the term a fuzzy query.
            query.Add(queryParser.Parse(term.Replace("~", "") + "~"), Occur.SHOULD);
        }
    }
    return query;
}
3.测试示例
写一个示例方法对简单查询进行测试:
/// <summary>
/// Runs a simple index search and loads the entity behind every hit.
/// </summary>
/// <param name="option">Search options passed on to the search manager.</param>
/// <returns>One entity per hit, in the order of the hits.</returns>
public List<DataContent> SingleSearch(SingleSearchOption option)
{
    SingleSearchResult hits = _searchManager.SingleSearch(option);

    // Look up the entity for each hit by its id.
    return hits.Items.ConvertAll<DataContent>(hit => _repository.Get(hit.EntityId));
}
目前索引的查询和实体的查询并没有强关联:先查询索引得到实体Id,再根据Id逐条查询实体,所以实际上是查询了两次,后续会考虑根据业务需要将两者结合起来。