从零开始搭建.NET Core版搜索引擎(五)--查询方法扩展

19 篇文章 7 订阅
11 篇文章 4 订阅

目前的查询方法过于简单,而且无法与实际业务中的实体建立关系,因此本篇文章就来描述对查询方法的扩展。

1.查询多个字段的检索方法

1.1.定义接口及输入输出项

查询输入项SingleSearchOption:

/// <summary>
/// Input for a simple keyword search over one or more index fields.
/// </summary>
public class SingleSearchOption : SearchOptionBase
{
    /// <summary>
    /// Keyword (query text) to search for.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Index fields the search is restricted to.
    /// Initialized to an empty list so that an option created through the
    /// parameterless constructor does not expose a null collection.
    /// </summary>
    public List<string> Fields { get; set; } = new List<string>();

    /// <summary>
    /// Creates a fully populated option.
    /// </summary>
    /// <param name="keyword">Keyword to search for; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search in; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="keyword"/> is null, empty or whitespace.</exception>
    public SingleSearchOption(string keyword, List<string> fields, int maxHits = 100)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        Keyword = keyword;
        // A null list would make every consumer that calls Fields.ToArray() fail
        // with a NullReferenceException, so normalise it here.
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;
    }

    /// <summary>
    /// Creates an empty option whose properties are filled in afterwards.
    /// </summary>
    public SingleSearchOption()
    {
    }
}

其中SearchOptionBase:

/// <summary>
/// Base class shared by all search option types.
/// </summary>
public class SearchOptionBase : ISearchOption
{
    /// <summary>
    /// Maximum number of hits to retrieve.
    /// Defaults to 100, the same default the option constructors use, so an option
    /// created without an explicit value does not hand 0 to the searcher.
    /// NOTE(review): Lucene's top-docs collectors are expected to reject a
    /// non-positive hit count — confirm against the Lucene.NET version in use.
    /// </summary>
    public int MaxHits { get; set; } = 100;
}

输出结果SingleSearchResult:

/// <summary>
/// Result of a simple search: the matched items plus basic statistics.
/// </summary>
public class SingleSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; starts out as an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}

其中查询结果项SearchResultItem:

/// <summary>
/// One search hit: a score plus a reference to the indexed entity.
/// </summary>
public class SearchResultItem : ISearchResultItem
{
    /// <summary>Score assigned to this hit.</summary>
    public float Score { get; set; }

    /// <summary>Id of the entity behind this hit.</summary>
    public string EntityId { get; set; }

    /// <summary>Class name of the entity behind this hit.</summary>
    public string EntityName { get; set; }
}

1.2.方法实现

/// <summary>
/// Simple search: parses the keyword against every field listed in the option
/// and returns the scored entity references of the hits.
/// </summary>
/// <param name="option">Keyword, fields to search and maximum number of hits.</param>
/// <returns>The hits with their scores, the number of hits returned and the elapsed milliseconds.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="option"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <c>option.Fields</c> is null.</exception>
public SingleSearchResult SingleSearch(SingleSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }
    if (option.Fields == null)
    {
        // Fail with a clear message instead of a NullReferenceException on Fields.ToArray().
        throw new ArgumentException("检索域不能为空", nameof(option));
    }

    SingleSearchResult result = new SingleSearchResult();
    Stopwatch watch = Stopwatch.StartNew();
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        // Searcher bound to the reader opened for this call.
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer);
        Query query = queryParser.Parse(option.Keyword);
        var matches = searcher.Search(query, option.MaxHits).ScoreDocs;
        foreach (var match in matches)
        {
            var doc = searcher.Doc(match.Doc);
            result.Items.Add(new SearchResultItem
            {
                Score = match.Score,
                // doc.Get is the accessor ScoredSearch uses for the same fields;
                // unlike GetField(...).GetStringValue() it does not dereference
                // a missing field.
                EntityId = doc.Get(CoreConstant.EntityId),
                EntityName = doc.Get(CoreConstant.EntityType)
            });
        }
        // One item is added per returned hit, so this equals the number of matches.
        result.TotalHits = result.Items.Count;
    }
    watch.Stop();
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}

其中实体标识(域名为CoreConstant.EntityId)和实体类名(域名为CoreConstant.EntityType,对应结果项的EntityName属性)这两个域是在创建索引时写入Document的,这样可以确保每个Document和数据库中的每条记录都能通过Id互相找到。

在这里插入图片描述

2.可设置权重的检索方法

2.1.定义接口及输入输出项

输入项为:

/// <summary>
/// Input for a weighted search: keyword, target fields, per-field boosts,
/// a minimum score and an optional filter.
/// </summary>
public class ScoredSearchOption : SearchOptionBase
{
    /// <summary>
    /// Boost given to a field that has no explicit boost.
    /// </summary>
    private const float DefaultBoost = 2.0f;

    /// <summary>
    /// Keyword (query text) to search for.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Index fields the search is restricted to.
    /// </summary>
    public List<string> Fields { get; set; }

    /// <summary>
    /// Per-field boosts. This is a private copy, so the dictionary passed to the
    /// constructor is never modified by this class.
    /// </summary>
    private readonly Dictionary<string, float> _boosts;

    /// <summary>
    /// Per-field boosts for a multi-field search. Every entry of <see cref="Fields"/>
    /// that has no boost yet (field names compared case-insensitively) is given
    /// the default boost before the dictionary is returned.
    /// </summary>
    internal Dictionary<string, float> Boosts
    {
        get
        {
            if (Fields != null)
            {
                foreach (var field in Fields)
                {
                    if (field == null)
                    {
                        continue; // a null name cannot be used as a dictionary key
                    }

                    // Ordinal, case-insensitive comparison: field names are identifiers,
                    // so the result must not depend on the current culture.
                    bool alreadyBoosted = _boosts.Keys.Any(
                        key => string.Equals(key, field, StringComparison.OrdinalIgnoreCase));
                    if (!alreadyBoosted)
                    {
                        _boosts.Add(field, DefaultBoost);
                    }
                }
            }

            return _boosts;
        }
    }

    /// <summary>
    /// Minimum score a hit must reach, 0-1; a larger value gives more precise results.
    /// </summary>
    public float Score { get; set; } = 0.5f;

    /// <summary>
    /// Optional filter applied to the search.
    /// </summary>
    public Filter Filter { get; set; }

    /// <summary>
    /// Creates a weighted search option.
    /// </summary>
    /// <param name="keyword">Keyword to search for; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search in; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <param name="boosts">Optional initial per-field boosts; the entries are copied.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="keyword"/> is null, empty or whitespace.</exception>
    public ScoredSearchOption(string keyword, List<string> fields, int maxHits = 100, Dictionary<string, float> boosts = null)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        Keyword = keyword;
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;
        // Copy the entries (keeping the caller's key comparer) so that the default
        // boosts added by the Boosts getter never leak into the caller's dictionary.
        _boosts = boosts == null
            ? new Dictionary<string, float>()
            : new Dictionary<string, float>(boosts, boosts.Comparer);
    }

    /// <summary>
    /// Sets (or overwrites) the boost of a single field.
    /// </summary>
    /// <param name="field">Field name.</param>
    /// <param name="boost">Boost to apply to the field.</param>
    public void SetBoosts(string field, float boost)
    {
        _boosts[field] = boost;
    }
}

输出项为:

/// <summary>
/// Result of a weighted search: the matched items plus basic statistics.
/// </summary>
public class ScoredSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; starts out as an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}

2.2.方法实现

/// <summary>
/// Weighted search: splits the keyword into terms, builds a boolean query with
/// per-field boosts, sorts by score and keeps only the hits whose score reaches
/// <c>option.Score</c>.
/// </summary>
/// <param name="option">Keyword, fields, boosts, minimum score, filter and maximum number of hits.</param>
/// <returns>The accepted hits with their scores, their count and the elapsed milliseconds.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="option"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <c>option.Fields</c> is null.</exception>
public ScoredSearchResult ScoredSearch(ScoredSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }
    if (option.Fields == null)
    {
        // Fail with a clear message instead of a NullReferenceException further down.
        throw new ArgumentException("检索域不能为空", nameof(option));
    }

    ScoredSearchResult result = new ScoredSearchResult();
    Stopwatch watch = Stopwatch.StartNew(); // start timing

    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer, option.Boosts);
        var terms = Cut(option.Keyword);                    // split the keyword into terms
        Query query = QueryExpression(queryParser, terms);  // combine the terms into one query
        Sort sort = new Sort(SortField.FIELD_SCORE);        // sort by score
        var hits = searcher.Search(query, option.Filter, option.MaxHits, sort, true, true).ScoreDocs;
        float minimumScore = option.Score;

        // Filter in the same pass that builds the items: the hits are walked once and
        // no expression tree has to be compiled on every call.
        foreach (var hit in hits)
        {
            if (hit.Score >= minimumScore)
            {
                var doc = searcher.Doc(hit.Doc);
                result.Items.Add(new SearchResultItem
                {
                    Score = hit.Score,
                    EntityId = doc.Get(CoreConstant.EntityId),
                    EntityName = doc.Get(CoreConstant.EntityType)
                });
            }
        }
        // One item per accepted hit, so this is the number of hits that passed the filter.
        result.TotalHits = result.Items.Count;
    }
    watch.Stop(); // stop timing
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}

其中私有方法Cut用于关键词的分割:

/// <summary>
/// Splits a raw keyword into candidate search terms: the keyword itself, quoted
/// phrases, exclusions introduced by '-', runs of Chinese characters, English
/// words and letter+number combinations.
/// </summary>
/// <param name="keyword">Raw keyword as entered by the user.</param>
/// <returns>
/// At most ten distinct terms of two or more characters, longest first.
/// A keyword of two characters or fewer is returned as the only term;
/// a null keyword yields an empty list.
/// </returns>
private List<string> Cut(string keyword)
{
    if (keyword == null)
    {
        return new List<string>();
    }

    // The untouched keyword is always the first candidate.
    List<string> result = new List<string> { keyword };
    if (keyword.Length <= 2) // too short to split
    {
        return result;
    }

    // Map the common operators: AND -> '+', NOT -> '-', OR -> space.
    // The look-behind keeps words that merely end in those letters intact,
    // e.g. "BRAND new" must not become "BR+new".
    keyword = Regex.Replace(keyword, "(?<![A-Za-z])AND ", "+");
    keyword = Regex.Replace(keyword, "(?<![A-Za-z])NOT ", "-");
    keyword = Regex.Replace(keyword, "(?<![A-Za-z])OR ", " ");

    // Quoted phrases (must match). The lazy quantifier keeps "a" and "b" as two
    // phrases instead of one match running from the first quote to the last.
    List<string> phrases = Regex.Matches(keyword, "\".+?\"").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string phrase in phrases)
    {
        keyword = keyword.Replace(phrase, "");
        result.Add(phrase);
    }

    // Exclusions (must not match): whitespace, '-', then the rest of the text.
    // NOTE(review): ".+" is greedy, so everything after the first " -" becomes a
    // single exclusion term — confirm this is intended.
    List<string> exclusions = Regex.Matches(keyword, @"\s-.+\s?").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string exclusion in exclusions)
    {
        keyword = keyword.Replace(exclusion, "");
        result.Add(exclusion.Trim());
    }

    // Runs of Chinese characters.
    result.AddRange(Regex.Matches(keyword, @"[\u4e00-\u9fa5]+").Cast<Match>().Select(m => m.Value));
    // English words, optionally wrapped in punctuation or symbols.
    result.AddRange(Regex.Matches(keyword, @"\p{P}?[A-Z]*[a-z]*[\p{P}|\p{S}]*").Cast<Match>().Select(m => m.Value));
    // Letters followed by digits: the whole match plus both parts.
    // [A-Za-z] rather than [A-z], which would also match [ \ ] ^ _ and `.
    result.AddRange(Regex.Matches(keyword, "([A-Za-z]+)([0-9.]+)").Cast<Match>().SelectMany(m => m.Groups.Cast<Group>().Select(g => g.Value)));
    //result.AddRange(new JiebaSegmenter().Cut(keyword, true)); // Jieba word segmentation

    // Drop empty and one-character fragments, then keep the ten longest distinct terms.
    result.RemoveAll(s => s.Length < 2);
    return result.Distinct().OrderByDescending(s => s.Length).Take(10).ToList();
}

私有方法QueryExpression用于查询语句的拼接:

/// <summary>
/// Combines the split terms into one boolean query: quoted terms must match,
/// terms starting with '-' must not match, every other term may match and is
/// searched as a fuzzy term.
/// </summary>
/// <param name="queryParser">Parser used to turn each term into a query.</param>
/// <param name="terms">Terms to combine.</param>
/// <returns>The combined boolean query.</returns>
private BooleanQuery QueryExpression(MultiFieldQueryParser queryParser, List<string> terms)
{
    BooleanQuery query = new BooleanQuery();
    foreach (var term in terms)
    {
        if (term.StartsWith("\"", StringComparison.Ordinal))
        {
            // Must match.
            // NOTE(review): a term that starts with a quote but has more text after the
            // closing quote is parsed with an unbalanced quote — confirm the parser accepts it.
            query.Add(queryParser.Parse(term.Trim('"')), Occur.MUST);
        }
        else if (term.StartsWith("-", StringComparison.Ordinal))
        {
            // Must not match. The leading '-' is removed before parsing: parsed with it,
            // the sub-query consists only of a prohibited clause, matches no document,
            // and the MUST_NOT clause built from it would exclude nothing.
            string excluded = term.TrimStart('-').Trim();
            if (excluded.Length > 0)
            {
                query.Add(queryParser.Parse(excluded), Occur.MUST_NOT);
            }
        }
        else
        {
            // May match; exactly one trailing '~' turns the term into a fuzzy search.
            query.Add(queryParser.Parse(term.Replace("~", "") + "~"), Occur.SHOULD);
        }
    }
    return query;
}

3.测试示例

写一个示例方法对简单查询进行测试:

/// <summary>
/// Sample usage: runs a simple index search and loads the entity behind every hit.
/// </summary>
/// <param name="option">Options passed on to the search manager.</param>
/// <returns>One entity per hit, in hit order.</returns>
public List<DataContent> SingleSearch(SingleSearchOption option)
{
    SingleSearchResult hits = _searchManager.SingleSearch(option);

    // One repository lookup per hit, keyed by the entity id stored in the index.
    return hits.Items.ConvertAll<DataContent>(hit => _repository.Get(hit.EntityId));
}

项目地址:https://github.com/ludewig/Muyan.Search

目前索引的查询和实体的查询并没有强关联:先查询一次索引得到实体Id,再按Id逐条查询实体,所以实际上是分两步查询的,后续会考虑根据业务需要将两者结合起来。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值