Lucene.Net C#分词操作帮助类

5 篇文章 0 订阅
5 篇文章 0 订阅
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace ReptileJob.Functions
{
    /// <summary>
    /// 分词操作类
    /// </summary>
    public class FenCi
    {
        /// <summary>
        /// Path of the directory that holds the Lucene index files.
        /// </summary>
        protected string IndexDic => "D://Fenci/SuoYin/IndexDic";

        /// <summary>
        /// Adds one document to the index stored under <see cref="IndexDic"/>, creating the
        /// directory and a fresh index when none exists yet.
        /// </summary>
        /// <remarks>
        /// Every argument except <paramref name="dataId"/> is replaced by an empty string when null.
        /// Title and DataContent are indexed as analyzed fields (the writer uses PanGuAnalyzer);
        /// all other fields are stored and indexed without analysis. The current local time is
        /// written to the AddTime field. The index is optimized and committed after the document
        /// is added.
        /// </remarks>
        /// <param name="dataId">Identifier of the source record; stored in the DataId field.</param>
        /// <param name="dataTitle">Title text; stored in the Title field.</param>
        /// <param name="dataContent">Body text; stored in the DataContent field.</param>
        /// <param name="code">Business code such as SRB (the "business" value supplied by the caller); stored in the Code field.</param>
        /// <param name="tableName">Name of the source table; stored in the TableName field.</param>
        /// <param name="link">Link of the record; stored in the Link field.</param>
        /// <param name="source">Source of the record; stored in the Source field.</param>
        /// <param name="catalog">Catalog value; stored in the Catalog field.</param>
        /// <param name="type">1 = group, 2 = business system; stored in the Type field.</param>
        /// <param name="companyNo">Company number, e.g. 001002; stored in the CompanyNo field.</param>
        /// <param name="picUrl">Optional picture URL; stored in the PicUrl field.</param>
        /// <param name="nopop">Optional value stored in the NoPop field.</param>
        public void CreateIndex(
            string dataId,
            string dataTitle,
            string dataContent,
            string code,
            string tableName,
            string link,
            string source,
            string catalog,
            string type,
            string companyNo,
            string picUrl = null,
            string nopop = null)
        {
            // Each value falls back to "" on its own variable. Previously several of these
            // guards reset dataTitle instead, leaving the checked value null.
            dataTitle = dataTitle ?? "";
            dataContent = dataContent ?? "";
            code = code ?? "";
            tableName = tableName ?? "";
            link = link ?? "";
            source = source ?? "";
            catalog = catalog ?? "";
            type = type ?? "";
            companyNo = companyNo ?? "";
            picUrl = picUrl ?? "";
            nopop = nopop ?? "";
            // NOTE(review): dataId is passed through unchanged; presumably Field rejects a
            // null value - confirm whether callers can ever pass null here.

            // Make sure the index directory exists before touching it.
            if (!System.IO.Directory.Exists(IndexDic))
            {
                System.IO.Directory.CreateDirectory(IndexDic);
            }

            // A lock can be left behind when a previous indexing run exited abnormally.
            // Lucene.Net locks the index before writing and unlocks it on close.
            if (IndexWriter.IsLocked(IndexDic))
            {
                // Open the real index directory (the IndexDic property, not the literal "IndexDic").
                Lucene.Net.Store.Directory lockedDirectory = FSDirectory.GetDirectory(IndexDic);
                try
                {
                    IndexWriter.Unlock(lockedDirectory);
                }
                finally
                {
                    lockedDirectory.Close();
                }
            }

            // Create a new index only when no segments file is present yet.
            bool isCreated = !File.Exists(IndexDic + "/segments.gen");

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), isCreated, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                Document doc = new Document();
                doc.Add(new Field("DataId", dataId, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Title", dataTitle, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("DataContent", dataContent, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("AddTime", DateTime.Now.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Code", code, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("TableName", tableName, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Link", link, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Source", source, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Catalog", catalog, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("CompanyNo", companyNo, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("PicUrl", picUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("NoPop", nopop, Field.Store.YES, Field.Index.NOT_ANALYZED));
                writer.AddDocument(doc);

                writer.Optimize();
                writer.Commit();
            }
            finally
            {
                // Always close the writer so the index lock is released; exceptions
                // propagate with their original stack trace.
                writer.Close();
            }
        } // end of CreateIndex

        /// <summary>
        /// Runs a keyword search against the DataContent field and returns one page of results.
        /// </summary>
        /// <remarks>
        /// Creates the index directory when it is missing. When no index file exists or
        /// <paramref name="keyword"/> is null or empty, no search is run and an empty list is returned.
        /// Results are sorted by the Catalog field (string sort, not reversed).
        /// </remarks>
        /// <param name="keyword">Space-separated terms; each non-empty term is parsed with QueryParser and added as a MUST clause, so all terms have to match.</param>
        /// <param name="pageIndex">1-based page number.</param>
        /// <param name="pageSize">Number of results per page.</param>
        /// <param name="totalCount">Receives the total number of hits, or 0 when no search is run.</param>
        /// <param name="catalog">Optional comma-separated list of catalog values; of the known catalogs "1".."4", those not listed are excluded from the results.</param>
        /// <returns>The results of the requested page with keywords highlighted in title and content; never null.</returns>
        public List<SearchResult> Search(string keyword, int pageIndex, int pageSize, out int totalCount, string catalog = null)
        {
            totalCount = 0;
            List<SearchResult> searchResults = new List<SearchResult>();

            // Make sure the index directory exists.
            if (!System.IO.Directory.Exists(IndexDic))
            {
                System.IO.Directory.CreateDirectory(IndexDic);
            }

            // Nothing to search without an index or without a keyword.
            if (!File.Exists(IndexDic + "/segments.gen") || string.IsNullOrEmpty(keyword))
            {
                return searchResults;
            }

            // One parser serves every keyword; MUST means all keywords have to match
            // (SHOULD would accept a hit on any single keyword).
            BooleanQuery boolQuery = new BooleanQuery();
            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "DataContent", new PanGuAnalyzer());
            foreach (string keywordItem in keyword.Split(' '))
            {
                if (!string.IsNullOrEmpty(keywordItem))
                {
                    boolQuery.Add(parser.Parse(keywordItem), BooleanClause.Occur.MUST);
                }
            }

            // Exclude every known catalog that the caller did not ask for.
            if (!string.IsNullOrEmpty(catalog))
            {
                string[] requestedCatalogs = catalog.Split(',');
                foreach (string knownCatalog in new[] { "1", "2", "3", "4" })
                {
                    if (!requestedCatalogs.Contains(knownCatalog))
                    {
                        boolQuery.Add(new TermQuery(new Term("Catalog", knownCatalog)), BooleanClause.Occur.MUST_NOT);
                    }
                }
            }

            Sort sort = new Sort(new SortField("Catalog", SortField.STRING_VAL, false));
            IndexSearcher searcher = new IndexSearcher(IndexDic, true);
            try
            {
                // Fetch the top pageIndex * pageSize hits; the requested page is the tail of that window.
                TopDocs docs = searcher.Search(boolQuery, null, pageIndex * pageSize, sort);
                totalCount = docs.totalHits;

                int firstHit = Math.Max(0, (pageIndex - 1) * pageSize);
                int endHit = Math.Min(pageIndex * pageSize, docs.scoreDocs.Length);
                for (int i = firstHit; i < endHit; i++)
                {
                    Document doc = searcher.Doc(docs.scoreDocs[i].doc);
                    searchResults.Add(ToSearchResult(doc, keyword));
                }
            }
            finally
            {
                // Release the searcher even when the search or the mapping throws.
                searcher.Close();
            }

            return searchResults;
        } // end of Search

        /// <summary>
        /// Copies the stored fields of an index document into a <see cref="SearchResult"/>
        /// and highlights the keywords in its content and title.
        /// </summary>
        /// <param name="doc">Document loaded from the index.</param>
        /// <param name="keyword">The original space-separated keyword string.</param>
        /// <returns>The populated result. AddTime, Type and Catalog keep their default value when the stored text is empty or cannot be parsed.</returns>
        private SearchResult ToSearchResult(Document doc, string keyword)
        {
            const string highlightBefore = "<font style=\"color: red; font - family:\'Cambria\';\"><b>";
            const string highlightAfter = "</b></font>";

            SearchResult searchResult = new SearchResult();
            searchResult.Id = doc.Get("DataId");
            searchResult.Title = doc.Get("Title");
            searchResult.Content = doc.Get("DataContent");
            searchResult.Code = doc.Get("Code");
            searchResult.TableName = doc.Get("TableName");
            searchResult.Link = doc.Get("Link");
            searchResult.Source = doc.Get("Source");
            searchResult.CompanyNo = doc.Get("CompanyNo");
            searchResult.PicUrl = doc.Get("PicUrl");
            searchResult.NoPop = doc.Get("NoPop");

            // TryParse keeps one malformed stored value from aborting the whole page.
            DateTime addTime;
            if (DateTime.TryParse(doc.Get("AddTime"), out addTime))
            {
                searchResult.AddTime = addTime;
            }

            int type;
            if (int.TryParse(doc.Get("Type"), out type))
            {
                searchResult.Type = type;
            }

            int catalogValue;
            if (int.TryParse(doc.Get("Catalog"), out catalogValue))
            {
                searchResult.Catalog = catalogValue;
            }

            // Highlight only fields that are actually present on the document.
            if (searchResult.Content != null)
            {
                searchResult.Content = SimpleHighLighter(searchResult.Content, keyword, highlightBefore, highlightAfter);
            }

            if (searchResult.Title != null)
            {
                searchResult.Title = SimpleHighLighter(searchResult.Title, keyword, highlightBefore, highlightAfter);
            }

            return searchResult;
        }

        /// <summary>
        /// Deletes every indexed document whose DataId equals <paramref name="id"/> and whose
        /// Code equals <paramref name="business"/>. Used for de-duplication when second-level
        /// units call the interface. Does nothing when no index file exists.
        /// </summary>
        /// <param name="id">Value to match against the DataId field.</param>
        /// <param name="business">Value to match against the Code field.</param>
        public void Delete2Company(string id, string business)
        {
            // No index yet means there is nothing to delete.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                // Both terms are required: same record id AND same business code.
                BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
                boolQuery.Add(new TermQuery(new Term("Code", business)), BooleanClause.Occur.MUST);
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Close the writer in all cases; exceptions propagate with their original stack trace.
                writer.Close();
            }
        } // end of Delete2Company

        /// <summary>
        /// De-duplicates business data: deletes every indexed document whose DataId equals
        /// <paramref name="id"/> and whose TableName equals <paramref name="tableName"/>.
        /// Does nothing when no index file exists.
        /// </summary>
        /// <param name="id">Value to match against the DataId field.</param>
        /// <param name="tableName">Value to match against the TableName field.</param>
        public void Delete(string id, string tableName)
        {
            // No index yet means there is nothing to delete.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                // Both terms are required: same record id AND same source table.
                BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
                boolQuery.Add(new TermQuery(new Term("TableName", tableName)), BooleanClause.Occur.MUST);
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Close the writer in all cases; exceptions propagate with their original stack trace.
                writer.Close();
            }
        } // end of Delete(id, tableName)

        /// <summary>
        /// De-duplication by link: deletes every indexed document whose DataId equals
        /// <paramref name="id"/> and whose Link equals <paramref name="link"/>.
        /// Does nothing when no index file exists.
        /// </summary>
        /// <param name="id">Value to match against the DataId field.</param>
        /// <param name="link">Value to match against the Link field.</param>
        /// <param name="flag">
        /// Not read by this method; it only distinguishes this overload from
        /// Delete(string id, string tableName). A call with just two string arguments binds
        /// to that other overload, so pass this argument explicitly to delete by link.
        /// </param>
        public void Delete(string id, string link, bool flag = false)
        {
            // No index yet means there is nothing to delete.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                // Both terms are required: same record id AND same link.
                BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
                boolQuery.Add(new TermQuery(new Term("Link", link)), BooleanClause.Occur.MUST);
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Close the writer in all cases; exceptions propagate with their original stack trace.
                writer.Close();
            }
        } // end of Delete(id, link, flag)


        /// <summary>
        /// Highlights keywords in a text by wrapping every occurrence of each keyword
        /// between <paramref name="p_Before"/> and <paramref name="p_After"/>.
        /// </summary>
        /// <remarks>
        /// All keywords are replaced in a single pass over the original text, so a keyword can
        /// never match inside markup inserted for another keyword, and a keyword listed twice
        /// is wrapped only once. Matching is case-sensitive.
        /// </remarks>
        /// <param name="p_Body">Text to highlight. Returned unchanged when null or empty.</param>
        /// <param name="p_KeyWords">Space-separated keywords. When null or empty the body is returned unchanged.</param>
        /// <param name="p_Before">Text inserted before each match.</param>
        /// <param name="p_After">Text inserted after each match.</param>
        /// <param name="p_MaxLength">Currently not used by this method; kept so existing callers still compile.</param>
        /// <returns>The body with every keyword occurrence wrapped in the given markers.</returns>
        public string SimpleHighLighter(string p_Body, string p_KeyWords, string p_Before,
            string p_After, int p_MaxLength=0)
        {
            if (string.IsNullOrEmpty(p_Body) || string.IsNullOrEmpty(p_KeyWords))
            {
                return p_Body;
            }

            // Build one alternation pattern from the distinct, non-empty keywords.
            // Keywords are escaped so they match literally, and their order is kept so
            // earlier keywords win when two of them could match at the same position.
            List<string> escapedKeywords = new List<string>();
            foreach (string keyWord in p_KeyWords.Trim().Split(' '))
            {
                if (string.IsNullOrEmpty(keyWord))
                {
                    continue;
                }

                string escaped = System.Text.RegularExpressions.Regex.Escape(keyWord);
                if (!escapedKeywords.Contains(escaped))
                {
                    escapedKeywords.Add(escaped);
                }
            }

            if (escapedKeywords.Count == 0)
            {
                return p_Body;
            }

            string pattern = string.Join("|", escapedKeywords.ToArray());

            // A match evaluator (instead of a replacement string) keeps '$' in the markers literal.
            return System.Text.RegularExpressions.Regex.Replace(
                p_Body,
                pattern,
                match => p_Before + match.Value + p_After);
        }


        /// <summary>
        /// Splits user input into words using the PanGu analyzer.
        /// </summary>
        /// <param name="str">Text to segment. A null value yields an empty list.</param>
        /// <returns>The term text of every token produced by the analyzer, in order; never null.</returns>
        public List<string> GetPanGuWord(string str)
        {
            List<string> words = new List<string>();
            if (str == null)
            {
                // StringReader cannot be built from null; there is nothing to segment.
                return words;
            }

            Analyzer analyzer = new PanGuAnalyzer();
            using (StringReader reader = new StringReader(str))
            {
                TokenStream tokenStream = analyzer.TokenStream("", reader);
                try
                {
                    Lucene.Net.Analysis.Token token;
                    while ((token = tokenStream.Next()) != null)
                    {
                        words.Add(token.TermText());
                    }
                }
                finally
                {
                    // Release the token stream even when tokenizing throws.
                    tokenStream.Close();
                }
            }

            return words;
        }
    }

    /// <summary>
    /// One search hit as returned by FenCi.Search; each property mirrors a stored index field.
    /// </summary>
    public class SearchResult
    {
        /// <summary>Value of the stored NoPop field.</summary>
        public string NoPop { get; set; }
        /// <summary>Value of the stored PicUrl field.</summary>
        public string PicUrl { get; set; }

        /// <summary>Value of the stored DataId field.</summary>
        public string Id { get; set; }

        /// <summary>Value of the stored Title field; FenCi.Search wraps matched keywords in highlight markup.</summary>
        public string Title { get; set; }

        /// <summary>Value of the stored DataContent field; FenCi.Search wraps matched keywords in highlight markup.</summary>
        public string Content { get; set; }

        /// <summary>Time parsed from the stored AddTime field, which is written when the document is indexed.</summary>
        public DateTime AddTime { get; set; }

        /// <summary>Value of the stored Code field (the business code passed to CreateIndex).</summary>
        public string Code { get; set; }

        /// <summary>Value of the stored TableName field.</summary>
        public string TableName { get; set; }

        /// <summary>Value of the stored Link field.</summary>
        public string Link { get; set; }

        /// <summary>Value of the stored Source field.</summary>
        public string Source { get; set; }

        /// <summary>Integer parsed from the stored Type field (1 = group, 2 = business system per CreateIndex).</summary>
        public int Type { get; set; }

        /// <summary>Integer parsed from the stored Catalog field.</summary>
        public int Catalog { get; set; }

        /// <summary>Value of the stored CompanyNo field.</summary>
        public string CompanyNo { get; set; }
    }

 

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
ShootSearch 中文分词组件(c#开源) 技术支持:support@shootsoft.net http://www.shootsoft.net 分词算法:词库+最大匹配 词库来自www.jesoft.cn,已经过一定的修改 使用说明: 先加载词库,再进行分词.分词过程重载两次:支持回车的和不支持回车的,注意选择! 可以手动添加词语到sDict.txt,不过每次手动修改后需要调用SortDic()方法,否则无法实现最大匹配! sDict.txt的编码为UTF-8! 示例: 文字内容来自:http://tech.tom.com/2006-08-09/04B5/34545343.html using ShootSeg; ... Segment seg = new Segment(); seg.InitWordDics(); string str="日前,奇虎董事长周鸿祎新推出了一款反流氓软件“360安全卫士”,并将雅虎中国3721网络实名定义为流氓软件。此举引起了雅虎员工的强烈不满,甚至有就职于雅虎的原3721员工声称将起诉周鸿祎。围绕着3721这个产品,引发了一场雅虎中国与奇虎之间的战争。"; seg.Separator = "/"; Console.WriteLine(seg.SegmentText(str.Text,true)); 日前/,/奇虎/董事长/周鸿祎/新/推出/了/一款/反/流氓/软件/“/360/安全/卫士/”/,/并将/雅虎/中国/3721/网络实名/定义/为/流氓/软件/。/此举/引起/了/雅虎/员工/的/强烈不满/,/甚至有/就职/于/雅虎/的/原/3721员/工/声称/将/起诉/周鸿祎/。/围绕着/3721/这个/产品/,/引发/了/一场/雅虎/中国/与/奇虎/之间/的/战争/。/ 不加人名识别效果如下: 日前/,/奇/虎/董事长/周/鸿/祎/新/推出/了/一款/反/流氓/软件/“/360/安全/卫士/”/,/并将/雅虎/中国/3721/网络实名/定义/为/流氓/软件/。/此举/引起/了/雅虎/员工/的/强烈不满/,/甚至有/就职/于/雅虎/的/原/3721员/工/声称/将/起诉/周/鸿/祎/。/围绕着/3721/这个/产品/,/引发/了/一场/雅虎/中国/与/奇/虎/之间/的/战争/。/ 2006-8-9----1.0 beta 060809 支持英文、数字、中文(简体)混合分词 常用的数量和人名的匹配 超过22万词的词库整理 实现正向最大匹配算法 智能数字,日期,人名识别
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值