C# Lucene简单使用

准备工作

NuGet引入Lucene.Net包。

开始搭建

几个中文分词的类:ChineseAnalyzer、ChineseTokenizer、WordTree

使用的类

  • ChineseAnalyzer 类
public class ChineseAnalyzer : Analyzer
    {
        // Path of the stop-word ("noise") file: one word per line, UTF-8 encoded.
        private static string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        // Space-separated keywords forwarded to the tokenizer's dictionary builder.
        private string keywords = "";

        public ChineseAnalyzer(string keywords)
        {
            this.keywords = keywords;
        }

        // Character trie of noise words, shared by every analyzer instance:
        // each key is a single character, each value is the Hashtable of successors.
        public static Hashtable chartable = new Hashtable();

        /// <summary>
        /// Loads the noise/stop-word file into the shared character trie.
        /// Each line of the file is one word; loading stops at the first blank line.
        /// </summary>
        private void GetNoise()
        {
            // BUG FIX: the original called reader.Close() outside any finally block,
            // leaking the file handle when an I/O error occurred mid-read.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
            {
                string word = reader.ReadLine();
                while (word != null && word.Trim() != "")
                {
                    // Walk/extend the trie one character at a time.
                    Hashtable t_chartable = chartable;
                    for (int i = 0; i < word.Length; i++)
                    {
                        string char_s = word.Substring(i, 1);
                        if (!t_chartable.Contains(char_s))
                        {
                            t_chartable.Add(char_s, new Hashtable());
                        }
                        t_chartable = (Hashtable)t_chartable[char_s];
                    }
                    word = reader.ReadLine();
                }
            }
        }

        /// <summary>
        /// Builds the analysis chain: Chinese tokenizer, then standard filter,
        /// lower-casing, and stop-word removal against the noise trie.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            // NOTE(review): re-reads the noise file on every call and re-inserts into
            // the static trie — idempotent but wasteful; candidate for one-time init.
            GetNoise();
            TokenStream result = new ChineseTokenizer(reader, keywords);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, chartable);
            return result;
        }

    }
  • WordTree 类

public class WordTree
    {
        // Path of the dictionary file: one word per line, UTF-8 encoded.
        private static string DictPath = Environment.CurrentDirectory + "\\data\\sDict.txt";

        // Character trie shared by all instances: each key is a single character,
        // each value is the Hashtable of possible following characters.
        public static Hashtable chartable = new Hashtable();

        // Kept for source compatibility; never set by the current code.
        private static bool DictLoaded = false;

        // Seconds spent loading the dictionary file (diagnostic only).
        public static double DictLoad_Span = 0;

        // Matches a single CJK unified ideograph.
        public string strChinese = "[\u4e00-\u9fa5]";

        // PERF: the original compiled a new Regex on every GetCharType call;
        // the pattern is constant, so compile it once.
        private static readonly Regex ChineseRegex = new Regex("[\u4e00-\u9fa5]");

        /// <summary>
        /// Returns 0 when <paramref name="Char"/> is a Chinese character, otherwise -1.
        /// </summary>
        public int GetCharType(string Char)
        {
            return ChineseRegex.IsMatch(Char) ? 0 : -1;
        }

        // Keywords the current chartable was built from; lets LoadDict skip a rebuild.
        private static string IsKw = "";

        /// <summary>
        /// (Re)builds the dictionary trie for the given keywords, skipping the
        /// rebuild when the same keywords were loaded by the previous call.
        /// </summary>
        public void LoadDict(string keywords)
        {
            if (IsKw == keywords) return;
            chartable = new Hashtable();
            BuildDictTree(keywords);
            IsKw = keywords;
        }

        /// <summary>
        /// Builds the character trie either from the dictionary file (when no
        /// keywords are supplied) or from the space-separated keyword list.
        /// </summary>
        private void BuildDictTree(string keywords)
        {
            if (keywords.IsNull())
            {
                long dt_s = DateTime.Now.Ticks;
                // BUG FIX: the original leaked the StreamReader on an I/O error;
                // `using` guarantees disposal.
                using (StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.UTF8))
                {
                    string word = reader.ReadLine();
                    // Loading stops at the first blank line, matching the file format.
                    while (word != null && word.Trim() != "")
                    {
                        AddWord(word);
                        word = reader.ReadLine();
                    }
                }
                DictLoad_Span = (double)(DateTime.Now.Ticks - dt_s) / (1000 * 10000);
                System.Console.Out.WriteLine("读取字典文件所用的时间: " + DictLoad_Span + "s");
            }
            else
            {
                try
                {
                    // Split(' ') instead of Split(" "): the string-argument overload
                    // does not exist on .NET Framework, and the char form behaves
                    // identically for a single space separator. Empty entries from
                    // consecutive spaces are filtered by the IsNull() check below.
                    foreach (var itemkw in keywords.Split(' '))
                    {
                        if (!itemkw.IsNull())
                        {
                            AddWord(itemkw);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // BUG FIX: the original swallowed exceptions silently; keyword
                    // loading stays best-effort, but failures are now reported.
                    System.Console.Out.WriteLine("构建关键词字典树失败: " + ex.Message);
                }
            }
        }

        /// <summary>Inserts one word into the shared character trie, one character per level.</summary>
        private static void AddWord(string word)
        {
            Hashtable t_chartable = chartable;
            for (int i = 0; i < word.Length; i++)
            {
                string char_s = word.Substring(i, 1);
                if (!t_chartable.Contains(char_s))
                {
                    t_chartable.Add(char_s, new Hashtable());
                }
                t_chartable = (Hashtable)t_chartable[char_s];
            }
        }

    }

  • ChineseTokenizer 类

class ChineseTokenizer : Tokenizer
    {

        private int offset = 0, bufferIndex = 0, dataLen = 0;// scan offset, start index of current token, total text length

        // Start position of the scan for the current Next() call.
        private int start;

        // The entire input text, read eagerly in the constructor.
        private string text;

        // Diagnostic field; never written by the visible code.
        public double TextSeg_Span = 0;

        // Space-separated keywords used to (re)build the shared WordTree trie.
        private string keywords = "";

        /// <summary>
        /// Reads the whole input into memory and remembers the keywords that
        /// drive dictionary loading on each Next() call.
        /// </summary>
        public ChineseTokenizer(System.IO.TextReader reader, string keywords)
        {
            this.input = reader;
            text = input.ReadToEnd();
            dataLen = text.Length;
            this.keywords = keywords;
        }

        /// <summary>
        /// Returns the next token by greedily walking the shared character trie,
        /// or null when the input is exhausted.
        /// NOTE(review): if a multi-character trie walk fails partway, the consumed
        /// characters are emitted as one token without re-scanning shorter prefixes —
        /// confirm this greedy-no-backtrack behavior is intended.
        /// </summary>
        public override Token Next()
        {
            Token token = null;
            WordTree tree = new WordTree();

            // LoadDict is a no-op when the keywords match the previous load.
            tree.LoadDict(keywords);

            Hashtable t_chartable = WordTree.chartable;
            string ReWord = "";
            string char_s;
            start = offset;
            bufferIndex = start;

            while (true)
            {
                if (start >= dataLen)
                {
                    break;
                }
                char_s = text.Substring(start, 1);
                // Skip whitespace between tokens.
                if (string.IsNullOrEmpty(char_s.Trim()))
                {
                    start++;
                    continue;
                }
                // Current character is not a continuation in the trie.
                if (!t_chartable.Contains(char_s))
                {
                    if (ReWord == "")
                    {
                        int j = start + 1;
                        switch (tree.GetCharType(char_s))
                        {
                            case 0:// single Chinese character
                                ReWord += char_s;
                                break;
                            default:
                                ReWord += char_s;// any other single character
                                break;
                        }

                        offset = j;// next call resumes after this character
                    }
                    else
                    {
                        offset = start;// next call resumes at the unmatched character
                    }

                    // Emit the accumulated token with inclusive start/end offsets.
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
                // Character is in the dictionary: extend the current token.
                ReWord += char_s;
                // Descend into the subtree for this character.
                t_chartable = (Hashtable)t_chartable[char_s];
                // Advance the scan position.
                start++;
                if (start == dataLen)
                {
                    offset = dataLen;
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
            }
            // Input exhausted: null signals end-of-stream to the filter chain.
            return token;
        }

    }

代码调用

		/// 使用方法
        /// <summary>
        /// 处理CUT
        /// </summary>
        /// <summary>
        /// Segments the document text (docstr) with ChineseAnalyzer and returns
        /// the list of segmented words.
        /// </summary>
        /// <param name="kws">Space-separated keywords used to build the dictionary trie.</param>
        /// <returns>The segmented words, in order; empty when no token is produced.</returns>
        private List<string> CutWord(string kws)
        {
            List<string> ListKws = new List<string>();
            Analyzer analyzer = new TextFc.Tool.ChineseAnalyzer(kws);
            // `using` disposes the reader; the original left it open.
            using (StringReader sr = new StringReader(docstr))
            {
                TokenStream stream = analyzer.TokenStream(null, sr);
                Lucene.Net.Analysis.Token t = stream.Next();
                while (t != null)
                {
                    // Token.ToString() renders as "(word,start,end)" — keep only the word.
                    // Split(',') rather than Split(",") for .NET Framework compatibility.
                    string tstr = t.ToString().Replace("(", "").Split(',')[0];
                    // BUG FIX: the original had a syntax error here (`tstr //分词结果`)
                    // and never added any result, so the method always returned [].
                    ListKws.Add(tstr);
                    t = stream.Next();
                }
            }
            return ListKws;
        }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值