准备工作
NuGet引入Lucene.Net包。
开始搭建
几个中文分词相关的类:ChineseAnalyzer、ChineseTokenizer、WordTree
使用的类
- ChineseAnalyzer 类
public class ChineseAnalyzer : Analyzer
{
    // Path of the noise-word (stop-word) file, relative to the working directory.
    private static string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";
    private string keywords = "";

    // Shared noise-word character tree: each key is a single character, each
    // value is the Hashtable of characters that may follow it.
    public static Hashtable chartable = new Hashtable();

    public ChineseAnalyzer(string keywords)
    {
        this.keywords = keywords;
    }

    /// <summary>
    /// Loads the noise-word file into <see cref="chartable"/> as a character tree.
    /// Already-present characters are skipped, so repeated calls are safe
    /// (though they re-read the file each time).
    /// </summary>
    private void GetNoise()
    {
        // 'using' guarantees the reader is closed even when reading throws;
        // the original leaked the stream on any exception.
        using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
        {
            string word = reader.ReadLine();
            while (word != null && word.Trim() != "")
            {
                Hashtable t_chartable = chartable;
                for (int i = 0; i < word.Length; i++)
                {
                    string char_s = word.Substring(i, 1);
                    if (!t_chartable.Contains(char_s))
                    {
                        t_chartable.Add(char_s, new Hashtable());
                    }
                    // Descend into the subtree for this character.
                    t_chartable = (Hashtable)t_chartable[char_s];
                }
                word = reader.ReadLine();
            }
        }
    }

    /// <summary>
    /// Builds the analysis chain: Chinese tokenizer -> standard filter ->
    /// lower-case filter -> stop filter driven by the noise table.
    /// </summary>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        GetNoise();
        TokenStream result = new ChineseTokenizer(reader, keywords);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, chartable);
        return result;
    }
}
- WordTree 类
public class WordTree
{
    // Path of the dictionary file, relative to the working directory.
    private static string DictPath = Environment.CurrentDirectory + "\\data\\sDict.txt";

    // Dictionary character tree: key = single character, value = Hashtable of
    // possible following characters.
    public static Hashtable chartable = new Hashtable();

    // Seconds spent loading the dictionary file (diagnostic only).
    public static double DictLoad_Span = 0;

    // Matches a single CJK unified ideograph.
    public string strChinese = "[\u4e00-\u9fa5]";

    // Keywords the current chartable was built from; used to skip rebuilds.
    private static string IsKw = "";

    /// <summary>
    /// Returns 0 when <paramref name="Char"/> is a Chinese character, otherwise -1.
    /// </summary>
    public int GetCharType(string Char)
    {
        if (new Regex(strChinese).IsMatch(Char))
            return 0;
        return -1;
    }

    /// <summary>
    /// (Re)builds the dictionary tree for <paramref name="keywords"/>.
    /// No-op when the tree was already built for the same keywords.
    /// </summary>
    public void LoadDict(string keywords)
    {
        if (IsKw == keywords) return;
        chartable = new Hashtable();
        BuidDictTree(keywords);
        IsKw = keywords;
    }

    // Builds chartable either from the dictionary file (no keywords supplied)
    // or from the space-separated keyword list.
    private void BuidDictTree(string keywords)
    {
        if (keywords.IsNull())
        {
            long dt_s = DateTime.Now.Ticks;
            // 'using' guarantees the reader is closed even on exceptions;
            // the original leaked the stream on any read failure.
            using (StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.UTF8))
            {
                string word = reader.ReadLine();
                while (word != null && word.Trim() != "")
                {
                    AddWord(word);
                    word = reader.ReadLine();
                }
            }
            DictLoad_Span = (double)(DateTime.Now.Ticks - dt_s) / (1000 * 10000);
            System.Console.Out.WriteLine("读取字典文件所用的时间: " + DictLoad_Span + "s");
        }
        else
        {
            try
            {
                // Split(' ') handles the no-space single-keyword case directly;
                // the original appended a trailing space to force a splittable string.
                foreach (var itemkw in keywords.Split(' '))
                {
                    if (!itemkw.IsNull())
                    {
                        AddWord(itemkw);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep the original swallow-on-error behavior, but
                // surface the problem instead of hiding it in an empty catch.
                System.Console.Error.WriteLine("BuidDictTree failed: " + ex);
            }
        }
    }

    // Inserts one word into chartable, one character level per tree depth.
    private void AddWord(string word)
    {
        Hashtable t_chartable = chartable;
        for (int i = 0; i < word.Length; i++)
        {
            string char_s = word.Substring(i, 1);
            if (!t_chartable.Contains(char_s))
            {
                t_chartable.Add(char_s, new Hashtable());
            }
            t_chartable = (Hashtable)t_chartable[char_s];
        }
    }
}
- ChineseTokenizer 类
class ChineseTokenizer : Tokenizer
{
    // offset: where the NEXT token scan starts; bufferIndex: start position of
    // the token currently being built; dataLen: total length of the input text.
    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private int start;
    private string text;
    public double TextSeg_Span = 0;
    private string keywords = "";

    public ChineseTokenizer(System.IO.TextReader reader, string keywords)
    {
        this.input = reader;
        text = input.ReadToEnd();   // whole input is buffered up front
        dataLen = text.Length;
        this.keywords = keywords;
    }

    /// <summary>
    /// Returns the next token by greedily walking the dictionary tree in
    /// <c>WordTree.chartable</c>, or null when the input is exhausted.
    /// A character absent from the tree root is emitted as a single-character token.
    /// </summary>
    public override Token Next()
    {
        WordTree tree = new WordTree();
        tree.LoadDict(keywords);   // no-op if already loaded for these keywords
        Hashtable t_chartable = WordTree.chartable;
        string ReWord = "";
        string char_s;
        start = offset;
        bufferIndex = start;
        // NOTE(review): bufferIndex is fixed before whitespace is skipped, so a
        // token preceded by whitespace reports the whitespace position — confirm
        // whether callers rely on offsets.
        while (true)
        {
            if (start >= dataLen)
            {
                break;   // input exhausted
            }
            char_s = text.Substring(start, 1);
            if (string.IsNullOrEmpty(char_s.Trim()))
            {
                start++;   // skip whitespace
                continue;
            }
            if (!t_chartable.Contains(char_s))
            {
                // Character cannot extend the current dictionary match.
                if (ReWord == "")
                {
                    // Nothing matched yet: emit this character alone. The original
                    // switched on GetCharType here, but both branches appended the
                    // character identically, so the switch was a no-op.
                    ReWord += char_s;
                    offset = start + 1;   // next scan starts after this character
                }
                else
                {
                    offset = start;       // re-scan this character on the next call
                }
                // NOTE(review): end offset is inclusive here; Lucene conventionally
                // uses an exclusive end offset — confirm downstream expectations.
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
            // Character is in the tree: extend the match and descend one level.
            ReWord += char_s;
            t_chartable = (Hashtable)t_chartable[char_s];
            start++;
            if (start == dataLen)
            {
                offset = dataLen;
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
        }
        return null;
    }
}
代码调用
/// <summary>
/// Segments the field <c>docstr</c> using <see cref="TextFc.Tool.ChineseAnalyzer"/>
/// and returns the resulting terms.
/// </summary>
/// <param name="kws">Space-separated keywords used to build the dictionary tree.</param>
/// <returns>The list of segmented terms (empty when the text produces no tokens).</returns>
private List<string> CutWord(string kws)
{
    List<string> ListKws = new List<string>();
    Analyzer analyzer = new TextFc.Tool.ChineseAnalyzer(kws);
    // NOTE(review): docstr is a field defined elsewhere in this class — confirm
    // it is populated before CutWord is called.
    StringReader sr = new StringReader(docstr);
    TokenStream stream = analyzer.TokenStream(null, sr);
    Lucene.Net.Analysis.Token t = stream.Next();
    while (t != null)
    {
        // Token.ToString() looks like "(term,start,end)": strip the opening
        // paren and take the term before the first comma.
        string tstr = t.ToString().Replace("(", "").Split(',')[0];
        // BUG FIX: the original line was the incomplete statement "tstr" and
        // never added the term, so the method always returned an empty list.
        ListKws.Add(tstr);
        t = stream.Next();
    }
    return ListKws;
}