C#使用Lucene中文分词

最新推荐文章于 2024-04-13 13:34:22 发布

佳楠丶

最新推荐文章于 2024-04-13 13:34:22 发布

阅读量2.3k

点赞数

分类专栏： C# 文章标签： C# lucene 中文分词

本文链接：https://blog.csdn.net/u012532042/article/details/90605611

版权

C#使用Lucene中文分词

准备工作
- 开始搭建
- 代码实现

准备工作

通过 NuGet 引入 Lucene.Net 包。

开始搭建

实现中文分词需要以下几个类：ChineseAnalyzer、ChineseTokenizer、WordTree。

代码实现

ChineseAnalyzer 类
public class ChineseAnalyzer : Analyzer
    {
   
        // Location of the noise-word file read by GetNoise(): "data\sNoise.txt"
        // under the process's current working directory (Environment.CurrentDirectory).
        private static string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        private string keywords = "";

        /// <summary>
        /// Creates an analyzer that remembers the supplied keywords string.
        /// </summary>
        /// <param name="keywords">
        /// Stored in the <c>keywords</c> field and later handed to the
        /// <c>ChineseTokenizer</c> constructed in <c>TokenStream</c>.
        /// </param>
        public ChineseAnalyzer(string keywords)
        {
            this.keywords = keywords;
        }

        // Root of the nested Hashtable structure filled by GetNoise(): each key is a
        // one-character string and each value is the Hashtable for the next character
        // of a noise word. Static, so the same table is shared by every instance.
        public static Hashtable chartable = new Hashtable();

        /// <summary>
        /// Reads the noise-word file at <c>NoisePath</c> (UTF-8, one word per line)
        /// and merges every word into the static <c>chartable</c>, one nested
        /// <see cref="Hashtable"/> level per character.
        /// </summary>
        /// <remarks>
        /// Reading stops at end of file or at the first line that is empty or
        /// whitespace-only. Characters already present in the table are reused,
        /// so calling this method repeatedly does not duplicate entries.
        /// </remarks>
        private void GetNoise()
        {
            // `using` releases the file handle even when ReadLine throws part-way
            // through; a bare Close() at the end would be skipped in that case.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
            {
                string word = reader.ReadLine();
                while (word != null && word.Trim() != "")
                {
                    // Walk down from the root, creating a child table for each
                    // character that has not been seen at this depth yet.
                    Hashtable node = chartable;
                    for (int i = 0; i < word.Length; i++)
                    {
                        string ch = word.Substring(i, 1);
                        if (!node.Contains(ch))
                        {
                            node.Add(ch, new Hashtable());
                        }
                        node = (Hashtable)node[ch];
                    }
                    word = reader.ReadLine();
                }
            }
        }

        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
   
            GetNoise();
            TokenStream result = new ChineseTokenizer(reader, keywords);
            result = new StandardFilter(result);
            result = new