C# 分词算法：ChineseAnalyzer 源代码分析（其他地方的代码都是稀烂……）

最新推荐文章于 2023-11-27 23:05:51 发布

帝街街

最新推荐文章于 2023-11-27 23:05:51 发布

阅读量1.9k

点赞数 1

分类专栏：c# 优秀案例　文章标签：Lucene、chineseana、中文分词、词汇分割、c#

本文链接：https://blog.csdn.net/u014479921/article/details/58258419

版权

c# 优秀案例专栏收录该内容

16 篇文章 0 订阅

订阅专栏

1.引用文件下载地址：

http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar

2.引用一个Lucene.Net.dll文件

3.添加新类库文件 WordTree.cs

using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace A.SplitString
{
    /// <summary>
    /// Character tree (trie) built from a dictionary file and shared by all
    /// instances through <see cref="chartable"/>. Each node is a
    /// <see cref="Hashtable"/> keyed by single-character strings; a node that
    /// also contains the key "word" marks the end of a dictionary entry.
    /// </summary>
    public class WordTree
    {
        // The dictionary file sDict.txt must be deployed at the web application root.
        // Resolved through the current HTTP context, so this type can only be
        // initialised while a request is being processed.
        private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");

        // Serialises the one-time build of the shared tree so that two callers
        // cannot add nodes to the same Hashtable at the same time.
        private static readonly object DictLock = new object();

        /// <summary>Root node of the shared dictionary tree.</summary>
        public static Hashtable chartable = new Hashtable();

        /// <summary>True once the dictionary file has been loaded into <see cref="chartable"/>.</summary>
        public static bool DictLoaded = false;

        /// <summary>Not written by this class; kept for callers that read it.</summary>
        public static double DictLoad_Span = 0.0;

        /// <summary>Regex pattern that classifies a character as Chinese.</summary>
        public string strChinese = "[一-龥]";

        /// <summary>Regex pattern that classifies a character as a digit.</summary>
        public string strNumber = "[0-9]";

        /// <summary>Regex pattern that classifies a character as an English letter.</summary>
        public string strEnglish = "[a-zA-Z]";

        /// <summary>
        /// Classifies <paramref name="Char"/> against the three pattern fields,
        /// tested in the order Chinese, English, number.
        /// </summary>
        /// <param name="Char">Text to classify; callers in this file pass a single character.</param>
        /// <returns>0 for Chinese, 1 for English, 2 for number, -1 when no pattern matches.</returns>
        public int GetCharType(string Char)
        {
            // The static Regex.IsMatch overload reuses the framework's regex cache
            // instead of constructing a new Regex on every call, while still
            // honouring whatever pattern the instance fields currently hold.
            if (Regex.IsMatch(Char, this.strChinese))
            {
                return 0;
            }
            if (Regex.IsMatch(Char, this.strEnglish))
            {
                return 1;
            }
            if (Regex.IsMatch(Char, this.strNumber))
            {
                return 2;
            }
            return -1;
        }

        /// <summary>
        /// Builds the shared dictionary tree the first time it is called; later
        /// calls return without doing any work.
        /// </summary>
        public void LoadDict()
        {
            // The check is made inside the lock so that a second caller waits for
            // the first build to finish instead of starting its own.
            lock (DictLock)
            {
                if (!WordTree.DictLoaded)
                {
                    this.BuidDictTree();
                    WordTree.DictLoaded = true;
                }
            }
        }

        /// <summary>
        /// Reads the dictionary file line by line (UTF-8) and inserts every line
        /// into <see cref="chartable"/>, one tree level per character.
        /// Reading stops at the end of the file or at the first empty line.
        /// </summary>
        private void BuidDictTree()
        {
            // 'using' guarantees the file handle is released even if reading fails.
            using (StreamReader streamReader = new StreamReader(WordTree.DictPath, Encoding.UTF8))
            {
                string text = streamReader.ReadLine();

                // The root itself carries the "word" marker; ChineseTokenizer relies
                // on this when it emits a token without having descended the tree.
                if (!WordTree.chartable.Contains("word"))
                {
                    WordTree.chartable.Add("word", null);
                }

                while (!string.IsNullOrEmpty(text))
                {
                    Hashtable node = WordTree.chartable;
                    for (int i = 0; i < text.Length; i++)
                    {
                        string key = text.Substring(i, 1);
                        if (!node.Contains(key))
                        {
                            node.Add(key, new Hashtable());
                        }
                        node = (Hashtable)node[key];
                    }

                    // Mark the node reached by the last character as a complete entry.
                    if (!node.Contains("word"))
                    {
                        node.Add("word", null);
                    }
                    text = streamReader.ReadLine();
                }
            }
        }
    }
}

4.添加cs文件 ChineseTokenizer.cs

using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;

namespace A.SplitString
{
    /// <summary>
    /// Tokenizer that reads its whole input into memory and then emits one
    /// token per <see cref="Next"/> call. Characters are matched against the
    /// shared dictionary tree in <c>WordTree.chartable</c>; characters that do
    /// not start a dictionary entry are grouped by the character class
    /// reported by <c>WordTree.GetCharType</c>.
    /// </summary>
    internal class ChineseTokenizer : Tokenizer
    {
        // Index in `text` where the token currently being assembled begins;
        // passed to Token as its start offset.
        private int bufferIndex = 0;

        // Length of `text`, captured once in the constructor.
        private int dataLen = 0;

        // Index of the next character to examine; carried over between Next() calls.
        private int start;

        // The complete input, read from the reader in the constructor.
        private string text;

        /// <summary>
        /// Stores <paramref name="reader"/> as the tokenizer input and reads it to the end.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ChineseTokenizer(TextReader reader)
        {
            this.input = reader;
            this.text = this.input.ReadToEnd();
            this.dataLen = this.text.Length;
        }

        /// <summary>
        /// Returns the next token, or null once the whole input has been consumed.
        /// </summary>
        /// <returns>The next <see cref="Token"/>, or null at end of input.</returns>
        public override Token Next()
        {
            // LoadDict() only builds the tree when WordTree.DictLoaded is still false.
            WordTree wordTree = new WordTree();
            wordTree.LoadDict();
            // Current node while walking the tree; begins at the root.
            Hashtable hashtable = WordTree.chartable;
            // Characters matched so far for this token (local; the full input is this.text).
            string text = string.Empty;
            this.bufferIndex = this.start;
            // num  : index of the last character of the most recently recorded match.
            // num2 : value of bufferIndex when that match was recorded.
            // text2: the most recently recorded match, used when backtracking.
            int num = this.start;
            int num2 = this.bufferIndex;
            string text2 = string.Empty;
            Token result;
            while (this.start < this.dataLen)
            {
                string text3 = this.text.Substring(this.start, 1);
                // Whitespace characters (Trim() leaves nothing) are handled at the bottom of the loop.
                if (!string.IsNullOrEmpty(text3.Trim()))
                {
                    if (!hashtable.Contains(text3))
                    {
                        // The current character is not a child of the current node.
                        if (text == string.Empty)
                        {
                            // Nothing matched yet, so the character does not start any
                            // dictionary entry: fall back to its character class.
                            int i = this.start + 1;
                            switch (wordTree.GetCharType(text3))
                            {
                                case 0:
                                    // A single Chinese character becomes a token on its own.
                                    text += text3;
                                    break;
                                case 1:
                                    // Consume the whole run of English letters and lower-case it.
                                    while (i < this.dataLen)
                                    {
                                        if (wordTree.GetCharType(this.text.Substring(i, 1)) != 1)
                                        {
                                            break;
                                        }
                                        i++;
                                    }
                                    text += this.text.Substring(this.start, i - this.start).ToLower();
                                    break;
                                case 2:
                                    // Consume the whole run of digits.
                                    while (i < this.dataLen)
                                    {
                                        if (wordTree.GetCharType(this.text.Substring(i, 1)) != 2)
                                        {
                                            break;
                                        }
                                        i++;
                                    }
                                    text += this.text.Substring(this.start, i - this.start);
                                    break;
                                default:
                                    // Unclassified character: skip it and start over at the next one.
                                    this.start++;
                                    this.bufferIndex = this.start;
                                    continue;
                            }
                            // Resume after the consumed character or run.
                            this.start = i;
                        }
                        else if (wordTree.GetCharType(text3) == -1)
                        {
                            // A match is in progress and was interrupted by an unclassified
                            // character: step over it. (The backtracking branch below
                            // overwrites this.start again.)
                            this.start++;
                        }
                        if (hashtable.Contains("word"))
                        {
                            // The current node is a complete entry. When no descent happened
                            // the node is still the root, which WordTree.BuidDictTree marks
                            // with "word", so class-based tokens are emitted here as well.
                            result = new Token(text, this.bufferIndex, this.bufferIndex + text.Length);
                        }
                        else
                        {
                            // The matched prefix is not a complete entry: emit the last recorded
                            // match and resume scanning right after its last character.
                            this.start = num + 1;
                            result = new Token(text2, num2, num2 + text2.Length);
                        }
                    }
                    else
                    {
                        // The character extends the current match: append it and descend one level.
                        text += text3;
                        hashtable = (Hashtable)hashtable[text3];
                        if (hashtable.Contains("word") || text.Length == 1)
                        {
                            // Record a fallback: either a complete entry, or the first character
                            // alone (so there is always something to backtrack to).
                            text2 = text;
                            num = this.start;
                            num2 = this.bufferIndex;
                        }
                        this.start++;
                        if (this.start != this.dataLen)
                        {
                            continue;
                        }
                        // End of input reached while matching: emit what we have.
                        if (hashtable.Contains("word") || text.Length == 1)
                        {
                            result = new Token(text, this.bufferIndex, this.bufferIndex + text.Length);
                        }
                        else
                        {
                            this.start = num + 1;
                            result = new Token(text2, num2, num2 + text2.Length);
                        }
                    }
                    return result;
                }
                // Whitespace: advance and move the token start past it.
                // NOTE(review): this also runs when a match is already in progress
                // (local `text` is non-empty). bufferIndex then moves forward while
                // `text` keeps its earlier characters, so the offsets later passed to
                // Token may not line up with the input - confirm whether matching
                // across whitespace is intended.
                this.start++;
                this.bufferIndex = this.start;
            }
            result = null;
            return result;
        }
    }
}

5.添加cs 文件 SplitAdapter.cs

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace A.SplitString
{
    /// <summary>
    /// Analyzer that tokenizes text with <c>ChineseTokenizer</c> and then removes
    /// the stop words loaded from a file by the constructor.
    /// </summary>
    public class SplitAdapter : Analyzer
    {

        /// <summary>Not assigned by this class; kept for callers that reference it.</summary>
        public static string[] CHINESE_ENGLISH_STOP_WORDS;

        /// <summary>
        /// Shared stop-word slots, refilled from index 0 by every constructor call.
        /// The array keeps its fixed size for backward compatibility; unused slots are null.
        /// </summary>
        public static readonly string[] Filter = new string[321];

        // Stop words that did not fit into the fixed-size Filter array, so that a
        // longer stop-word file is still honoured by this instance.
        private readonly List<string> overflowStopWords = new List<string>();

        /// <summary>
        /// Loads stop words, one per line (UTF-8), from <paramref name="path"/>.
        /// Reading stops at the end of the file or at the first empty line.
        /// </summary>
        /// <param name="path">Path of the stop-word file.</param>
        public SplitAdapter(string path)
        {
            int num = 0;

            // 'using' guarantees the file handle is released even if reading fails.
            using (StreamReader streamReader = new StreamReader(path, Encoding.UTF8))
            {
                string text = streamReader.ReadLine();
                while (!string.IsNullOrEmpty(text))
                {
                    if (num < SplitAdapter.Filter.Length)
                    {
                        SplitAdapter.Filter[num] = text;
                        num++;
                    }
                    else
                    {
                        // Previously this case indexed past the end of Filter and threw.
                        this.overflowStopWords.Add(text);
                    }
                    text = streamReader.ReadLine();
                }
            }

            // Drop entries left behind by an earlier, longer load so Filter only
            // reflects the file that was just read.
            Array.Clear(SplitAdapter.Filter, num, SplitAdapter.Filter.Length - num);
        }



        /// <summary>
        /// Builds the token stream: ChineseTokenizer, then StandardFilter, then a
        /// StopFilter configured with the loaded stop words.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Pass only real entries on: unused Filter slots are null, and a null
            // stop word may be rejected by StopFilter depending on the Lucene.Net version.
            List<string> stopWords = new List<string>(SplitAdapter.Filter.Length + this.overflowStopWords.Count);
            foreach (string word in SplitAdapter.Filter)
            {
                if (!string.IsNullOrEmpty(word))
                {
                    stopWords.Add(word);
                }
            }
            stopWords.AddRange(this.overflowStopWords);

            TokenStream tokenStream = new ChineseTokenizer(reader);
            tokenStream = new StandardFilter(tokenStream);
            return new StopFilter(tokenStream, stopWords.ToArray());
        }
    }
}

6.实现类库

using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace A.Helper
{
    /// <summary>
    /// Convenience entry point that splits a string into words with
    /// <c>A.SplitString.SplitAdapter</c>.
    /// </summary>
    public class MatchingHelper
    {
        /// <summary>
        /// Tokenizes <paramref name="inputString"/> and returns the text of every token.
        /// </summary>
        /// <param name="inputString">Text to split; null or empty yields an empty list.</param>
        /// <returns>The token texts in the order the analyzer produced them.</returns>
        public static List<string> GetMatchingList(string inputString)
        {
            List<string> resultList = new List<string>();

            // Nothing to tokenize: skip the file and dictionary work, and avoid the
            // ArgumentNullException that StringReader throws for null.
            if (string.IsNullOrEmpty(inputString))
            {
                return resultList;
            }

            // The stop-word file sNoise.config is resolved relative to the web application root.
            string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");

            // Fully qualified because SplitAdapter lives in A.SplitString, which this
            // file does not import.
            A.SplitString.SplitAdapter analyzer = new A.SplitString.SplitAdapter(snoisePath);

            using (StringReader reader = new StringReader(inputString))
            {
                TokenStream tokenStream = analyzer.TokenStream(null, reader);
                for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
                {
                    resultList.Add(token.TermText());
                }
            }

            // resultList now holds the words produced by the split.
            return resultList;
        }
    }
}

帝街街

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
C# 分词算法,ChineseAnalyzer,源代码分析，其他地方的代码都是稀烂。。。。

1.引用文件下载地址：点击下载2.引用一个Lucene.Net.dll文件3.添加新类库文件 WordTree.csusing System;using System.Collections;using System.IO;using System.Text;using System.Text.RegularExpressions;namespace A.SplitString{
复制链接

扫一扫

专栏目录