基于朴素贝叶斯分类器的文本分类算法C#版(一)

昨天有幸拜读了洞庭散人的《基于朴素贝叶斯分类器的文本分类算法》。我正在学习这方面的内容,从内心感谢洞庭散人的分享!随后我把它移植到了 C# 平台上。

该程序用到了 Lucene.Net,以及基于词典的 ICTCLAS 中文分词 1.0。

ICTCLAS中文分词for Lucene.Net接口代码(实现Analyzer):

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4using System.IO;
 5
 6using Lucene.Net.Analysis;
 7using Lucene.Net.Analysis.Standard;
 8
 9namespace AspxOn.Search.FenLei
10ExpandedBlockStart.gifContractedBlock.gif{
11
12ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
13    /// ICTCLAS分词组件for Lucene.net接口
14    /// </summary>

15    public class ICTCLASAnalyzer : Analyzer
16ExpandedSubBlockStart.gifContractedSubBlock.gif    {
17        //定义要过滤的词
18        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];
19        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";
20
21        public ICTCLASAnalyzer()
22ExpandedSubBlockStart.gifContractedSubBlock.gif        {
23            StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default);
24            string noise = reader.ReadLine();
25            int i = 0;
26            
27            while (!string.IsNullOrEmpty(noise))
28ExpandedSubBlockStart.gifContractedSubBlock.gif            {
29                CHINESE_ENGLISH_STOP_WORDS[i] = noise;
30                noise = reader.ReadLine();
31                i++;
32            }

33            
34        }

35
36ExpandedSubBlockStart.gifContractedSubBlock.gif                /**//**//**//// Constructs a {@link StandardTokenizer} filtered by a {@link
37       /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
38        /// 

39        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
40ExpandedSubBlockStart.gifContractedSubBlock.gif        {
41            TokenStream result = new ICTCLASTokenizer(reader);
42            result = new StandardFilter(result);
43            result = new LowerCaseFilter(result);
44            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
45            return result;
46        }

47
48
49    }

50}

 

ICTCLAS中文分词for Lucene.Net接口代码(实现Tokenizer):

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4
 5using Lucene.Net.Analysis;
 6using SharpICTCLAS;
 7using System.IO;
 8
 9namespace AspxOn.Search.FenLei
10ExpandedBlockStart.gifContractedBlock.gif{
11    public class ICTCLASTokenizer : Tokenizer
12ExpandedSubBlockStart.gifContractedSubBlock.gif    {
13        int nKind = 1;
14        List<WordResult[]> result;
15        int startIndex = 0;
16        int endIndex = 0;
17        int i = 1;
18ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//**/
19ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// 
20        /// 待分词的句子
21        /// 

22        private string sentence;
23ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//**/
24ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// Constructs a tokenizer for this Reader. 
25        public ICTCLASTokenizer(System.IO.TextReader reader)
26ExpandedSubBlockStart.gifContractedSubBlock.gif        {
27            this.input = reader;
28            sentence = input.ReadToEnd();
29            sentence = sentence.Replace("\r\n""");
30            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data"+ Path.DirectorySeparatorChar;
31            //Console.WriteLine("正在初始化字典库,请稍候");
32            WordSegment wordSegment = new WordSegment();
33            wordSegment.InitWordSegment(DictPath);
34            result = wordSegment.Segment(sentence, nKind);
35        }

36
37ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//**/
38ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// 进行切词,返回数据流中下一个token或者数据流为空时返回null
39        /// 

40        public override Token Next()
41ExpandedSubBlockStart.gifContractedSubBlock.gif        {
42            Token token = null;
43            while (i < result[0].Length - 1)
44ExpandedSubBlockStart.gifContractedSubBlock.gif            {
45                string word = result[0][i].sWord;
46                endIndex = startIndex + word.Length - 1;
47                token = new Token(word, startIndex, endIndex);
48                startIndex = endIndex + 1;
49
50                i++;
51                return token;
52
53            }

54            return null;
55        }

56
57    }

58}

 

中文分词器代码:

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4using System.IO;
 5
 6using Lucene.Net.Analysis;
 7using Lucene.Net.Analysis.Standard;
 8using Lucene.Net.Documents;
 9
10using Lucene.Net.Analysis.Cn;
11using Lucene.Net.Analysis.KTDictSeg;
12
13namespace AspxOn.Search.FenLei
14ExpandedBlockStart.gifContractedBlock.gif{
15ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
16    /// 中文分词器
17    /// </summary>

18    public class ChineseSpliter
19ExpandedSubBlockStart.gifContractedSubBlock.gif    {
20        public static string Split(string text, string splitToken)
21ExpandedSubBlockStart.gifContractedSubBlock.gif        {
22            StringBuilder sb = new StringBuilder();
23
24            Analyzer an = new ICTCLASAnalyzer();
25
26            //TokenStream ts = an.ReusableTokenStream("", new StringReader(text));
27
28            TokenStream ts = an.TokenStream(""new StringReader(text));
29
30            Lucene.Net.Analysis.Token token;
31            while ((token = ts.Next()) != null)
32ExpandedSubBlockStart.gifContractedSubBlock.gif            {
33                sb.Append(splitToken + token.TermText());
34            }

35
36            return sb.ToString().Substring(1);
37        }

38    }

39}

 

训练管理器代码:

ContractedBlock.gif ExpandedBlockStart.gif Code
  1using System;
  2using System.Collections.Generic;
  3using System.Text;
  4using System.IO;
  5
  6using System.Text.RegularExpressions;
  7
  8namespace AspxOn.Search.FenLei
  9ExpandedBlockStart.gifContractedBlock.gif{
 10
 11ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
 12    /// 训练管理器
 13    /// </summary>

 14    public class TrainingDataManager
 15ExpandedSubBlockStart.gifContractedSubBlock.gif    {
 16        private string[] trainingFileClassicfications; //训练预料分类数组
 17        private DirectoryInfo trainingTextDir; //训练预料存放目录
 18        private string defaultDir = "D:\\SogouC.mini.20061127\\SogouC.mini\\Sample";
 19        //private string defaultDir = @"J:\SogouC.reduced.20061127\SogouC.reduced\Reduced";
 20
 21        public TrainingDataManager()
 22ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 23            if (!Directory.Exists(defaultDir))
 24ExpandedSubBlockStart.gifContractedSubBlock.gif            {
 25                throw new Exception("当前语料目录不存在!");
 26            }

 27            trainingTextDir = new DirectoryInfo(defaultDir);
 28
 29            trainingFileClassicfications = Directory.GetDirectories(defaultDir,"*",SearchOption.TopDirectoryOnly);
 30
 31            for (int i = 0; i < trainingFileClassicfications.Length; i++)
 32ExpandedSubBlockStart.gifContractedSubBlock.gif            {
 33                trainingFileClassicfications[i] = (Regex.Split(trainingFileClassicfications[i], "\\\\"))[(Regex.Split(trainingFileClassicfications[i], "\\\\")).Length - 1];
 34                //Console.WriteLine(trainingFileClassicfications[i]);
 35            }

 36        }

 37
 38ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
 39        /// 获取分类列表
 40        /// </summary>
 41        /// <returns></returns>

 42        public string[] GetTrainingClassifications()
 43ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 44            return trainingFileClassicfications;
 45        }

 46
 47ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
 48        /// 获取指定分类下的文件路径
 49        /// </summary>
 50        /// <param name="classification"></param>
 51        /// <returns></returns>

 52        public string[] GetFilesPath(string classification)
 53ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 54            string[] ret = Directory.GetFiles(defaultDir+"\\"+classification);
 55           
 56            return ret;
 57        }

 58
 59ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
 60        /// 获取指定位置的文件内容
 61        /// </summary>
 62        /// <param name="filepath"></param>
 63        /// <returns></returns>

 64        public string GetFileText(string filepath)
 65ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 66            FileStream fs = new FileStream(filepath, FileMode.Open, FileAccess.Read, FileShare.Read);
 67            byte[] bt = new byte[fs.Length];
 68            fs.Read(bt, 0, bt.Length);
 69            fs.Close();
 70            string s = Encoding.Default.GetString(bt);
 71            return s;
 72        }

 73
 74ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
 75        /// 获取训练文本集中的文本数目
 76        /// </summary>
 77        /// <returns></returns>

 78        public int GetTrainFileCount()
 79ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 80            int ret = 0;
 81            for (int i = 0; i < trainingFileClassicfications.Length; i++)
 82ExpandedSubBlockStart.gifContractedSubBlock.gif            {
 83                ret += GetTrainFileCountOfCertainClassification(trainingFileClassicfications[i]);
 84            }

 85            return ret;
 86        }

 87
 88ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
 89        /// 获取指定分类下的文本数目
 90        /// </summary>
 91        /// <param name="classification"></param>
 92        /// <returns></returns>

 93        public int GetTrainFileCountOfCertainClassification(string classification)
 94ExpandedSubBlockStart.gifContractedSubBlock.gif        {
 95            int ret = 0;
 96
 97            ret = Directory.GetFiles(defaultDir + "\\" + classification).Length;
 98
 99            return ret;
100        }

101
102ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
103        /// 获取指定分类包含关键字或关键词的样本数目
104        /// </summary>
105        /// <param name="classification">指定分类</param>
106        /// <param name="key">关键词或关键字</param>
107        /// <returns>样本数目</returns>

108        public int GetCountContainKeyOfClassification(string classification, string key)
109ExpandedSubBlockStart.gifContractedSubBlock.gif        {
110            int ret = 0;
111            string[] filepaths = GetFilesPath(classification);
112            try
113ExpandedSubBlockStart.gifContractedSubBlock.gif            {
114                
115                for (int i = 0; i < filepaths.Length; i++)
116ExpandedSubBlockStart.gifContractedSubBlock.gif                {
117                    string text = GetFileText(filepaths[i]);
118                    if (text.Contains(key))
119ExpandedSubBlockStart.gifContractedSubBlock.gif                    {
120                        ret++;
121                    }

122                }

123            }

124            catch
125ExpandedSubBlockStart.gifContractedSubBlock.gif            {
126                throw new Exception("error!");
127            }

128            return ret;
129        }

130    }

131}

132

 

先验概率计算代码:

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4
 5namespace AspxOn.Search.FenLei
 6ExpandedBlockStart.gifContractedBlock.gif{
 7ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
 8    /// 先验概率(事先概率)计算
 9    /// </summary>

10    public class PriorProbability
11ExpandedSubBlockStart.gifContractedSubBlock.gif    {
12        private static TrainingDataManager tdm = new TrainingDataManager();
13
14ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
15        /// 计算先验概率
16        /// </summary>
17        /// <param name="c">给定的分类</param>
18        /// <returns>给定条件下的先验概率</returns>

19        public static float CaculatePc(string c)
20ExpandedSubBlockStart.gifContractedSubBlock.gif        {
21            float ret = 0F;
22            float Nc = tdm.GetTrainFileCountOfCertainClassification(c);
23            float N = tdm.GetTrainFileCount();
24            ret = Nc / N;
25            return ret;
26        }

27    }

28}

 

条件概率计算代码:

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4
 5namespace AspxOn.Search.FenLei
 6ExpandedBlockStart.gifContractedBlock.gif{
 7ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
 8    /// 条件概率计算
 9    /// </summary>

10    public class ClassConditionalProbability
11ExpandedSubBlockStart.gifContractedSubBlock.gif    {
12
13        private static TrainingDataManager tdm = new TrainingDataManager();
14        private static float M = 0F;
15
16ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
17        /// 类条件概率
18        /// </summary>
19        /// <param name="x">给定关键字</param>
20        /// <param name="c">给定分类</param>
21        /// <returns></returns>

22        public static float CaculatePxc(string x, string c)
23ExpandedSubBlockStart.gifContractedSubBlock.gif        {
24            float ret = 0F;
25            float Nxc = tdm.GetCountContainKeyOfClassification(c, x);
26            float Nc = tdm.GetTrainFileCountOfCertainClassification(c);
27            float V = tdm.GetTrainingClassifications().Length;
28
29            ret = (Nxc + 1/ (Nc + V + M);//为避免出现0这样的极端情况,进行加权处理
30
31            return ret;
32        }

33    }

34}

 

用于保存分类结果的类:

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4
 5namespace AspxOn.Search.FenLei
 6ExpandedBlockStart.gifContractedBlock.gif{
 7ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
 8    /// 保存分类结果
 9    /// </summary>
10    public class ClassifyResult
11ExpandedSubBlockStart.gifContractedSubBlock.gif    {
12
13        public double probability; //分类概率
14        public string classification;  //分类
15        public ClassifyResult()
16ExpandedSubBlockStart.gifContractedSubBlock.gif        {
17            probability = 0;
18            classification = string.Empty;
19        }

20    }

21}

 

贝叶斯分类器代码:

ContractedBlock.gif ExpandedBlockStart.gif Code
 1using System;
 2using System.Collections.Generic;
 3using System.Text;
 4
 5namespace AspxOn.Search.FenLei
 6ExpandedBlockStart.gifContractedBlock.gif{
 7ExpandedSubBlockStart.gifContractedSubBlock.gif    /**//// <summary>
 8    /// 朴素贝叶斯分类器
 9    /// </summary>

10    public class BayesClassifier
11ExpandedSubBlockStart.gifContractedSubBlock.gif    {
12
13        private TrainingDataManager tdm; //训练集合管理器
14        //private string trainingDataPath; //训练集合路径
15        private static float zoomFactor = 10.0F;
16
17ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
18        /// 默认构造器,初始化训练集合
19        /// </summary>

20        public BayesClassifier()
21ExpandedSubBlockStart.gifContractedSubBlock.gif        {
22            tdm = new TrainingDataManager();
23        }

24
25ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
26        /// 计算给定的文本属性向量X在给定的分类Cj中的类条件概率
27        /// </summary>
28        /// <param name="X">文本属性向量X</param>
29        /// <param name="Cj">给定的分类</param>
30        /// <returns>分类条件概率连乘值</returns>

31        protected float CaluProd(string[] X, string Cj)
32ExpandedSubBlockStart.gifContractedSubBlock.gif        {
33            float ret = 1.0F;
34            for (int i = 0; i < X.Length; i++)
35ExpandedSubBlockStart.gifContractedSubBlock.gif            {
36                string Xi = X[i];
37                ret *= ClassConditionalProbability.CaculatePxc(Xi, Cj) * zoomFactor;//因为数值过小,因此将连乘值放大10倍(通过乘以zoomFactor)
38            }

39            ret *= PriorProbability.CaculatePc(Cj); //再乘以先验概率
40            return ret;
41        }

42
43ExpandedSubBlockStart.gifContractedSubBlock.gif        /**//// <summary>
44        /// 对指定文本进行分类
45        /// </summary>
46        /// <param name="text">指定文本</param>
47        /// <returns>分类结果</returns>

48        public List<ClassifyResult> Classify(string text)
49ExpandedSubBlockStart.gifContractedSubBlock.gif        {
50            string[] terms = ChineseSpliter.Split(text, "|").Split('|'); //中文分词处理(分词结果可能包含停用词)
51            string[] classes = tdm.GetTrainingClassifications();  //分类列表数组
52            float probility = 0.0F;
53            List<ClassifyResult> crs = new List<ClassifyResult>(); //分类结果
54            for (int i = 0; i < classes.Length; i++)
55ExpandedSubBlockStart.gifContractedSubBlock.gif            {
56                string Ci = classes[i];
57                probility = CaluProd(terms, Ci); //计算给定的文本属性向量terms在给定的分类Ci中的分类条件概率
58                ClassifyResult cr = new ClassifyResult();
59                cr.classification = Ci;
60                cr.probability = probility;
61                crs.Add(cr);
62            }

63            return crs;
64        }

65
66        public string GetMaxNum(List<ClassifyResult> crs)
67ExpandedSubBlockStart.gifContractedSubBlock.gif        {
68            double ret = 0;
69            string classification = string.Empty;
70            ret = crs[0].probability;
71            for (int i = 0; i < crs.Count; i++)
72ExpandedSubBlockStart.gifContractedSubBlock.gif            {
73                if (crs[i].probability > ret)
74ExpandedSubBlockStart.gifContractedSubBlock.gif                {
75                    ret = crs[i].probability;
76                    classification = crs[i].classification;
77                }

78            }

79            return classification;
80        }

81    }

82}

 

代码太多,编辑的时候卡的很,于是再整个(二)

转载于:https://www.cnblogs.com/waemz/archive/2009/02/25/1397647.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值