也谈贝叶斯分类(C#)版本

   

代码下载

 最近在做一个大作业,搭建一个信息检索平台,用到了贝叶斯分类,参考了洞庭散人大哥的技术博客:

http://www.cnblogs.com/phinecos/archive/2008/10/21/1316044.html

但是,他的算法运行起来很慢,原因是IO操作过于频繁,而且有些IO操作是可以避免的。下面开始介绍我的贝叶斯分类算法实现。

采用的分词器为河北理工大学吕震宇老师的SharpICTCLAS。该分词器没有Lucene接口,需要自己实现Analyzer和Tokenizer类,如下:

 

ExpandedBlockStart.gif ICTCLASAnalyzer
using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.IO;
using  Lucene.Net.Analysis;
using  Lucene.Net.Analysis.Standard;

namespace Bayes
{
    /// <summary>
    /// Lucene analyzer that tokenizes Chinese/English text with SharpICTCLAS
    /// and then filters the stream through StandardFilter, LowerCaseFilter
    /// and a StopFilter driven by a stop-word file on disk.
    /// </summary>
    class ICTCLASAnalyzer : Analyzer
    {
        // Stop-word table shared by all analyzer instances (at most 400 entries;
        // unfilled slots stay null).
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[400];

        // Path of the stop-word list, one word per line, system default encoding.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

        public ICTCLASAnalyzer()
        {
            // FIX: the original leaked the StreamReader (never closed/disposed);
            // a using statement disposes it on every exit path.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
            {
                string noise = reader.ReadLine();
                int i = 0;
                // NOTE(review): loading stops at the first empty line — assumes the
                // stop-word file contains no blank lines before its end; confirm.
                while (!string.IsNullOrEmpty(noise) && i < 400)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i] = noise;
                    noise = reader.ReadLine();
                    i++;
                }
            }
        }

        /// <summary>
        /// Constructs a token stream: an ICTCLASTokenizer filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}

 

 

 

ExpandedBlockStart.gif ICTCLASTokenizer
using  System;
using  System.Collections.Generic;
using  System.Text;
using  Lucene.Net.Analysis;
using  Lucene.Net.Documents;
using  Lucene.Net.Analysis.Standard;
using  System.IO;
using  SharpICTCLAS;


namespace Bayes
{
    /// <summary>
    /// Lucene tokenizer backed by the SharpICTCLAS Chinese word segmenter.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // FIX: the original constructed and initialized a brand-new WordSegment
        // (a dictionary load from disk) for EVERY tokenizer instance, i.e. for
        // every document analyzed — the dominant IO cost. The dictionary is
        // read-only after initialization, so load it once and share it.
        // NOTE(review): assumes WordSegment.Segment is safe to call from the
        // (single) thread that drives this analyzer — confirm before using
        // from multiple threads concurrently.
        private static readonly WordSegment sharedWordSegment = CreateWordSegment();

        // One-time dictionary initialization from <cwd>\Data\.
        private static WordSegment CreateWordSegment()
        {
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
            WordSegment ws = new WordSegment();
            ws.InitWordSegment(DictPath);
            return ws;
        }

        int nKind = 1;                // number of segmentation candidates requested
        List<WordResult[]> result;    // segmentation output; result[0] is the chosen path
        int startIndex = 0;           // running character offset of the next token
        int endIndex = 0;
        int i = 1;                    // starts at 1: result[0][0] appears to be a sentence marker — confirm

        /// <summary>待分词的句子 (the sentence to be segmented).</summary>
        private string sentence;

        /// <summary>Constructs a tokenizer that segments everything read from the reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");
            result = sharedWordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// 进行切词: returns the next token from the stream, or null when
        /// exhausted. The last element of result[0] is skipped (loop bound is
        /// Length - 1), mirroring the skip of element 0.
        /// </summary>
        public override Token Next()
        {
            while (i < result[0].Length - 1)
            {
                string word = result[0][i].sWord;
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}

 

 

 下面开始介绍我的实现,分为五个类:ChineseSpliter用于分词,ClassifyResult用于储存结果,MemoryTrainingDataManager用于管理IO操作,FastNaiveBayesClassification用于实现贝叶斯算法(另有StoreClass用于在内存中缓存一个类别的全部文本)。和洞庭散人不同之处在于:我的各个计算先验概率、条件概率、联合概率的函数写在了一个类里,而不是多个类,这样做的目的在于避免不必要的IO操作。

 

ExpandedBlockStart.gif ClassifyResult
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace Bayes
{
    /// <summary>
    /// Holds one classification outcome: a category name and its score.
    /// </summary>
    class ClassifyResult
    {
        public string className;  // category (directory) name
        public float score;       // joint-probability score for that category

        /// <summary>Creates an empty result: no class name, zero score.</summary>
        public ClassifyResult()
        {
            this.className = "";
            this.score = 0;
        }
    }
}

 

 

 

ExpandedBlockStart.gif ChineseSpliter
using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.IO;
using  Lucene.Net.Analysis;


namespace Bayes
{
    /// <summary>
    /// Splits Chinese text into terms via the ICTCLAS analyzer.
    /// </summary>
    class ChineseSpliter
    {
        /// <summary>
        /// Tokenizes <paramref name="text"/> and joins the tokens with
        /// <paramref name="splitToken"/>; the result carries no leading
        /// separator. Returns an empty string when no tokens are produced.
        /// </summary>
        public string Split(string text, string splitToken)
        {
            StringBuilder sb = new StringBuilder();
            Analyzer an = new ICTCLASAnalyzer();
            TokenStream ts = an.TokenStream("", new StringReader(text));
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                sb.Append(splitToken + token.TermText());
            }
            ts.Close(); // FIX: the original never closed the token stream

            // FIX: the original unconditionally called Substring(1), which
            // (a) threw on empty input and (b) stripped only ONE character of
            // the leading separator — with the caller's "@@@" token the first
            // term came back polluted as "@@word". Strip the full separator.
            if (sb.Length == 0)
            {
                return string.Empty;
            }
            return sb.ToString().Substring(splitToken.Length);
        }

        /// <summary>Splits the joined string back into individual terms.</summary>
        public string[] GetTerms(string result, string spliter)
        {
            string[] terms = result.Split(new string[] { spliter }, StringSplitOptions.RemoveEmptyEntries);
            return terms;
        }
    }
}

 

 

  

ExpandedBlockStart.gif MemoryTrainingDataManager
using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.IO;



namespace Bayes
{
    /// <summary>
    /// Owns all disk access to the training corpus. Each sub-directory of
    /// <see cref="defaultPath"/> is one category; each text file inside it is
    /// one training document. A whole category can be cached into a
    /// StoreClass so that term queries run from memory instead of disk.
    /// </summary>
    class MemoryTrainingDataManager
    {
        public String[] txtClassifications; // 训练语料分类集合 — category directory paths, set by GetClassifications()

        // NOTE(review): hard-coded corpus root; consider making this configurable.
        private static String defaultPath = "F:\\TrainingSet";

        public int totalFileCount; // total training-document count, set by GetTotalFileCount()

        /// <summary>Populates <see cref="txtClassifications"/> with the category sub-directories.</summary>
        public void GetClassifications()
        {
            this.txtClassifications = Directory.GetDirectories(defaultPath);
        }

        /// <summary>Returns the number of training files in one category directory.</summary>
        public int GetSubClassFileCount(string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);
            return paths.Length;
        }

        /// <summary>Sums the per-category file counts into <see cref="totalFileCount"/>.</summary>
        public void GetTotalFileCount()
        {
            int count = 0;
            for (int i = 0; i < txtClassifications.Length; i++)
            {
                count += GetSubClassFileCount(txtClassifications[i]);
            }
            totalFileCount = count;
        }

        /// <summary>Reads one training file using the system default encoding.</summary>
        public string GetText(string filePath)
        {
            // FIX: the original created a StreamReader and closed it manually,
            // leaking the handle if ReadToEnd threw; File.ReadAllText disposes
            // its reader on every path.
            return File.ReadAllText(filePath, Encoding.Default);
        }

        /// <summary>
        /// Loads every document of <paramref name="subclass"/> into
        /// <paramref name="sc"/> so subsequent queries need no disk IO.
        /// </summary>
        public void SetMainMemoryStructure(ref StoreClass sc, string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);
            sc.classificationName = subclass;
            sc.classificationCount = paths.Length;
            sc.strFileContentList = new string[sc.classificationCount];
            for (int k = 0; k < paths.Length; k++)
            {
                sc.strFileContentList[k] = GetText(paths[k]);
            }
        }

        /// <summary>
        /// Counts how many cached documents of the category contain
        /// <paramref name="key"/> — a document frequency, not a term frequency.
        /// </summary>
        public int GetKeyCountOfSubClass(string key, ref StoreClass sc)
        {
            int count = 0;
            for (int i = 0; i < sc.classificationCount; i++)
            {
                if (sc.strFileContentList[i].Contains(key))
                {
                    count++;
                }
            }
            return count;
        }
    }
}

 

 

ExpandedBlockStart.gif FastNaiveBayesClassification
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace Bayes
{
    /// <summary>
    /// Naive Bayes text classifier. All probability helpers live in this one
    /// class so each category's training files are read from disk exactly once
    /// per Classify call — the IO saving this implementation is built around.
    /// </summary>
    class FastNaiveBayesClassification
    {
        public MemoryTrainingDataManager mtdm = new MemoryTrainingDataManager();
        private ChineseSpliter spliter = new ChineseSpliter();

        // Multiplied into every per-term factor to keep the float product from
        // underflowing to zero over many terms; applied uniformly, so the
        // relative order of class scores is unchanged.
        private static float ZoomFactor = 10;

        public FastNaiveBayesClassification()
        {
            mtdm.GetClassifications();
            mtdm.GetTotalFileCount();
        }

        /// <summary>
        /// Prior probability P(c). Nc 表示属于c类的文本数,N表示总文件数
        /// (Nc = documents in class c, N = total document count).
        /// </summary>
        public float CalculatePriorProbability(float Nc, float N)
        {
            return Nc / N;
        }

        /// <summary>
        /// Laplace-smoothed conditional probability P(x|c).
        /// </summary>
        /// <param name="NxC">某一类别中某一词频出现的文件数 (documents of class c containing the term)</param>
        /// <param name="Nc">该类别文件总数 (document count of class c)</param>
        /// <returns>(NxC + 1) / (Nc + M + number of categories)</returns>
        public float CalculateConditionalProbability(float NxC, float Nc)
        {
            float M = 0F;
            // NOTE(review): the smoothing denominator adds the number of
            // CATEGORIES rather than the vocabulary size; kept as in the
            // original — confirm this is the intended smoothing.
            return (NxC + 1) / (Nc + M + mtdm.txtClassifications.Length);
        }

        /// <summary>
        /// Joint score P(c) * prod_i P(x_i|c), each factor scaled by
        /// ZoomFactor to avoid underflow.
        /// </summary>
        public float CalculateJointProbability(float[] NxC, float Nc, float N)
        {
            float ret = 1;
            for (int i = 0; i < NxC.Length; i++)
            {
                ret *= CalculateConditionalProbability(NxC[i], Nc) * ZoomFactor;
            }
            ret = ret * CalculatePriorProbability(Nc, N);
            return ret;
        }

        /// <summary>Segments <paramref name="text"/> into terms via the ICTCLAS spliter.</summary>
        public string[] SplitTerms(string text)
        {
            string result = spliter.Split(text, "@@@");
            string[] terms = spliter.GetTerms(result, "@@@");
            return terms;
        }

        /// <summary>
        /// Scores <paramref name="text"/> against every category and returns
        /// the best-scoring result.
        /// </summary>
        public ClassifyResult Classify(string text)
        {
            int end = mtdm.txtClassifications.Length;
            ClassifyResult[] results = new ClassifyResult[end];
            for (int i = 0; i < end; i++)
            {
                results[i] = new ClassifyResult();
            }

            string[] terms = SplitTerms(text);
            float N = mtdm.totalFileCount;

            for (int i = 0; i < end; i++)
            {
                // Load the whole category into memory once; all term queries
                // below are then answered without further disk IO.
                StoreClass sc = new StoreClass();
                mtdm.SetMainMemoryStructure(ref sc, mtdm.txtClassifications[i]);
                float Nc = sc.classificationCount;
                float[] Nxc = new float[terms.Length];
                for (int k = 0; k < terms.Length; k++)
                {
                    Nxc[k] = mtdm.GetKeyCountOfSubClass(terms[k], ref sc);
                }
                results[i].score = CalculateJointProbability(Nxc, Nc, N);
                results[i].className = sc.classificationName;
                Console.WriteLine("类别{0},分数{1}", results[i].className, results[i].score);
            }

            // FIX: the original selection-sorted the entire array only to
            // return element 0; a single max scan yields the same result in
            // O(n) with far less code (ties resolve to the first maximum,
            // matching the original sort's behavior).
            ClassifyResult best = results[0];
            for (int m = 1; m < results.Length; m++)
            {
                if (results[m].score > best.score)
                {
                    best = results[m];
                }
            }
            return best;
        }
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值