TF-IDF(C#)

翻代码时看到以前写的TF-IDF的C#实现,共享一下..

 

ps: codeproject.com有一个泰国仔实现的版本,代码写得非常乱..

 

代码(Term 类):
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace  Cluster
{
    
/// <summary>
    /// A single entry of the vocabulary (one distinct term).
    /// </summary>
    class Term
    {
        /// <summary>
        /// Number of documents in which the term has appeared.
        /// Left at the default value of 0 by the constructor.
        /// </summary>
        public int docNum;

        /// <summary>
        /// Index of the term in the vocabulary (its ordinal in the linear term list).
        /// </summary>
        public int index;

        /// <summary>
        /// Creates a term bound to the given vocabulary index.
        /// </summary>
        /// <param name="index">Vocabulary index to store in <see cref="index"/>.</param>
        public Term(int index)
        {
            this.index = index;
        }
    }
}

 

代码(TFIDF 类):
using  System;
using  System.Collections.Generic;
using  System.Text;

namespace  Cluster
{
    
///   <summary>
    
///  term frequency–inverse document frequency
    
///   </summary>
     static   class  TFIDF
    {
        
/// <summary>
        /// Computes TF-IDF weights for a collection of tokenized documents.
        /// </summary>
        /// <param name="docs">Documents to process; each document is an array of already-segmented terms.</param>
        /// <returns>
        /// One dictionary per input document, in input order, mapping a term's
        /// vocabulary index to its TF-IDF weight in that document.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
        public static List<Dictionary<int, double>> Calculate(string[][] docs)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException inside a helper.
            if (docs == null)
            {
                throw new ArgumentNullException("docs");
            }

            Dictionary<string, Term> terms = new Dictionary<string, Term>();  // vocabulary: term text -> Term
            List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(docs.Length);  // term frequencies, one map per document
            Dictionary<int, double> idfs = new Dictionary<int, double>();  // inverse document frequency per vocabulary index
            List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>(docs.Length);  // final result

            // Order matters: CalcTF fills the vocabulary and document counts that CalcIDF reads.
            CalcTF(docs, terms, tfs);
            CalcIDF(docs, terms, idfs);
            CalcTFIDF(tfs, idfs, tfidfs);

            return tfidfs;
        }

        
#region  TF
        
/// <summary>
        /// Computes the term frequency (TF) of every document and builds the vocabulary along the way.
        /// </summary>
        /// <param name="docs">Tokenized documents. A null document is treated as an empty one.</param>
        /// <param name="terms">
        /// Vocabulary. Unseen terms are added with the next free index (the current vocabulary size),
        /// and each term's <c>docNum</c> is incremented once per document that contains it.
        /// </param>
        /// <param name="tfs">
        /// Receives one dictionary per document, in input order, mapping vocabulary index to
        /// (occurrences of the term in the document / total number of terms in the document).
        /// </param>
        private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
        {
            foreach (string[] doc in docs)
            {
                Dictionary<int, double> tf = new Dictionary<int, double>();  // term frequencies of this document

                // Still emit an (empty) entry so that tfs stays aligned with docs.
                if (doc == null || doc.Length == 0)
                {
                    tfs.Add(tf);
                    continue;
                }

                Dictionary<int, int> termNums = new Dictionary<int, int>();  // raw occurrence counts by vocabulary index
                foreach (string term in doc)
                {
                    // Single lookup; register the term on first sight.
                    Term entry;
                    if (!terms.TryGetValue(term, out entry))
                    {
                        entry = new Term(terms.Count);
                        terms.Add(term, entry);
                    }

                    int count;
                    if (termNums.TryGetValue(entry.index, out count))
                    {
                        termNums[entry.index] = count + 1;
                    }
                    else
                    {
                        // First occurrence in this document: count the document for the term.
                        termNums.Add(entry.index, 1);
                        entry.docNum++;
                    }
                }

                double len = (double)doc.Length;
                foreach (KeyValuePair<int, int> kvp in termNums)
                {
                    tf.Add(kvp.Key, kvp.Value / len);  // occurrences of the term / total terms in the document
                }
                tfs.Add(tf);
            }
        }
        
#endregion

        
#region  IDF
        
/// <summary>
        /// Computes the inverse document frequency (IDF) of every term in the vocabulary:
        /// ln(total number of documents / number of documents containing the term).
        /// </summary>
        /// <param name="docs">All documents; only the document count is used.</param>
        /// <param name="terms">Vocabulary whose <c>index</c> and <c>docNum</c> fields are read.</param>
        /// <param name="idfs">Receives the IDF of each term, keyed by the term's vocabulary index.</param>
        private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
        {
            double len = (double)docs.Length;
            foreach (Term term in terms.Values)
            {
                // NOTE(review): assumes docNum > 0 for every term in the vocabulary - a docNum of 0
                // would yield +Infinity here; confirm against whatever populates `terms`.
                double idf = Math.Log(len / term.docNum);  // natural logarithm
                idfs.Add(term.index, idf);
            }
        }
        
#endregion

        
#region  TF-IDF
        
/// <summary>
        /// Combines term frequencies and inverse document frequencies into TF-IDF weights (tf * idf).
        /// </summary>
        /// <param name="tfs">Term frequencies, one dictionary per document, keyed by vocabulary index.</param>
        /// <param name="idfs">Inverse document frequency keyed by vocabulary index; must contain every index found in <paramref name="tfs"/>.</param>
        /// <param name="tfidfs">Receives one dictionary per document, in the same order as <paramref name="tfs"/>, keyed by vocabulary index.</param>
        /// <exception cref="KeyNotFoundException">A vocabulary index in <paramref name="tfs"/> has no entry in <paramref name="idfs"/>.</exception>
        private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
        {
            foreach (Dictionary<int, double> tf in tfs)
            {
                // The result has exactly as many entries as the document's TF map.
                Dictionary<int, double> tfidf = new Dictionary<int, double>(tf.Count);
                foreach (KeyValuePair<int, double> kvp in tf)
                {
                    tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
                }
                tfidfs.Add(tfidf);
            }
        }
        
#endregion

    }
}

 

转载于:https://www.cnblogs.com/live41/archive/2010/12/29/1920874.html

namespace ServiceRanking { /// <summary> /// Summary description for TF_IDFLib. /// </summary> public class TFIDFMeasure { private string[] _docs; private string[][] _ngramDoc; private int _numDocs=0; private int _numTerms=0; private ArrayList _terms; private int[][] _termFreq; private float[][] _termWeight; private int[] _maxTermFreq; private int[] _docFreq; public class TermVector { public static float ComputeCosineSimilarity(float[] vector1, float[] vector2) { if (vector1.Length != vector2.Length) throw new Exception("DIFER LENGTH"); float denom=(VectorLength(vector1) * VectorLength(vector2)); if (denom == 0F) return 0F; else return (InnerProduct(vector1, vector2) / denom); } public static float InnerProduct(float[] vector1, float[] vector2) { if (vector1.Length != vector2.Length) throw new Exception("DIFFER LENGTH ARE NOT ALLOWED"); float result=0F; for (int i=0; i < vector1.Length; i++) result += vector1[i] * vector2[i]; return result; } public static float VectorLength(float[] vector) { float sum=0.0F; for (int i=0; i < vector.Length; i++) sum=sum + (vector[i] * vector[i]); return (float)Math.Sqrt(sum); } } private IDictionary _wordsIndex=new Hashtable() ; public TFIDFMeasure(string[] documents) { _docs=documents; _numDocs=documents.Length ; MyInit(); } private void GeneratNgramText() { } private ArrayList GenerateTerms(string[] docs) { ArrayList uniques=new ArrayList() ; _ngramDoc=new string[_numDocs][] ; for (int i=0; i < docs.Length ; i++) { Tokeniser tokenizer=new Tokeniser() ; string[] words=tokenizer.Partition(docs[i]); for (int j=0; j < words.Length ; j++) if (!uniques.Contains(words[j]) ) uniques.Add(words[j]) ; } return uniques; } private static object
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值