翻代码时看到以前写的 TF-IDF 的 C# 实现,分享一下。
PS:codeproject.com 上有一位泰国开发者实现的版本,代码写得非常乱。
代码
using
System;
using System.Collections.Generic;
using System.Text;
namespace Cluster
{
/// <summary>
/// A term (vocabulary entry).
/// </summary>
class Term
{
    /// <summary>
    /// Number of documents the term has appeared in.
    /// </summary>
    public int docNum;

    /// <summary>
    /// Index of the term in the vocabulary (its sequence number in the linear word list).
    /// </summary>
    public int index;

    /// <summary>
    /// Creates a term with the given vocabulary index and a document count of zero.
    /// </summary>
    /// <param name="index">Index of the term in the vocabulary.</param>
    public Term(int index)
    {
        this.docNum = 0; // same as the field default; spelled out for readers
        this.index = index;
    }
}
}
using System.Collections.Generic;
using System.Text;
namespace Cluster
{
/// <summary>
/// A term (vocabulary entry).
/// </summary>
class Term
{
    /// <summary>
    /// Number of documents the term has appeared in.
    /// </summary>
    public int docNum;

    /// <summary>
    /// Index of the term in the vocabulary (its sequence number in the linear word list).
    /// </summary>
    public int index;

    /// <summary>
    /// Creates a term with the given vocabulary index and a document count of zero.
    /// </summary>
    /// <param name="index">Index of the term in the vocabulary.</param>
    public Term(int index)
    {
        this.docNum = 0; // same as the field default; spelled out for readers
        this.index = index;
    }
}
}
代码
using
System;
using System.Collections.Generic;
using System.Text;
namespace Cluster
{
/// <summary>
/// Term frequency–inverse document frequency (TF-IDF) weighting for a set of
/// documents that have already been split into terms.
/// </summary>
static class TFIDF
{
    /// <summary>
    /// Computes the TF-IDF weight of every term in every document.
    /// </summary>
    /// <param name="docs">Documents to process; each document is an array of terms (already tokenized).</param>
    /// <returns>
    /// One dictionary per document, in the same order as <paramref name="docs"/>,
    /// mapping a term's vocabulary index to its TF-IDF weight in that document.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="docs"/> contains a null document.</exception>
    public static List<Dictionary<int, double>> Calculate(string[][] docs)
    {
        if (docs == null)
        {
            throw new ArgumentNullException("docs");
        }

        List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>(docs.Length);
        Dictionary<string, Term> terms = new Dictionary<string, Term>();                    // vocabulary
        List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(docs.Length); // term frequencies
        Dictionary<int, double> idfs = new Dictionary<int, double>();                       // inverse document frequencies

        CalcTF(docs, terms, tfs);
        CalcIDF(docs, terms, idfs);
        CalcTFIDF(tfs, idfs, tfidfs);
        return tfidfs;
    }

    #region TF
    /// <summary>
    /// Computes term frequencies and builds the vocabulary as a side effect.
    /// </summary>
    /// <param name="docs">Tokenized documents.</param>
    /// <param name="terms">Vocabulary; new terms are added and each term's document count is updated.</param>
    /// <param name="tfs">Receives one dictionary per document: vocabulary index -> occurrences / document length.</param>
    /// <exception cref="ArgumentException"><paramref name="docs"/> contains a null document.</exception>
    private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
    {
        foreach (string[] doc in docs)
        {
            if (doc == null)
            {
                throw new ArgumentException("Documents must not be null.", "docs");
            }

            // Occurrence count of each term (by vocabulary index) within this document.
            Dictionary<int, int> termNums = new Dictionary<int, int>();
            foreach (string term in doc)
            {
                // Single lookup: fetch the vocabulary entry, or create it with the next free index.
                Term entry;
                if (!terms.TryGetValue(term, out entry))
                {
                    entry = new Term(terms.Count);
                    terms.Add(term, entry);
                }

                int count;
                if (termNums.TryGetValue(entry.index, out count))
                {
                    termNums[entry.index] = count + 1;
                }
                else
                {
                    // First occurrence in this document: it counts once towards the document count.
                    termNums.Add(entry.index, 1);
                    entry.docNum++;
                }
            }

            // An empty document yields an empty dictionary, so the division below never runs with len == 0.
            double len = (double)doc.Length;
            Dictionary<int, double> tf = new Dictionary<int, double>(termNums.Count);
            foreach (KeyValuePair<int, int> kvp in termNums)
            {
                tf.Add(kvp.Key, (double)kvp.Value / len); // occurrences of the term / total terms in the document
            }
            tfs.Add(tf);
        }
    }
    #endregion

    #region IDF
    /// <summary>
    /// Computes the inverse document frequency of every vocabulary term.
    /// </summary>
    /// <param name="docs">Tokenized documents; only the document count is used.</param>
    /// <param name="terms">Vocabulary with per-term document counts.</param>
    /// <param name="idfs">Receives vocabulary index -> ln(total documents / documents containing the term).</param>
    private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
    {
        double len = (double)docs.Length;
        foreach (KeyValuePair<string, Term> kvp in terms)
        {
            // Natural logarithm of (total documents / documents the term occurs in).
            double idf = Math.Log(len / (double)kvp.Value.docNum);
            idfs.Add(kvp.Value.index, idf);
        }
    }
    #endregion

    #region TF-IDF
    /// <summary>
    /// Multiplies each term frequency by the matching inverse document frequency.
    /// </summary>
    /// <param name="tfs">Per-document term frequencies keyed by vocabulary index.</param>
    /// <param name="idfs">Inverse document frequencies keyed by vocabulary index.</param>
    /// <param name="tfidfs">Receives one dictionary per document: vocabulary index -> tf * idf.</param>
    private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
    {
        foreach (Dictionary<int, double> tf in tfs)
        {
            Dictionary<int, double> tfidf = new Dictionary<int, double>(tf.Count);
            foreach (KeyValuePair<int, double> kvp in tf)
            {
                tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
            }
            tfidfs.Add(tfidf);
        }
    }
    #endregion
}
}
using System.Collections.Generic;
using System.Text;
namespace Cluster
{
/// <summary>
/// Term frequency–inverse document frequency (TF-IDF) weighting for a set of
/// documents that have already been split into terms.
/// </summary>
static class TFIDF
{
    /// <summary>
    /// Computes the TF-IDF weight of every term in every document.
    /// </summary>
    /// <param name="docs">Documents to process; each document is an array of terms (already tokenized).</param>
    /// <returns>
    /// One dictionary per document, in the same order as <paramref name="docs"/>,
    /// mapping a term's vocabulary index to its TF-IDF weight in that document.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="docs"/> contains a null document.</exception>
    public static List<Dictionary<int, double>> Calculate(string[][] docs)
    {
        if (docs == null)
        {
            throw new ArgumentNullException("docs");
        }

        List<Dictionary<int, double>> tfidfs = new List<Dictionary<int, double>>(docs.Length);
        Dictionary<string, Term> terms = new Dictionary<string, Term>();                    // vocabulary
        List<Dictionary<int, double>> tfs = new List<Dictionary<int, double>>(docs.Length); // term frequencies
        Dictionary<int, double> idfs = new Dictionary<int, double>();                       // inverse document frequencies

        CalcTF(docs, terms, tfs);
        CalcIDF(docs, terms, idfs);
        CalcTFIDF(tfs, idfs, tfidfs);
        return tfidfs;
    }

    #region TF
    /// <summary>
    /// Computes term frequencies and builds the vocabulary as a side effect.
    /// </summary>
    /// <param name="docs">Tokenized documents.</param>
    /// <param name="terms">Vocabulary; new terms are added and each term's document count is updated.</param>
    /// <param name="tfs">Receives one dictionary per document: vocabulary index -> occurrences / document length.</param>
    /// <exception cref="ArgumentException"><paramref name="docs"/> contains a null document.</exception>
    private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<int, double>> tfs)
    {
        foreach (string[] doc in docs)
        {
            if (doc == null)
            {
                throw new ArgumentException("Documents must not be null.", "docs");
            }

            // Occurrence count of each term (by vocabulary index) within this document.
            Dictionary<int, int> termNums = new Dictionary<int, int>();
            foreach (string term in doc)
            {
                // Single lookup: fetch the vocabulary entry, or create it with the next free index.
                Term entry;
                if (!terms.TryGetValue(term, out entry))
                {
                    entry = new Term(terms.Count);
                    terms.Add(term, entry);
                }

                int count;
                if (termNums.TryGetValue(entry.index, out count))
                {
                    termNums[entry.index] = count + 1;
                }
                else
                {
                    // First occurrence in this document: it counts once towards the document count.
                    termNums.Add(entry.index, 1);
                    entry.docNum++;
                }
            }

            // An empty document yields an empty dictionary, so the division below never runs with len == 0.
            double len = (double)doc.Length;
            Dictionary<int, double> tf = new Dictionary<int, double>(termNums.Count);
            foreach (KeyValuePair<int, int> kvp in termNums)
            {
                tf.Add(kvp.Key, (double)kvp.Value / len); // occurrences of the term / total terms in the document
            }
            tfs.Add(tf);
        }
    }
    #endregion

    #region IDF
    /// <summary>
    /// Computes the inverse document frequency of every vocabulary term.
    /// </summary>
    /// <param name="docs">Tokenized documents; only the document count is used.</param>
    /// <param name="terms">Vocabulary with per-term document counts.</param>
    /// <param name="idfs">Receives vocabulary index -> ln(total documents / documents containing the term).</param>
    private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<int, double> idfs)
    {
        double len = (double)docs.Length;
        foreach (KeyValuePair<string, Term> kvp in terms)
        {
            // Natural logarithm of (total documents / documents the term occurs in).
            double idf = Math.Log(len / (double)kvp.Value.docNum);
            idfs.Add(kvp.Value.index, idf);
        }
    }
    #endregion

    #region TF-IDF
    /// <summary>
    /// Multiplies each term frequency by the matching inverse document frequency.
    /// </summary>
    /// <param name="tfs">Per-document term frequencies keyed by vocabulary index.</param>
    /// <param name="idfs">Inverse document frequencies keyed by vocabulary index.</param>
    /// <param name="tfidfs">Receives one dictionary per document: vocabulary index -> tf * idf.</param>
    private static void CalcTFIDF(List<Dictionary<int, double>> tfs, Dictionary<int, double> idfs, List<Dictionary<int, double>> tfidfs)
    {
        foreach (Dictionary<int, double> tf in tfs)
        {
            Dictionary<int, double> tfidf = new Dictionary<int, double>(tf.Count);
            foreach (KeyValuePair<int, double> kvp in tf)
            {
                tfidf.Add(kvp.Key, kvp.Value * idfs[kvp.Key]);
            }
            tfidfs.Add(tfidf);
        }
    }
    #endregion
}
}