// Tokenization / TF-IDF computation
#include "TFIDFMeasure.h"
#include <limits>
#include <cmath>
using namespace std;
TFIDFMeasure::~TFIDFMeasure(void)
{
    // Release the tokenizer this object owns.
    if (_tokenizer != NULL)
    {
        delete _tokenizer;
        _tokenizer = NULL;
    }
    // Drop the cached documents, vocabulary and word index.
    _docs.clear();
    _terms.clear();
    _wordsIndex.clear();
}
// Build the TF-IDF model for `documents`, taking ownership of `tokeniser`
// (released in the destructor). All matrices are computed up-front by Init().
TFIDFMeasure::TFIDFMeasure(const StrVec& documents, ITokeniser* tokeniser)
{
    _docs = documents;
    _numDocs = documents.size();
    _tokenizer = tokeniser;
    Init();
}
void TFIDFMeasure::GenerateTerms(const StrVec& docs,StrVec& terms)
{
for (int i=0; i < docs.size() ; i++)
{
StrVec words;
_tokenizer->Partition(docs[i],words);//分词
for (int j=0; j < words.size(); j++)
{
//不在单词表中,则加入
if (find(terms.begin(),terms.end(),words[j])==terms.end())
{
terms.push_back(words[j]);
}
}
}
}
void TFIDFMeasure::Init()
{//初始化
this->GenerateTerms (_docs,_terms);//分出所有词项
this->_numTerms=_terms.size() ;//所有文档中的词项数目
//准备好存储空间
_maxTermFreq.resize(_numDocs);
_docFreq.resize(_numTerms);
_termFreq.resize(_numTerms);
_termWeight.resize(_numTerms);
for(int i=0; i < _terms.size() ; i++)
{
_termWeight[i].resize(_numDocs);
_termFreq[i].resize(_numDocs) ;
_wordsIndex[_terms[i]] = i;//将单词放入单词映射表中
}
this->GenerateTermFrequency ();//计算单词频率
this->GenerateTermWeight();//计算单词权重
}
// Count how many times each word occurs in `input`, writing word->count into
// `freq`. NOTE: `input` is lower-cased in place, so the caller's string is
// modified (existing callers pass a working copy).
//
// Fixes two defects in the original: (1) it called std::unique but ignored
// the returned iterator, then scanned the full (now mangled) range, so counts
// after consecutive duplicate tokens were wrong; (2) it recounted each token
// with an O(n) CountWords pass, O(n^2) overall. A single map-increment pass
// is both correct and O(n log n).
void TFIDFMeasure::GetWordFrequency(string& input, map<string, int>& freq)
{
    // ::tolower (the C function) avoids ambiguity with the std::tolower
    // locale template under `using namespace std;`.
    transform(input.begin(), input.end(), input.begin(), ::tolower);
    StrVec words;
    this->_tokenizer->Partition(input, words); // tokenize the document
    for (StrVec::iterator iter = words.begin(); iter != words.end(); ++iter)
    {
        ++freq[*iter]; // value-initialized to 0 on first sight
    }
}
void TFIDFMeasure::GetTermVector(int doc,DoubleVec& vec)
{
vec.resize(this->_numTerms);
for (int i=0; i < this->_numTerms; i++)
vec[i]=_termWeight[i][doc];//第i个单词在文档doc中的权重
}
// Functor matching strings equal to a fixed word (used with count_if).
// Const-correct: the constructor takes the target by const reference (it
// stores its own copy, so the functor outlives the source string) and the
// call operator is const so it works on const predicate copies.
class WordComp
{
public:
    explicit WordComp(const std::string& sWord) : word(sWord)
    {
    }
    // True when lhs equals the stored word.
    bool operator() (const std::string& lhs) const
    {
        return lhs.compare(word) == 0;
    }
private:
    std::string word;
};
// Number of tokens in `words` equal to `word`.
int TFIDFMeasure::CountWords(string& word, const StrVec& words)
{
    return count_if(words.begin(), words.end(), WordComp(word));
}
// Matrix row index of `term` in the vocabulary, or -1 when the word is
// not part of the corpus.
int TFIDFMeasure::GetTermIndex(const string& term)
{
    map<string, int>::iterator pos = _wordsIndex.find(term);
    return (pos == _wordsIndex.end()) ? -1 : pos->second;
}
void TFIDFMeasure::GenerateTermFrequency()
{//计算每个单词在每份文档出现的频率
for(int i=0; i < _numDocs ; i++)
{
string curDoc=_docs[i];//当前待处理的文档
map<string,int> freq;
this->GetWordFrequency(curDoc,freq);
map<string,int>::iterator iter;
_maxTermFreq[i]=numeric_limits<int>::min();
for (iter = freq.begin();iter!=freq.end();++iter)
{
string word=iter->first;
int wordFreq=iter->second ;
int termIndex=GetTermIndex(word);//单词下标
if(termIndex == -1)
continue;
_termFreq [termIndex][i]=wordFreq;//单词在第i份文档中出现的频率
_docFreq[termIndex]++;//出现第termIndex单词的文档频率加1
if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i]=wordFreq;//记录第i份文档中的最大词频
}
}
}
// Compute the tf-idf weight of every (term, document) pair.
void TFIDFMeasure::GenerateTermWeight()
{
    for (int term = 0; term < _numTerms; ++term)
    {
        for (int doc = 0; doc < _numDocs; ++doc)
        {
            _termWeight[term][doc] = ComputeTermWeight(term, doc);
        }
    }
}
// Normalized term frequency: the term's count in `doc` divided by the
// largest count of any term in that document.
double TFIDFMeasure::GetTermFrequency(int term, int doc)
{
    int freq = _termFreq[term][doc];   // raw count of `term` in `doc`
    int maxfreq = _maxTermFreq[doc];   // document's peak count
    // Guard: a document that produced no vocabulary terms leaves the
    // INT_MIN sentinel from GenerateTermFrequency in _maxTermFreq, which
    // the original divided by blindly. Treat such documents as weight 0.
    if (maxfreq <= 0)
        return 0.0;
    return ((float) freq / (float) maxfreq);
}
double TFIDFMeasure::ComputeTermWeight(int term, int doc)
{//计算单词在文档中的权重
float tf=GetTermFrequency (term, doc);
float idf=GetInverseDocumentFrequency(term);
return tf * idf;
}
// idf = log(total documents / documents containing `term`).
double TFIDFMeasure::GetInverseDocumentFrequency(int term)
{
    int containingDocs = _docFreq[term]; // documents in which `term` appears
    return log((float) _numDocs / (float) containingDocs);
}
// k-means clustering
#include "KMeans.h"
#include <time.h>
#include "Cluster.h"
#include "TermVector.h"
#include <limits>
// Set up a K-cluster run over `data`: deep-copy the coordinates, size all
// bookkeeping vectors, and seed the initial clusters at random points.
KMeans::KMeans(Double2DVec &data, int K)
{
    // Deep-copy each input row into our own coordinate store.
    _coordinates.resize(data.size());
    for (size_t row = 0; row < data.size(); ++row)
    {
        copy(data[row].begin(), data[row].end(), back_inserter(_coordinates[row]));
    }
    _coordCount = data.size();
    _k = K;
    _clusters.resize(K);
    _clusterAssignments.resize(_coordCount);
    _nearestCluster.resize(_coordCount);
    // Point-to-cluster distance cache (only the first K columns are used).
    _distanceCache.resize(_coordCount);
    for (int point = 0; point < _coordCount; ++point)
    {
        _distanceCache[point].resize(_coordCount);
    }
    InitRandom();
}
void KMeans::InitRandom()
{
srand(unsigned(time(NULL)));
for (int i = 0; i < _k; i++)
{
int temp = rand()%(_coordCount);//产生随机数
_clusterAssignments[temp] = i; //记录第temp个资料属于第i个聚类
_clusters[i] = new Cluster(temp,_coordinates[temp]);
}
}
void KMeans::Start()
{// Run Lloyd-style iterations until no point changes cluster.
int iter = 0,i,j;
while (true)
{
cout<<"Iteration "<<iter++<< "..."<<endl;
// 1. Recompute each cluster's mean from its current members.
for (i = 0; i < _k; i++)
{
_clusters[i]->UpdateMean(_coordinates);
}
// 2. Distance from every data point to every cluster center.
for (i = 0; i < _coordCount; i++)
{
for (j = 0; j < _k; j++)
{
double dist = getDistance(_coordinates[i], _clusters[j]->Mean);
_distanceCache[i][j] = dist;
}
}
// 3. Find the closest cluster for each point.
for (i = 0; i < _coordCount; i++)
{
_nearestCluster[i] = this->NearestCluster(i);
}
// 4. Check whether every point's nearest cluster equals its currently
// assigned cluster; if all match, the clustering is stable — return.
int k = 0;
for (i = 0; i < _coordCount; i++)
{
if (_nearestCluster[i] == _clusterAssignments[i])
k++;
}
if (k == _coordCount)
break;
// 5. Otherwise rebuild each cluster's membership list and reassign every
// point to its nearest cluster, then iterate again.
for (j = 0; j < _k; j++)
{
_clusters[j]->CurrentMembership.clear();
}
for (i = 0; i < _coordCount; i++)
{
_clusters[_nearestCluster[i]]->CurrentMembership.push_back(i);
_clusterAssignments[i] = _nearestCluster[i];
}
}
}
// Cosine distance between a point and a cluster center: 1 - cosine similarity.
double KMeans::getDistance(const DoubleVec& coord, const DoubleVec& center)
{
    double similarity = TermVector::ComputeCosineSimilarity(coord, center);
    return 1 - similarity;
}
// Index of the cluster whose center is closest to point `ndx`, using the
// distances precomputed in _distanceCache. Returns -1 only if _k == 0.
int KMeans::NearestCluster(int ndx)
{
    int best = -1;
    double bestDist = numeric_limits<double>::max();
    for (int c = 0; c < _k; c++)
    {
        if (_distanceCache[ndx][c] < bestDist)
        {
            bestDist = _distanceCache[ndx][c];
            best = c;
        }
    }
    return best;
}
KMeans::~KMeans(void)
{
    // Clusters were allocated with `new` in InitRandom; free every one.
    for (size_t c = 0; c < _clusters.size(); ++c)
    {
        delete _clusters[c];
    }
    _clusters.clear();
}