//
// new_keyword_extract1.0.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
//boost库的静态链接问题
//#include <boost/regex.hpp>
#include <iostream>
#include <string>
#include <regex>
#include <fstream>
#include <sstream>
#include <utility>
#include <algorithm>
#include <string>
#include <cctype>
#include <set>
#include <queue>
#include <regex>
#include <hash_map>
#include "Term.h"
using namespace std;
//using namespace boost;
// One candidate keyword paired with the final score used to rank it.
struct Node
{
    std::string word;   // the extracted term
    double final_value; // entropy-based score; higher ranks earlier
};
//struct cmp //comparison functor (currently unused)
//{
// bool operator()(const Node &t1,const Node &t2)
// {
// return t1.final_value<t2.final_value;
// //behaves like std::less, i.e. yields a min-heap
// }
//};
// Scores every distinct word in `text` and returns a heap-allocated Node
// array (one entry per distinct word), sorted descending by final_value.
// Caller owns the returned memory; the array length is not reported.
Node* keyword_extra_entropy(string text);
// Replaces characters outside [a-z], space and the CJK block U+4E00-U+9FA5
// with spaces, then collapses whitespace runs to a single space.
string pre_treatment(string text);
// Descending quicksort of ArrayInput[nLow..nHigh] by final_value.
void Quick_sort(Node ArrayInput[],int nLow,int nHigh);
// Lomuto partition helper for Quick_sort; returns the pivot's final index.
int Partition(Node ArrayInput[],int nLow,int nHigh);
// Exchanges two Node records.
void Swap(Node &p,Node &q);
// Reads the whole file at `address` (binary mode) into a string.
string LoadDoc(string address);
void main(){
string text=LoadDoc("D:\\test.txt");
text=pre_treatment(text);
int i;
Node *result;
result=keyword_extra_entropy(text);
for(i=0;i<30;i++)
cout<<endl<<result[i].word<<"-"<<result[i].final_value;
system("pause");
}
// Exchanges the contents of two Node records.
void Swap(Node &p,Node &q)
{
    std::swap(p, q);
}
//Partition function
// Lomuto partition for a *descending* sort: every element whose score is
// >= the pivot (taken from ArrayInput[nHigh]) is moved in front of the
// pivot's final slot, whose index is returned.
int Partition(Node ArrayInput[],int nLow,int nHigh)
{
    const double pivot = ArrayInput[nHigh].final_value;
    int store = nLow;                       // next slot for a ">= pivot" element
    for(int scan = nLow; scan < nHigh; ++scan)
    {
        if(ArrayInput[scan].final_value >= pivot)
        {
            if(scan != store)
            {
                Swap(ArrayInput[scan], ArrayInput[store]);
            }
            ++store;
        }
    }
    Swap(ArrayInput[store], ArrayInput[nHigh]);
    return store;
}
//Quick sort
// Recursively sorts ArrayInput[nLow..nHigh] into descending order of
// final_value using the Lomuto partition above.
void Quick_sort(Node ArrayInput[],int nLow,int nHigh)
{
    if(nLow >= nHigh)
        return;                             // zero or one element: nothing to do
    const int pivotPos = Partition(ArrayInput, nLow, nHigh);
    Quick_sort(ArrayInput, nLow, pivotPos - 1);
    Quick_sort(ArrayInput, pivotPos + 1, nHigh);
}
// Normalises raw document text for tokenisation:
//  1. every character that is not a Latin letter, a space, or a CJK
//     ideograph (U+4E00-U+9FA5) is replaced by a space;
//  2. runs of two or more whitespace characters collapse to one space.
// Fix: the original class only kept [a-z], so every UPPERCASE letter was
// destroyed *before* the later tolower() pass (e.g. "The" -> " he").
std::string pre_treatment(std::string text){
    std::regex collapse_ws("\\s{2,}");
    std::regex non_word("[^a-zA-Z \u4e00-\u9fa5]");
    // Strip disallowed characters first, then squeeze the spaces the
    // replacement just introduced.
    text=std::regex_replace(text,non_word,std::string(" "));
    text=std::regex_replace(text,collapse_ws,std::string(" "));
    return text;
}
// Tokenises `text`, computes an entropy/distance-based score (EDnor) for
// every distinct word via the Term class, and returns a heap-allocated Node
// array (one slot per distinct word).  The first `valid` slots hold the
// usable entries sorted descending by score; remaining slots are
// zero-initialised.  Caller owns the returned memory.
// Fixes: slots skipped because EDnor was NaN used to keep an
// *indeterminate* final_value that was then sorted and printed (UB); the
// array is now value-initialised and valid entries are compacted to the
// front.  Also replaced MSVC-only _isnan with a portable self-comparison.
Node* keyword_extra_entropy(string text){
    hash_map<string,int> word_frequency;    // word -> occurrence count
    hash_map<string,vector<int>> word_loc;  // word -> 1-based token positions
    stringstream q;
    q.str(text);
    int i=0;                                // running token index
    string asd;
    while(q>>asd){                          // was while(q): skips the bogus final pass
        string we;
        we.resize(asd.size());
        transform(asd.begin(),asd.end(),we.begin(),::tolower);
        if(we.empty())
            continue;
        i++;
        ++word_frequency[we];
        word_loc[we].push_back(i);
    }
    int sum=i;                              // total token count
    vector<Term> Term_list;
    hash_map<string,vector<int>>::const_iterator map_it=word_loc.begin();
    while(map_it!=word_loc.end()){
        Term temp;
        temp.Set_Term(map_it->first,map_it->second,map_it->second.size(),sum);
        Term_list.push_back(temp);
        ++map_it;
    }
    // Value-initialise so skipped (NaN) slots never hold indeterminate
    // doubles — the original left them uninitialised and then sorted them.
    Node *result=new Node[Term_list.size()]();
    int valid=0;                            // number of usable entries
    for(size_t t=0;t<Term_list.size();t++){
        Term_list[t].Cal_Distance(sum);
        Term_list[t].divide_Mode();
        Term_list[t].Cal_Entropy();
        Term_list[t].CAL_geo(sum);
        Term_list[t].Cal_EDnor();
        if(Term_list[t].EDnor!=Term_list[t].EDnor)  // portable NaN test (was _isnan)
            continue;
        result[valid].word=Term_list[t].word;
        result[valid].final_value=Term_list[t].EDnor;
        valid++;
    }
    if(valid>1)
        Quick_sort(result,0,valid-1);       // sort only the valid prefix
    return result;
}
// Reads the entire file at `address` in binary mode and returns its
// contents as a string; returns an empty string if the file cannot be
// opened.
// Fixes: the original constructed the result from a char buffer that was
// NEVER null-terminated (undefined behaviour — `string text=buffer` could
// read past the end), leaked that buffer (`new char[size]` without
// `delete[]`), and performed filebuf seeks on a possibly-unopened stream.
std::string LoadDoc(std::string address){
    std::ifstream filestr(address.c_str(), std::ios::binary);
    if(!filestr.is_open())
        return std::string();               // unreadable file -> empty text
    std::ostringstream contents;
    contents << filestr.rdbuf();            // streams the whole file; no manual buffer
    return contents.str();
}