//
// new_keyword_extract1.0.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
//boost库的静态链接问题
//#include <boost/regex.hpp>
#include <iostream>
#include <string>
#include <regex>
#include <fstream>
#include <sstream>
#include <utility>
#include <algorithm>
#include <string>
#include <cctype>
#include <set>
#include <queue>
#include <regex>
#include <hash_map>
#include "Term.h"
using namespace std;
//using namespace boost;
// One candidate keyword paired with the final score used to rank it.
struct Node
{
    std::string word;   // the extracted term
    double final_value; // entropy-based score; higher ranks earlier
};
//struct cmp //comparison functor (currently unused)
//{
// bool operator()(const Node &t1,const Node &t2)
// {
// return t1.final_value<t2.final_value;
// //behaves like std::less, i.e. yields a min-heap
// }
//};
// Scores every distinct word in `text` and returns a heap-allocated Node
// array (one entry per distinct word), sorted descending by final_value.
// Caller owns the returned memory; the array length is not reported.
Node* keyword_extra_entropy(string text);
// Replaces characters outside [a-z], space and the CJK block U+4E00-U+9FA5
// with spaces, then collapses whitespace runs to a single space.
string pre_treatment(string text);
// Descending quicksort of ArrayInput[nLow..nHigh] by final_value.
void Quick_sort(Node ArrayInput[],int nLow,int nHigh);
// Lomuto partition helper for Quick_sort; returns the pivot's final index.
int Partition(Node ArrayInput[],int nLow,int nHigh);
// Exchanges two Node records.
void Swap(Node &p,Node &q);
// Reads the whole file at `address` (binary mode) into a string.
string LoadDoc(string address);
void main(){
string text=LoadDoc("D:\\test.txt");
text=pre_treatment(text);
int i;
Node *result;
result=keyword_extra_entropy(text);
for(i=0;i<30;i++)
cout<<endl<<result[i].word<<"-"<<result[i].final_value;
system("pause");
}
// Exchanges the contents of two Node records.
void Swap(Node &p,Node &q)
{
    std::swap(p, q);
}
//Partition function
// Lomuto partition for a *descending* sort: every element whose score is
// >= the pivot (taken from ArrayInput[nHigh]) is moved in front of the
// pivot's final slot, whose index is returned.
int Partition(Node ArrayInput[],int nLow,int nHigh)
{
    const double pivot = ArrayInput[nHigh].final_value;
    int store = nLow;                       // next slot for a ">= pivot" element
    for(int scan = nLow; scan < nHigh; ++scan)
    {
        if(ArrayInput[scan].final_value >= pivot)
        {
            if(scan != store)
            {
                Swap(ArrayInput[scan], ArrayInput[store]);
            }
            ++store;
        }
    }
    Swap(ArrayInput[store], ArrayInput[nHigh]);
    return store;
}
//Quick sort
// Recursively sorts ArrayInput[nLow..nHigh] into descending order of
// final_value using the Lomuto partition above.
void Quick_sort(Node ArrayInput[],int nLow,int nHigh)
{
    if(nLow >= nHigh)
        return;                             // zero or one element: nothing to do
    const int pivotPos = Partition(ArrayInput, nLow, nHigh);
    Quick_sort(ArrayInput, nLow, pivotPos - 1);
    Quick_sort(ArrayInput, pivotPos + 1, nHigh);
}
// Normalises raw document text for tokenisation:
//  1. every character that is not a Latin letter, a space, or a CJK
//     ideograph (U+4E00-U+9FA5) is replaced by a space;
//  2. runs of two or more whitespace characters collapse to one space.
// Fix: the original class only kept [a-z], so every UPPERCASE letter was
// destroyed *before* the later tolower() pass (e.g. "The" -> " he").
std::string pre_treatment(std::string text){
    std::regex collapse_ws("\\s{2,}");
    std::regex non_word("[^a-zA-Z \u4e00-\u9fa5]");
    // Strip disallowed characters first, then squeeze the spaces the
    // replacement just introduced.
    text=std::regex_replace(text,non_word,std::string(" "));
    text=std::regex_replace(text,collapse_ws,std::string(" "));
    return text;
}
// Tokenises `text`, computes an entropy/distance-based score (EDnor) for
// every distinct word via the Term class, and returns a heap-allocated Node
// array (one slot per distinct word).  The first `valid` slots hold the
// usable entries sorted descending by score; remaining slots are
// zero-initialised.  Caller owns the returned memory.
// Fixes: slots skipped because EDnor was NaN used to keep an
// *indeterminate* final_value that was then sorted and printed (UB); the
// array is now value-initialised and valid entries are compacted to the
// front.  Also replaced MSVC-only _isnan with a portable self-comparison.
Node* keyword_extra_entropy(string text){
    hash_map<string,int> word_frequency;    // word -> occurrence count
    hash_map<string,vector<int>> word_loc;  // word -> 1-based token positions
    stringstream q;
    q.str(text);
    int i=0;                                // running token index
    string asd;
    while(q>>asd){                          // was while(q): skips the bogus final pass
        string we;
        we.resize(asd.size());
        transform(asd.begin(),asd.end(),we.begin(),::tolower);
        if(we.empty())
            continue;
        i++;
        ++word_frequency[we];
        word_loc[we].push_back(i);
    }
    int sum=i;                              // total token count
    vector<Term> Term_list;
    hash_map<string,vector<int>>::const_iterator map_it=word_loc.begin();
    while(map_it!=word_loc.end()){
        Term temp;
        temp.Set_Term(map_it->first,map_it->second,map_it->second.size(),sum);
        Term_list.push_back(temp);
        ++map_it;
    }
    // Value-initialise so skipped (NaN) slots never hold indeterminate
    // doubles — the original left them uninitialised and then sorted them.
    Node *result=new Node[Term_list.size()]();
    int valid=0;                            // number of usable entries
    for(size_t t=0;t<Term_list.size();t++){
        Term_list[t].Cal_Distance(sum);
        Term_list[t].divide_Mode();
        Term_list[t].Cal_Entropy();
        Term_list[t].CAL_geo(sum);
        Term_list[t].Cal_EDnor();
        if(Term_list[t].EDnor!=Term_list[t].EDnor)  // portable NaN test (was _isnan)
            continue;
        result[valid].word=Term_list[t].word;
        result[valid].final_value=Term_list[t].EDnor;
        valid++;
    }
    if(valid>1)
        Quick_sort(result,0,valid-1);       // sort only the valid prefix
    return result;
}
// Reads the entire file at `address` in binary mode and returns its
// contents as a string; returns an empty string if the file cannot be
// opened.
// Fixes: the original constructed the result from a char buffer that was
// NEVER null-terminated (undefined behaviour — `string text=buffer` could
// read past the end), leaked that buffer (`new char[size]` without
// `delete[]`), and performed filebuf seeks on a possibly-unopened stream.
std::string LoadDoc(std::string address){
    std::ifstream filestr(address.c_str(), std::ios::binary);
    if(!filestr.is_open())
        return std::string();               // unreadable file -> empty text
    std::ostringstream contents;
    contents << filestr.rdbuf();            // streams the whole file; no manual buffer
    return contents.str();
}