词频统计

最新推荐文章于 2022-01-29 23:44:04 发布

wintersense

最新推荐文章于 2022-01-29 23:44:04 发布

阅读量888

点赞数

分类专栏： NLP

本文链接：https://blog.csdn.net/wintersense/article/details/37699847

版权

NLP 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

最近在做词频统计，先贴出简单的分英语单词的程序

#include <map>
#include <iostream>
#include <string>

using namespace std;
/**
 * Counts how often each whitespace-separated word occurs on standard input
 * and prints a two-column table (word, number of occurrences). The table is
 * ordered by word because std::map iterates its keys in sorted order.
 */
int main(){
	std::map<std::string,int> tally;

	std::cout <<"请输入一句话：";
	// operator>> stops at whitespace, so every extraction yields one word.
	for(std::string token; std::cin>>token; ){
		std::cout <<"收录词语：";
		++tally[token];
	}

	std::cout<<"*****************************************"<<std::endl;
	std::cout <<"**           单词种类        单词数目 **"<<std::endl;

	typedef std::map<std::string,int>::const_iterator Entry;
	for(Entry entry=tally.begin(); entry!=tally.end(); ++entry){
		std::cout <<"**           "<<entry->first<<"        "<<entry->second<<" **"<<std::endl;
	}
	// Hands "pause" to the system command processor before exiting.
	system("pause");
	return 0;
}

上面的程序里，按空格切分单词的工作其实是由 cin>>word 完成的（operator>> 以空白字符作为分隔），map 只负责计数。下面先不用 map，改用 set 来统计一句话中出现了哪些词语。

#include <iostream>
#include <string>
#include <set>

using namespace std;

/**
 * Reads already-segmented text from standard input (words separated by
 * whitespace), collects the distinct words in a std::set and prints them in
 * sorted order.
 */
int main(){
	/* Distinct words seen so far; std::set keeps them unique and ordered. */
	std::set<std::string> s;

	std::cout <<"输入一句已经分好词的汉语"<<std::endl;

	/* Read whole lines so that the splitting below is really done by hand;
	   operator>> would already have split the input on whitespace. */
	std::string sentence;
	while(std::getline(std::cin,sentence)){
		/* size_type can hold std::string::npos without truncation. */
		std::string::size_type pos1=0;
		while(pos1<sentence.size()){
			std::string::size_type pos2=sentence.find_first_of(" \t\r\v\f",pos1);
			if(pos2==std::string::npos)
				pos2=sentence.size();
			/* Consecutive separators produce empty pieces; skip them. */
			if(pos2>pos1)
				s.insert(sentence.substr(pos1,pos2-pos1));
			pos1=pos2+1;
		}
	}

	std::cout << "统计结果"<<std::endl;
	std::cout <<"*************************"<<std::endl;
	std::cout <<"**       词语          **"<<std::endl;
	for(std::set<std::string>::const_iterator si=s.begin();si!=s.end();++si){
		std::cout <<"**       "<<*si<<"          **"<<std::endl;
	}
	std::cout <<"*************************"<<std::endl;
	/* Hands "pause" to the system command processor before exiting. */
	system("pause");
	return 0;
}

经过了修改，已经不光可以统计文本词语的种类，而且可以统计词语的数量。

统计文本：

我/与/父亲/不/相见/已/二年/余/了/，/我/最/不能/忘记/的/是/他的/背影/。
那年/冬天/，/祖母/死了/，/父亲/的/差使/也/交卸/了/，/正是/祸不单行/的/日子/。/我/从/北京/到/徐州/，/打算/跟着/父亲/奔丧/回家/。/到/徐州/见着/父亲/，/看见/满院狼藉/的/东西/，/又/想起/祖母/，/不禁/簌簌地/流下/眼泪/。/父亲/说/：/“/事已如此/，/不必/难过/，/好在/天无绝人之路/！/”/

#include <string>
#include <iostream>
#include <set>
#include <map>
#include <fstream>
using namespace std;

/**
 * Counts word frequencies in "1.txt" and prints them ordered by word.
 *
 * The file is read as whitespace-separated chunks; inside each chunk the
 * words are separated by '/'. Empty pieces (for example after a trailing
 * '/') are not counted.
 *
 * Returns 1 when the file cannot be opened, 0 otherwise.
 */
int main(){
	/* 1.txt holds the segmented text to analyse. */
	std::ifstream infile("1.txt");
	if(!infile){
		std::cout<<"can not open file"<<std::endl;
		system("pause");
		/* Stop here: reading from a stream that never opened never reaches eof. */
		return 1;
	}

	/* word -> number of occurrences; operator[] starts every new word at 0,
	   so no separate pass is needed to collect the vocabulary first. */
	std::map<std::string,int> word_count;

	/* Testing the extraction itself (rather than eof() afterwards) keeps the
	   last chunk even when the file does not end with whitespace. */
	std::string sentence;
	while(infile>>sentence){
		/* size_type can hold std::string::npos without truncation. */
		std::string::size_type pos1=0;
		while(pos1<sentence.size()){
			std::string::size_type pos2=sentence.find('/',pos1);
			if(pos2==std::string::npos)
				pos2=sentence.size();
			if(pos2>pos1)
				++word_count[sentence.substr(pos1,pos2-pos1)];
			pos1=pos2+1;
		}
	}
	infile.close();

	int order=1;
	std::cout << "统计结果"<<std::endl;
	std::cout <<"*******************************************************"<<std::endl;
	std::cout <<"**       词语                          词频          **"<<std::endl;
	for(std::map<std::string,int>::const_iterator mapIter=word_count.begin();mapIter!=word_count.end();++mapIter){
		std::cout <<"**"<<order<<"    "<<mapIter->first<<"                          "<<mapIter->second<<"          **"<<std::endl;
		++order;
	}
	std::cout <<"********************************************************"<<std::endl;

	/* Hands "pause" to the system command processor before exiting. */
	system("pause");
	return 0;
}

上面的代码还无法按词频排序；经过再次修改，下面的版本已经可以按词频从高到低对词语进行排序。

#include <string>
#include <iostream>
#include <set>
#include <vector>
#include <map>
#include <fstream>
#include <algorithm>
using namespace std;
/* (word, number of occurrences) */
typedef std::pair<std::string, int> PAIR;

/**
 * Ordering predicate for PAIR values: a pair comes first when its count
 * (second) is larger, which sorts from most to least frequent.
 */
struct CmpByValue {
  /* const-qualified so the comparator can also be called through a const
     object or reference. */
  bool operator()(const PAIR& lhs, const PAIR& rhs) const {
    return lhs.second > rhs.second;
  }
};
int main(){
	ifstream infile;
	
	ofstream outfile;
	string sentence;
	unsigned int pos1,pos2;
	string word;
	int order=1;
	/*定义set容器来存储sentence里的单词种类*/
	set<string> s;
    set<string>::iterator si; 

	/*定义map容器来存储统计词频的数量*/
	map<string,int>  word_count;  
    map<string,int>::iterator mapIter;

	pos1=0;
	/*1.txt存储读取文件*/
	infile.open("1.txt");
	if (!infile)
		cout<<"can not open file"<<endl;
	
	/*录入每个自然段*/
	while(true){
		infile>>sentence;
		/*判断是否读到文件末尾，如果读到文件末尾，则跳出while()循环*/	
			if(infile.eof()) 
				break;	
			/*录入每段中的词语*/
			while(true){
				pos2=sentence.find("/",pos1);
				word=sentence.substr(pos1,pos2-pos1);
				s.insert(word);
				pos1=pos2+1;
				if((int)pos2==-1)
					break;
		      }
				
	}
	infile.close();
	for(si=s.begin();si!=s.end();si++){  
        word=*si;
		word_count[word]=0; 
		
    }
	/*以上操作统计了文章中的词语种类，接下来要做的是统计每种词语出现次数*/
	infile.open("1.txt");
	/*录入每个自然段*/
	while(true){
		infile>>sentence;
		/*判断是否读到文件末尾，如果读到文件末尾，则跳出while()循环*/	
			if(infile.eof()) 
				break;			
		/*录入每段中的词语*/
			while(true){
				pos2=sentence.find("/",pos1);
				word=sentence.substr(pos1,pos2-pos1);
				word_count[word]=word_count[word]+1;
				pos1=pos2+1;
				if((int)pos2==-1)
					break;
		      }		
	}
	infile.close();
	/*下面将map赋值给vector，再对vector中元素以value从大到小的顺序排序*/

	vector<PAIR> word_count_vec(word_count.begin(), word_count.end());
	vector<PAIR>::const_iterator vecIter;
	sort(word_count_vec.begin(), word_count_vec.end(), CmpByValue());

	cout << "统计结果"<<endl;
	cout <<"*******************************************************"<<endl;
	cout <<"**       词语                          词频          **"<<endl;
	for(vecIter=word_count_vec.begin();vecIter!=word_count_vec.end();vecIter++){
		cout <<"**"<<order<<"    "<<(*vecIter).first<<"                          "<<(*vecIter).second<<"          **"<<endl;
		order++;
	}
	cout <<"********************************************************"<<endl;

	system("pause");
	return 0;
}

更改了上面的一个bug，又增加了读取一个文件夹中的多个txt文件的功能。

//////////////////////////////////////////////////
//创建者：李航前
//创建时间：2014.7.19
//文件内容：读取文件夹中的txt文件
//////////////////////////////////////////////////

#include <string>
#include <iostream>
#include <set>
#include <vector>
#include <map>
#include <fstream>
#include <algorithm>
#include <io.h>
using namespace std;

typedef pair<int, string> PAIR;  
typedef pair<int,string>  FPAIR;

int main(){
	 _finddata_t fileDir;
	 char* dir="temp\\*.txt";
	 long lfDir;
	 int fi=0;
	 string  file_name_str;


	ifstream infile;
	ofstream outfile;
	string sentence;
	unsigned int pos1,pos2;
	string word;
	int order=1;

	/*定义set容器来存储sentence里的单词种类*/
	set<string> s;
    set<string>::iterator si; 

	/*定义map容器来存储统计词频的数量*/
	map<string,int>  word_count;  
    map<string,int>::iterator mapIter;

	pos1=0;
	/*读取文件夹temp中文件并存储在vector数组中（由于不会使用vector二维数组，现在暂时任然使用pair结构，增加了一个没用的int键值）*/
	vector<FPAIR> file_name;
	vector<FPAIR>::iterator fileIter;
	if((lfDir = _findfirst(dir,&fileDir))==-1l)
        printf("No file is found\n");
    else{
        do{
			file_name.push_back(make_pair<int,string>(fi,fileDir.name));
			fi++;
        }while( _findnext( lfDir, &fileDir ) == 0 );
    }
	_findclose(lfDir);
	for(fileIter=file_name.begin();fileIter!=file_name.end();fileIter++){		
		file_name_str="temp/"+(*fileIter).second;
		infile.open(file_name_str);
		if (!infile)
			cout<<"can not open file"<<endl;
		/*录入每个自然段*/
		while(true){
			infile>>sentence;			
				/*录入每段中的词语*/
				while(true){
					pos2=sentence.find("/",pos1);
					word=sentence.substr(pos1,pos2-pos1);
					s.insert(word);
					pos1=pos2+1;
					if((int)pos2==-1)
						break;
				  }
				/*判断是否读到文件末尾，如果读到文件末尾，则跳出while()循环*/	
				if(infile.eof()) 
					break;	
		}

		infile.close();

	}
		for(si=s.begin();si!=s.end();si++){  
			word=*si;
			word_count[word]=0; 
		}
		/*以上操作统计了文章中的词语种类，接下来要做的是统计每种词语出现次数*/
	for(fileIter=file_name.begin();fileIter!=file_name.end();fileIter++){
		file_name_str="temp/"+(*fileIter).second;
		infile.open(file_name_str);
		/*录入每个自然段*/
		while(true){
			infile>>sentence;				
			/*录入每段中的词语*/
				while(true){
					pos2=sentence.find("/",pos1);
					word=sentence.substr(pos1,pos2-pos1);
					word_count[word]=word_count[word]+1;
					pos1=pos2+1;
					if((int)pos2==-1)
						break;
				  }
				/*判断是否读到文件末尾，如果读到文件末尾，则跳出while()循环*/	
				if(infile.eof()) 
					break;		
		}
		infile.close();
	}
	/*下面将map赋值给vector，再对vector中元素以value从大到小的顺序排序*/
	vector<PAIR> word_count_vec;
	for(mapIter=word_count.begin();mapIter!=word_count.end();mapIter++){
		word_count_vec.push_back(make_pair<int,string>((*mapIter).second,(*mapIter).first));
	}	
	vector<PAIR>::const_iterator vecIter;
	sort(word_count_vec.begin(), word_count_vec.end(),greater<PAIR>());

	cout << "统计结果"<<endl;
	cout <<"*******************************************************"<<endl;
	cout <<"**       词语                          词频          **"<<endl;
	for(vecIter=word_count_vec.begin();vecIter!=word_count_vec.end();vecIter++){
		cout <<"**"<<order<<"    "<<(*vecIter).second<<"                          "<<(*vecIter).first<<"          **"<<endl;
		order++;
	}
	cout <<"********************************************************"<<endl;
	system("pause");
	return 0;
}

更改了一下代码，并用二分查找完善了一下程序

//////////////////////////////////////////////////
//创建人：李航前
//时间：2014.8.17
//内容：tf*idf练习
//////////////////////////////////////////////////

#include <iostream>
#include <set>
#include <string>
#include <io.h>
#include <fstream>
#include <vector>
#include <algorithm>
#include <map>

using namespace std;

typedef pair<double,string> PAIR;    
    
void ReadFile(set<string>& a,char* dir){
	//读取dir文件夹中所有txt文件名,并把文件名存储到set数组中
	 _finddata_t fileDir;  
     //char* dir="temp\\*.txt";  
     long lfDir;
    if((lfDir = _findfirst(dir,&fileDir))==-1l)  
        printf("No file is found\n");  
    else{  
        do{  
            a.insert(fileDir.name);  
        }while( _findnext( lfDir, &fileDir ) == 0 );  
    }  
    _findclose(lfDir);  
}

/**
 * Appends every space-separated word of the text file `filename` to `s`,
 * in file order.
 *
 * Blank lines and repeated spaces do not produce empty words. If the file
 * cannot be opened, `s` is left unchanged.
 */
void ReadWord(const std::string &filename,std::vector<std::string> &s){
	std::ifstream infile(filename.c_str());
	std::string sentence; // one line of the file
	/* Testing getline itself ends the loop on end of file and also when the
	   stream never opened. */
	while(std::getline(infile,sentence)){
		/* Drop the carriage return left over from a CRLF line ending. */
		if(!sentence.empty() && sentence[sentence.size()-1]=='\r')
			sentence.erase(sentence.size()-1);
		/* size_type can hold std::string::npos without truncation. */
		std::string::size_type pos1=0;
		while(pos1<sentence.size()){
			std::string::size_type pos2=sentence.find(' ',pos1);
			if(pos2==std::string::npos)
				pos2=sentence.size();
			if(pos2>pos1)
				s.push_back(sentence.substr(pos1,pos2-pos1));
			pos1=pos2+1;
		}
	}
}

/**
 * Binary search for `word` in `s`, which must be sorted in ascending order.
 *
 * Returns 0 when `word` is present and 1 when it is absent. Note that this
 * is the opposite of a boolean "found" flag.
 *
 * Both arguments are taken by const reference, so const containers and
 * temporaries can be passed.
 */
int BinSearch(const std::string &word,const std::vector<std::string> &s){
	return std::binary_search(s.begin(),s.end(),word) ? 0 : 1;
}

/**
 * Reads the text file `filename` and, for every space-separated word that
 * is not in the stop list, increments that word's count in `s`.
 *
 * `stop` must be sorted in ascending order because it is searched with a
 * binary search. Blank lines and repeated spaces do not produce empty
 * words. If the file cannot be opened, `s` is left unchanged.
 */
void ReadTest(std::map<std::string,int> &s,const std::string &filename ,const std::vector<std::string> &stop){
	std::ifstream infile(filename.c_str());
	std::string sentence; // one line of the file
	/* Testing getline itself ends the loop on end of file and also when the
	   stream never opened. */
	while(std::getline(infile,sentence)){
		/* Drop the carriage return left over from a CRLF line ending. */
		if(!sentence.empty() && sentence[sentence.size()-1]=='\r')
			sentence.erase(sentence.size()-1);
		/* size_type can hold std::string::npos without truncation. */
		std::string::size_type pos1=0;
		while(pos1<sentence.size()){
			std::string::size_type pos2=sentence.find(' ',pos1);
			if(pos2==std::string::npos)
				pos2=sentence.size();
			if(pos2>pos1){
				const std::string word=sentence.substr(pos1,pos2-pos1);
				if(!std::binary_search(stop.begin(),stop.end(),word))
					++s[word];
			}
			pos1=pos2+1;
		}
	}
}
int main(){
	set<string> fileName;
	set<string>::iterator fileIter;
	ReadFile(fileName,"temp\\*.txt");
	string file_name_str;

	/******************************************************************/
	/*下面结构将语料库的各个txt文件读取后存储在多个二维数组中，*/
	vector< vector<string>> array;
	vector<vector<string>>::iterator arrayIter;
	vector<string> line;
	vector<string>::iterator lineIter;
	int i=0;
	for(fileIter=fileName.begin();fileIter!=fileName.end();fileIter++){
		array.push_back(line);
		file_name_str="temp\\"+*fileIter;
		ReadWord(file_name_str,array[i]);
		sort(array[i].begin(),array[i].end());
		for(lineIter=array[i].begin();lineIter!=array[i].end()-1;){
			if(*lineIter==*(lineIter+1))
				lineIter=array[i].erase(lineIter);
			else
				++lineIter;
		}
		i=i+1;
	}
	/******************************************************************/
	/*读取停用词表，并存储在stop容器中*/
	vector<string> stop;
	vector<string>::iterator stopIter;
	string stopname;
	stopname="stop.txt";
	ReadWord(stopname,stop);
	sort(stop.begin(),stop.end());
	
	/******************************************************************/
	/*读取test.txt的文件，吧所有词通过过滤词表后存储在test中*/
	map<string,int> test;
	map<string,int>::iterator testIter;
	string test_file_name;
	test_file_name="test.txt";
	ReadTest(test,test_file_name,stop);

	/******************************************************************/
	/*下面是idf方面的工作*/
	map<string,int> idf;
	map<string,int>::iterator idfIter;
	int z;
	string word;
	int size;
	
	for(testIter=test.begin();testIter!=test.end();testIter++){
		word=(*testIter).first;
		z=0;
		for(arrayIter=array.begin();arrayIter!=array.end();arrayIter++,z++){
			if(z==array.size()){
					break;
			}
			
			if(BinSearch(word,array[z])==0){
				idf[(*testIter).first]++;
			}
		}
	}
	//map<string,double> result;
	//map<string,double>::iterator rIter;
	//double end;
	/*for(idfIter=idf.begin(),testIter=test.begin();idfIter!=idf.end(),testIter!=test.end();idfIter++,testIter++){		
		end=(*testIter).second*log((double)(array.size()/(*idfIter).second));

		result[(*idfIter).first]=end;
		//cout<<(*idfIter).first<<"***********"<<end<<endl;
	}
	for(rIter=result.begin();rIter!=result.end();rIter++){
		cout<<(*rIter).first<<"*****************"<<(*rIter).second<<endl;
	}*/
	vector<PAIR> result;
	vector<PAIR>::iterator rIter;
	double end;
	for(idfIter=idf.begin(),testIter=test.begin();idfIter!=idf.end(),testIter!=test.end();idfIter++,testIter++){		
		end=(double)(*testIter).second*log((double)(array.size()/(*idfIter).second));
		result.push_back(make_pair<double,string>(end,(*idfIter).first));
	}
	sort(result.begin(),result.end(),greater<PAIR>());
	for(rIter=result.begin();rIter!=result.end();rIter++){
		cout<<(*rIter).first<<"************"<<(*rIter).second<<endl;
	}
	system("pause");
}