最近在做词频统计,先贴出一个简单的英文单词切分与计数程序:
#include <map>
#include <iostream>
#include <string>
using namespace std;
int main(){
    // Number of occurrences of every distinct whitespace-separated token typed on stdin.
    std::map<std::string,int> word_count;
    std::cout <<"请输入一句话:";
    // Read tokens until end of input; a label is echoed for each token read.
    for (std::string word; std::cin>>word; ) {
        std::cout <<"收录词语:";
        word_count[word] += 1;
    }
    // Report: one row per distinct token, in the map's sorted key order.
    std::cout<<"*****************************************"<<std::endl;
    std::cout <<"** 单词种类 单词数目 **"<<std::endl;
    for (const auto& entry : word_count) {
        std::cout <<"** "<<entry.first<<" "<<entry.second<<" **"<<std::endl;
    }
    // Keep the console window open (Windows shell command).
    system("pause");
    return 0;
}
上面的程序里,切分单词的工作其实是 cin>>word 按空白字符完成的,map 又直接把计数做完了;为了练习,下面绕过map,改用set来统计词语的种类(set只能去重,不能计数)。
#include <iostream>
#include <string>
#include <set>
using namespace std;
int main(){
    /* Distinct words seen on stdin; std::set removes duplicates and keeps them sorted. */
    std::set<std::string> s;
    std::cout <<"输入一句已经分好词的汉语"<<std::endl;
    /* operator>> already splits the input on whitespace, so every extracted token is one
       word. The former find(" ")/substr step could never find a space inside a token and
       only worked by truncating npos into an unsigned int, so it has been removed. */
    std::string word;
    while (std::cin>>word) {
        s.insert(word);
    }
    std::cout << "统计结果"<<std::endl;
    std::cout <<"*************************"<<std::endl;
    std::cout <<"** 词语 **"<<std::endl;
    for (const auto& entry : s) {
        std::cout <<"** "<<entry<<" **"<<std::endl;
    }
    std::cout <<"*************************"<<std::endl;
    /* Keep the console window open (Windows shell command). */
    system("pause");
    return 0;
}
经过了修改,已经不光可以统计文本词语的种类,而且可以统计词语的数量。
统计文本:
我/与/父亲/不/相见/已/二年/余/了/,/我/最/不能/忘记/的/是/他的/背影/。
那年/冬天/,/祖母/死了/,/父亲/的/差使/也/交卸/了/,/正是/祸不单行/的/日子/。/我/从/北京/到/徐州/,/打算/跟着/父亲/奔丧/回家/。/到/徐州/见着/父亲/,/看见/满院狼藉/的/东西/,/又/想起/祖母/,/不禁/簌簌地/流下/眼泪/。/父亲/说/:/“/事已如此/,/不必/难过/,/好在/天无绝人之路/!/”/
#include <string>
#include <iostream>
#include <set>
#include <map>
#include <fstream>
using namespace std;
int main(){
    /* 1.txt holds the pre-segmented text: words are separated by '/'. */
    std::ifstream infile("1.txt");
    if (!infile) {
        /* Stop here: reading from a stream that failed to open never sets eof(),
           so carrying on would loop forever. */
        std::cout<<"can not open file"<<std::endl;
        system("pause");
        return 1;
    }
    /* word -> number of occurrences; std::map keeps the words sorted for the report.
       A single pass is enough because operator[] creates missing entries with count 0. */
    std::map<std::string,int> word_count;
    std::string sentence;
    /* Testing the extraction itself (instead of eof() afterwards) keeps the last
       whitespace-delimited chunk even when the file does not end with whitespace. */
    while (infile>>sentence) {
        std::string::size_type start = 0;
        while (true) {
            const std::string::size_type slash = sentence.find('/', start);
            const std::string::size_type length =
                (slash == std::string::npos) ? std::string::npos : slash - start;
            const std::string word = sentence.substr(start, length);
            /* A trailing or doubled '/' yields an empty piece, which is not a word. */
            if (!word.empty())
                ++word_count[word];
            if (slash == std::string::npos)
                break;
            start = slash + 1;
        }
    }
    infile.close();
    std::cout << "统计结果"<<std::endl;
    std::cout <<"*******************************************************"<<std::endl;
    std::cout <<"** 词语 词频 **"<<std::endl;
    /* One numbered row per distinct word, in sorted key order. */
    int order = 1;
    for (const auto& entry : word_count) {
        std::cout <<"**"<<order<<" "<<entry.first<<" "<<entry.second<<" **"<<std::endl;
        ++order;
    }
    std::cout <<"********************************************************"<<std::endl;
    /* Keep the console window open (Windows shell command). */
    system("pause");
    return 0;
}
上面的代码还无法按词频对词语排序,经过再次修改,下面的版本已经可以按词频从高到低对词语进行排序。
#include <string>
#include <iostream>
#include <set>
#include <vector>
#include <map>
#include <fstream>
#include <algorithm>
using namespace std;
// (word, occurrence count) entry, as copied out of the word-count map.
typedef std::pair<std::string, int> PAIR;
// Strict-weak-ordering predicate that places entries with a larger count first.
// operator() is const so the comparator can also be invoked through a const object.
struct CmpByValue {
    bool operator()(const PAIR& lhs, const PAIR& rhs) const {
        return lhs.second > rhs.second;
    }
};
int main(){
ifstream infile;
ofstream outfile;
string sentence;
unsigned int pos1,pos2;
string word;
int order=1;
/*定义set容器来存储sentence里的单词种类*/
set<string> s;
set<string>::iterator si;
/*定义map容器来存储统计词频的数量*/
map<string,int> word_count;
map<string,int>::iterator mapIter;
pos1=0;
/*1.txt存储读取文件*/
infile.open("1.txt");
if (!infile)
cout<<"can not open file"<<endl;
/*录入每个自然段*/
while(true){
infile>>sentence;
/*判断是否读到文件末尾,如果读到文件末尾,则跳出while()循环*/
if(infile.eof())
break;
/*录入每段中的词语*/
while(true){
pos2=sentence.find("/",pos1);
word=sentence.substr(pos1,pos2-pos1);
s.insert(word);
pos1=pos2+1;
if((int)pos2==-1)
break;
}
}
infile.close();
for(si=s.begin();si!=s.end();si++){
word=*si;
word_count[word]=0;
}
/*以上操作统计了文章中的词语种类,接下来要做的是统计每种词语出现次数*/
infile.open("1.txt");
/*录入每个自然段*/
while(true){
infile>>sentence;
/*判断是否读到文件末尾,如果读到文件末尾,则跳出while()循环*/
if(infile.eof())
break;
/*录入每段中的词语*/
while(true){
pos2=sentence.find("/",pos1);
word=sentence.substr(pos1,pos2-pos1);
word_count[word]=word_count[word]+1;
pos1=pos2+1;
if((int)pos2==-1)
break;
}
}
infile.close();
/*下面将map赋值给vector,再对vector中元素以value从大到小的顺序排序*/
vector<PAIR> word_count_vec(word_count.begin(), word_count.end());
vector<PAIR>::const_iterator vecIter;
sort(word_count_vec.begin(), word_count_vec.end(), CmpByValue());
cout << "统计结果"<<endl;
cout <<"*******************************************************"<<endl;
cout <<"** 词语 词频 **"<<endl;
for(vecIter=word_count_vec.begin();vecIter!=word_count_vec.end();vecIter++){
cout <<"**"<<order<<" "<<(*vecIter).first<<" "<<(*vecIter).second<<" **"<<endl;
order++;
}
cout <<"********************************************************"<<endl;
system("pause");
return 0;
}
修复了上面代码中的一个bug,又增加了读取一个文件夹中多个txt文件的功能。
//
//创建者:李航前
//创建时间:2014.7.19
//文件内容:读取文件中txt文件
//
#include <string>
#include <iostream>
#include <set>
#include <vector>
#include <map>
#include <fstream>
#include <algorithm>
#include <io.h>
using namespace std;
typedef pair<int, string> PAIR;
typedef pair<int,string> FPAIR;
int main(){
_finddata_t fileDir;
char* dir="temp\\*.txt";
long lfDir;
int fi=0;
string file_name_str;
ifstream infile;
ofstream outfile;
string sentence;
unsigned int pos1,pos2;
string word;
int order=1;
/*定义set容器来存储sentence里的单词种类*/
set<string> s;
set<string>::iterator si;
/*定义map容器来存储统计词频的数量*/
map<string,int> word_count;
map<string,int>::iterator mapIter;
pos1=0;
/*读取文件夹temp中文件并存储在vector数组中(由于不会使用vector二维数组,现在暂时任然使用pair结构,增加了一个没用的int键值)*/
vector<FPAIR> file_name;
vector<FPAIR>::iterator fileIter;
if((lfDir = _findfirst(dir,&fileDir))==-1l)
printf("No file is found\n");
else{
do{
file_name.push_back(make_pair<int,string>(fi,fileDir.name));
fi++;
}while( _findnext( lfDir, &fileDir ) == 0 );
}
_findclose(lfDir);
for(fileIter=file_name.begin();fileIter!=file_name.end();fileIter++){
file_name_str="temp/"+(*fileIter).second;
infile.open(file_name_str);
if (!infile)
cout<<"can not open file"<<endl;
/*录入每个自然段*/
while(true){
infile>>sentence;
/*录入每段中的词语*/
while(true){
pos2=sentence.find("/",pos1);
word=sentence.substr(pos1,pos2-pos1);
s.insert(word);
pos1=pos2+1;
if((int)pos2==-1)
break;
}
/*判断是否读到文件末尾,如果读到文件末尾,则跳出while()循环*/
if(infile.eof())
break;
}
infile.close();
}
for(si=s.begin();si!=s.end();si++){
word=*si;
word_count[word]=0;
}
/*以上操作统计了文章中的词语种类,接下来要做的是统计每种词语出现次数*/
for(fileIter=file_name.begin();fileIter!=file_name.end();fileIter++){
file_name_str="temp/"+(*fileIter).second;
infile.open(file_name_str);
/*录入每个自然段*/
while(true){
infile>>sentence;
/*录入每段中的词语*/
while(true){
pos2=sentence.find("/",pos1);
word=sentence.substr(pos1,pos2-pos1);
word_count[word]=word_count[word]+1;
pos1=pos2+1;
if((int)pos2==-1)
break;
}
/*判断是否读到文件末尾,如果读到文件末尾,则跳出while()循环*/
if(infile.eof())
break;
}
infile.close();
}
/*下面将map赋值给vector,再对vector中元素以value从大到小的顺序排序*/
vector<PAIR> word_count_vec;
for(mapIter=word_count.begin();mapIter!=word_count.end();mapIter++){
word_count_vec.push_back(make_pair<int,string>((*mapIter).second,(*mapIter).first));
}
vector<PAIR>::const_iterator vecIter;
sort(word_count_vec.begin(), word_count_vec.end(),greater<PAIR>());
cout << "统计结果"<<endl;
cout <<"*******************************************************"<<endl;
cout <<"** 词语 词频 **"<<endl;
for(vecIter=word_count_vec.begin();vecIter!=word_count_vec.end();vecIter++){
cout <<"**"<<order<<" "<<(*vecIter).second<<" "<<(*vecIter).first<<" **"<<endl;
order++;
}
cout <<"********************************************************"<<endl;
system("pause");
return 0;
}
更改了一下代码,并用二分查找完善了一下程序
//
//创建人:李航前
//时间:2014.8.17
//内容:tf*idf练习
//
#include <io.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace std;
// (score, word) pair; the score comes first so that comparing pairs orders by score.
typedef pair<double,string> PAIR;
void ReadFile(set<string>& a,char* dir){
//读取dir文件夹中所有txt文件名,并把文件名存储到set数组中
_finddata_t fileDir;
//char* dir="temp\\*.txt";
long lfDir;
if((lfDir = _findfirst(dir,&fileDir))==-1l)
printf("No file is found\n");
else{
do{
a.insert(fileDir.name);
}while( _findnext( lfDir, &fileDir ) == 0 );
}
_findclose(lfDir);
}
// Reads the text file `filename` line by line and appends every space-separated word to `s`.
// Empty pieces (blank lines, consecutive spaces) are skipped because they are not words.
// If the file cannot be opened, a diagnostic goes to std::cerr and `s` is left unchanged.
void ReadWord(const std::string &filename, std::vector<std::string> &s){
    std::ifstream infile(filename.c_str());
    if (!infile) {
        // Reading from an unopened stream never sets eof(), which used to loop forever.
        std::cerr << "can not open file: " << filename << std::endl;
        return;
    }
    std::string sentence;  // one whole line of the file
    // Looping on getline's result avoids processing a phantom empty line after the
    // final newline.
    while (std::getline(infile, sentence)) {
        std::string::size_type start = 0;
        while (true) {
            const std::string::size_type space = sentence.find(' ', start);
            const std::string::size_type length =
                (space == std::string::npos) ? std::string::npos : space - start;
            const std::string word = sentence.substr(start, length);
            if (!word.empty())
                s.push_back(word);
            if (space == std::string::npos)
                break;
            start = space + 1;
        }
    }
}
// Binary search for `word` in `s`, which must already be sorted in ascending order.
// Returns 0 when the word is present and 1 when it is absent (note the inverted sense:
// callers test `== 0` for "found" and use a non-zero result as "not in the list").
// Both parameters are only read, so they are taken by const reference.
int BinSearch(const std::string &word, const std::vector<std::string> &s){
    return std::binary_search(s.begin(), s.end(), word) ? 0 : 1;
}
void ReadTest(map<string,int> &s,string &filename ,vector<string> &stop){
/*读取test文件*/
ifstream infile;
infile.open(filename);
string sentence;
string word;
int pos1,pos2;
pos1=0;
while(true){
getline(infile,sentence);
while(true){
pos2=sentence.find(" ",pos1);
word=sentence.substr(pos1,pos2-pos1);
if(BinSearch(word,stop)){
s[word]++;
}
pos1=pos2+1;
if((int)pos2==-1)
break;
}
if(infile.eof())
break;
}
}
int main(){
set<string> fileName;
set<string>::iterator fileIter;
ReadFile(fileName,"temp\\*.txt");
string file_name_str;
/******************************************************************/
/*下面结构将语料库的各个txt文件读取后存储在多个二维数组中,*/
vector< vector<string>> array;
vector<vector<string>>::iterator arrayIter;
vector<string> line;
vector<string>::iterator lineIter;
int i=0;
for(fileIter=fileName.begin();fileIter!=fileName.end();fileIter++){
array.push_back(line);
file_name_str="temp\\"+*fileIter;
ReadWord(file_name_str,array[i]);
sort(array[i].begin(),array[i].end());
for(lineIter=array[i].begin();lineIter!=array[i].end()-1;){
if(*lineIter==*(lineIter+1))
lineIter=array[i].erase(lineIter);
else
++lineIter;
}
i=i+1;
}
/******************************************************************/
/*读取停用词表,并存储在stop容器中*/
vector<string> stop;
vector<string>::iterator stopIter;
string stopname;
stopname="stop.txt";
ReadWord(stopname,stop);
sort(stop.begin(),stop.end());
/******************************************************************/
/*读取test.txt的文件,吧所有词通过过滤词表后存储在test中*/
map<string,int> test;
map<string,int>::iterator testIter;
string test_file_name;
test_file_name="test.txt";
ReadTest(test,test_file_name,stop);
/******************************************************************/
/*下面是idf方面的工作*/
map<string,int> idf;
map<string,int>::iterator idfIter;
int z;
string word;
int size;
for(testIter=test.begin();testIter!=test.end();testIter++){
word=(*testIter).first;
z=0;
for(arrayIter=array.begin();arrayIter!=array.end();arrayIter++,z++){
if(z==array.size()){
break;
}
if(BinSearch(word,array[z])==0){
idf[(*testIter).first]++;
}
}
}
//map<string,double> result;
//map<string,double>::iterator rIter;
//double end;
/*for(idfIter=idf.begin(),testIter=test.begin();idfIter!=idf.end(),testIter!=test.end();idfIter++,testIter++){
end=(*testIter).second*log((double)(array.size()/(*idfIter).second));
result[(*idfIter).first]=end;
//cout<<(*idfIter).first<<"***********"<<end<<endl;
}
for(rIter=result.begin();rIter!=result.end();rIter++){
cout<<(*rIter).first<<"*****************"<<(*rIter).second<<endl;
}*/
vector<PAIR> result;
vector<PAIR>::iterator rIter;
double end;
for(idfIter=idf.begin(),testIter=test.begin();idfIter!=idf.end(),testIter!=test.end();idfIter++,testIter++){
end=(double)(*testIter).second*log((double)(array.size()/(*idfIter).second));
result.push_back(make_pair<double,string>(end,(*idfIter).first));
}
sort(result.begin(),result.end(),greater<PAIR>());
for(rIter=result.begin();rIter!=result.end();rIter++){
cout<<(*rIter).first<<"************"<<(*rIter).second<<endl;
}
system("pause");
}
(未完待续)