// 我继续完善词频统计的工作:1)该文件可以读入temp中的各个文章,
// 2)读入stop文件夹中的过滤词汇,
// 3)将输出结果反应在result.txt文件中
//
//创建者:李航前
//创建时间:2014.7.19
//文件内容:读取文件中txt文件
//
//存在问题:1)模块越来越多,下一步实现分治。
//
//
#include <string>
#include <iostream>
#include <set>
#include <vector>
#include <map>
#include <fstream>
#include <algorithm>
#include <io.h>
using namespace std;
typedef pair<int, string> PAIR;
int main(){
    /*
     * Word-frequency statistics:
     *  1) read every *.txt article found in temp\,
     *  2) read the stop-word lists found in stop\,
     *  3) write every remaining word with its frequency, sorted by
     *     descending count, to result.txt.
     */

    /* ---- collect the article file names from temp\ ---- */
    set<string> file_name;
    {
        _finddata_t fileDir;
        long lfDir = _findfirst("temp\\*.txt", &fileDir);
        if (lfDir == -1L) {
            printf("No file is found\n");
        } else {
            do {
                file_name.insert(fileDir.name);
            } while (_findnext(lfDir, &fileDir) == 0);
            _findclose(lfDir);  /* close only a handle that was actually opened */
        }
    }

    /* ---- collect the stop-word file names from stop\ ---- */
    set<string> stop_file_name;
    {
        _finddata_t stop_fileDir;
        long stop_lfDir = _findfirst("stop\\*.txt", &stop_fileDir);
        if (stop_lfDir == -1L) {
            printf("No stop file is found\n");
        } else {
            do {
                stop_file_name.insert(stop_fileDir.name);
            } while (_findnext(stop_lfDir, &stop_fileDir) == 0);
            _findclose(stop_lfDir);
        }
    }

    /* ---- load all stop words into one set (O(log n) membership test) ---- */
    set<string> stop_word_set;
    for (set<string>::iterator it = stop_file_name.begin();
         it != stop_file_name.end(); ++it) {
        string path = "stop\\" + *it;
        ifstream stop_infile(path.c_str());
        if (!stop_infile) {
            cout << "stop file can not open file" << endl;
            continue;  /* a failed stream never reaches eof(): skip it */
        }
        string stop_word;
        while (getline(stop_infile, stop_word))  /* stops cleanly at EOF */
            stop_word_set.insert(stop_word);
    }

    /* ---- single pass: tokenise every article and count the words ----
       (the old code read every file twice: once to list the distinct
        words and once more to count them) */
    map<string, int> word_count;
    for (set<string>::iterator it = file_name.begin();
         it != file_name.end(); ++it) {
        string path = "temp\\" + *it;  /* old code mixed "temp\\" and "temp/" */
        ifstream infile(path.c_str());
        if (!infile) {
            cout << "can not open file" << endl;
            continue;
        }
        string sentence;
        while (getline(infile, sentence)) {
            /* split each line on single spaces */
            string::size_type pos1 = 0;
            while (true) {
                string::size_type pos2 = sentence.find(' ', pos1);
                string word = sentence.substr(pos1, pos2 - pos1);
                /* skip empty tokens (runs of spaces) and stop words;
                   set::count replaces the old O(|stop|) linear scan */
                if (!word.empty() && stop_word_set.count(word) == 0)
                    ++word_count[word];
                if (pos2 == string::npos)
                    break;
                pos1 = pos2 + 1;
            }
        }
    }

    /* echo the distinct words (map keys are already in sorted order) */
    for (map<string, int>::iterator m = word_count.begin();
         m != word_count.end(); ++m)
        cout << m->first;

    /* ---- sort (count, word) pairs by descending count ---- */
    vector< pair<int, string> > word_count_vec;
    word_count_vec.reserve(word_count.size());
    for (map<string, int>::iterator m = word_count.begin();
         m != word_count.end(); ++m)
        /* no explicit template arguments: make_pair<int,string>(...)
           does not compile under C++11 and later */
        word_count_vec.push_back(make_pair(m->second, m->first));
    sort(word_count_vec.begin(), word_count_vec.end(),
         greater< pair<int, string> >());

    /* ---- write the report ---- */
    ofstream fout("result.txt");
    fout << "统计结果" << endl;
    fout << "*******************************************************" << endl;
    fout << "** 词语 词频 **" << endl;
    int order = 1;
    for (vector< pair<int, string> >::const_iterator v = word_count_vec.begin();
         v != word_count_vec.end(); ++v) {
        fout << "**" << order << " " << v->second << " " << v->first << " **" << endl;
        ++order;
    }
    fout << "********************************************************" << endl;
    cout << "执行结束" << endl;
    system("pause");
    return 0;
}
//
//创建人:李航前
//时间:2014.8.17
//内容:tf*idf练习
//
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <io.h>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace std;
typedef pair<double,string> PAIR;
void ReadFile(set<string>& a,char* dir){
    // Collect into `a` the names of all files matching the pattern `dir`
    // (e.g. "temp\\*.txt").  Names are stored without any path prefix.
    // a   : output set receiving each matching file name.
    // dir : search pattern understood by _findfirst.
    _finddata_t fileDir;
    long lfDir = _findfirst(dir, &fileDir);
    if (lfDir == -1L) {
        printf("No file is found\n");
        return;  // nothing was opened, so there is no handle to close
    }
    do {
        a.insert(fileDir.name);
    } while (_findnext(lfDir, &fileDir) == 0);
    // The old code called _findclose even when _findfirst had failed;
    // close only a valid search handle.
    _findclose(lfDir);
}
void ReadWord(string &filename,vector<string> &s){
    // Read the space-separated tokens of the text file `filename`,
    // appending each token to `s` in file order.
    // A line "a b" yields tokens "a" and "b"; consecutive spaces yield
    // empty tokens and an empty line yields one empty token, matching
    // the original splitting rule.
    ifstream infile(filename.c_str());
    if (!infile) {
        // A failed stream never sets eof(), so the old while(true)/eof()
        // loop spun forever appending "" when the file could not be opened.
        return;
    }
    string sentence;  // one whole line (natural paragraph) at a time
    while (getline(infile, sentence)) {  // stops cleanly at EOF
        string::size_type pos1 = 0;
        while (true) {
            string::size_type pos2 = sentence.find(' ', pos1);
            s.push_back(sentence.substr(pos1, pos2 - pos1));
            if (pos2 == string::npos)
                break;  // last token of the line consumed
            pos1 = pos2 + 1;
        }
    }
}
int BinSearch(string &word,vector<string> &s){
    /* Binary search over the sorted word list `s`.
       Returns 0 when `word` is present and 1 when it is not
       (callers treat a non-zero result as "keep this word"). */
    int lo = 0;
    int hi = (int)s.size() - 1;  /* -1 for an empty list: loop never runs */
    while (lo <= hi) {
        int probe = lo + (hi - lo) / 2;
        if (s[probe] == word)
            return 0;            /* found */
        if (s[probe] > word)
            hi = probe - 1;      /* continue in the left half */
        else
            lo = probe + 1;      /* continue in the right half */
    }
    return 1;                    /* not present */
}
void ReadTest(map<string,int> &s,string &filename ,vector<string> &stop){
    // Tokenise the text file `filename` on single spaces and count,
    // into the map `s`, every token that is NOT in the sorted stop-word
    // list `stop` (membership tested with BinSearch, which returns 0
    // when the word is found).
    // s        : output map word -> occurrence count.
    // filename : file to tokenise.
    // stop     : stop-word list; must already be sorted ascending.
    ifstream infile(filename.c_str());
    if (!infile) {
        // A failed stream never sets eof(); the old while(true)/eof()
        // loop would spin forever when the file could not be opened.
        return;
    }
    string sentence;
    while (getline(infile, sentence)) {  // stops cleanly at EOF
        string::size_type pos1 = 0;
        while (true) {
            string::size_type pos2 = sentence.find(' ', pos1);
            string word = sentence.substr(pos1, pos2 - pos1);
            if (BinSearch(word, stop)) {  // non-zero == not a stop word
                s[word]++;
            }
            if (pos2 == string::npos)
                break;
            pos1 = pos2 + 1;
        }
    }
}
int main(){
    /*
     * tf*idf exercise:
     *  - read every corpus document in temp\ into its own sorted,
     *    de-duplicated word list (used as a document-frequency index),
     *  - read the stop-word list stop.txt,
     *  - tokenise test.txt, drop stop words, count term frequencies,
     *  - score each remaining word with tf * log(N / df) and print the
     *    words in descending score order.
     */
    set<string> fileName;
    set<string>::iterator fileIter;
    ReadFile(fileName, "temp\\*.txt");
    string file_name_str;

    /* one sorted, duplicate-free word list per corpus document */
    vector< vector<string> > array;
    for (fileIter = fileName.begin(); fileIter != fileName.end(); ++fileIter) {
        array.push_back(vector<string>());
        vector<string> &doc = array.back();
        file_name_str = "temp\\" + *fileIter;
        ReadWord(file_name_str, doc);
        sort(doc.begin(), doc.end());
        /* std::unique is safe on an empty list; the old manual erase
           loop evaluated end()-1 and dereferenced begin() on an empty
           vector, which is undefined behaviour */
        doc.erase(unique(doc.begin(), doc.end()), doc.end());
    }

    /* stop-word list, sorted so BinSearch can be used on it */
    vector<string> stop;
    string stopname = "stop.txt";
    ReadWord(stopname, stop);
    sort(stop.begin(), stop.end());

    /* term frequencies of test.txt with stop words filtered out */
    map<string, int> test;
    map<string, int>::iterator testIter;
    string test_file_name = "test.txt";
    ReadTest(test, test_file_name, stop);

    /* document frequency: in how many corpus documents does each
       test word occur?  Words occurring in no document get no entry. */
    map<string, int> idf;
    string word;
    for (testIter = test.begin(); testIter != test.end(); ++testIter) {
        word = testIter->first;  /* copy: BinSearch takes a non-const ref */
        int df = 0;
        for (vector< vector<string> >::size_type d = 0; d < array.size(); ++d)
            if (BinSearch(word, array[d]) == 0)  /* 0 means "found" */
                ++df;
        if (df > 0)
            idf[word] = df;
    }

    /* tf*idf score for every test word that occurs in the corpus */
    vector< pair<double, string> > result;
    for (testIter = test.begin(); testIter != test.end(); ++testIter) {
        map<string, int>::iterator dfIter = idf.find(testIter->first);
        if (dfIter == idf.end())
            continue;  /* df == 0: idf is undefined; the old lockstep walk
                          over two differently-keyed maps ran off the end
                          of idf (UB) in exactly this case */
        /* cast BEFORE dividing: the old code did integer division first,
           truncating N/df before taking the log */
        double score = (double)testIter->second
                     * log((double)array.size() / (double)dfIter->second);
        /* no explicit template args: make_pair<double,string>(...)
           does not compile under C++11 and later */
        result.push_back(make_pair(score, testIter->first));
    }
    sort(result.begin(), result.end(), greater< pair<double, string> >());
    for (vector< pair<double, string> >::iterator rIter = result.begin();
         rIter != result.end(); ++rIter)
        cout << rIter->first << "************" << rIter->second << endl;
    system("pause");
}