// Count the total number of words in an article with C/C++, and count how many times each repeated word occurs.
#include<math.h>
#include<string.h>
#include<algorithm>
#include<cstddef>
#include<fstream>
#include<iomanip>
#include<ios>
#include<iostream>
#include<istream>
#include<iterator>
#include<ostream>
#include<string>
#include<unordered_map>
#include<vector>
using namespace std;
namespace {

// Text file whose words are counted.
const char* const kInputPath = "/home/hx/Desktop/Cpp_project/test.txt";

// One distinct word together with the number of times it has been seen.
struct WordCount
{
    std::string word;
    std::size_t count;
};

// Returns true for ASCII characters in the range 32..127 that are not
// letters (blank, punctuation, digits, DEL).  Bytes outside that range,
// e.g. the bytes of a UTF-8 multi-byte sequence, are treated as part of a
// word and are never stripped.
bool is_non_letter_ascii(char c)
{
    const unsigned char u = static_cast<unsigned char>(c);
    if (u < 32 || u > 127)
        return false;
    const bool upper = (u >= 'A' && u <= 'Z');
    const bool lower = (u >= 'a' && u <= 'z');
    return !(upper || lower);
}

// Strips non-letter ASCII characters from both ends of `token` and returns
// what is left (possibly an empty string).  Characters inside the word are
// kept untouched, so "long-division" and "message+CRC" stay single words
// while "(message+CRC)" becomes "message+CRC" and "*or*" becomes "or".
std::string trim_edge_punctuation(const std::string& token)
{
    std::size_t first = 0;
    std::size_t last = token.size();
    while (first < last && is_non_letter_ascii(token[first]))
        ++first;
    while (last > first && is_non_letter_ascii(token[last - 1]))
        --last;
    return token.substr(first, last - first);
}

}  // namespace

// Reads the text file at kInputPath, splits it into whitespace-separated
// tokens, strips punctuation from the ends of every token and prints
//   1. the total number of (non-empty) words, and
//   2. every distinct word with its number of occurrences, in the order in
//      which the words first appear in the file.
// Comparison is case-sensitive.  Returns 0 on success and 1 when the input
// file cannot be opened.
int main()
{
    // Read-only access is enough; the stream closes itself on scope exit.
    std::ifstream input(kInputPath, std::ios::in | std::ios::binary);
    if (!input)
    {
        std::cerr << "Cannot open " << kInputPath << '\n';
        return 1;
    }

    std::vector<WordCount> tally;                          // distinct words, first-seen order
    std::unordered_map<std::string, std::size_t> slot_of;  // word -> index into tally
    std::size_t total = 0;                                 // all counted words, duplicates included

    // operator>> skips any run of whitespace (blanks, tabs, line breaks), so
    // consecutive separators never produce empty tokens and there is no
    // fixed limit on the number or the length of the words.
    std::string token;
    while (input >> token)
    {
        const std::string word = trim_edge_punctuation(token);
        if (word.empty())
            continue;  // token consisted of punctuation/digits only

        ++total;
        const auto found = slot_of.find(word);
        if (found == slot_of.end())
        {
            slot_of.emplace(word, tally.size());
            tally.push_back(WordCount{word, 1});
        }
        else
        {
            ++tally[found->second].count;
        }
    }

    std::cout << "Total vocabulary number:" << total << '\n';
    for (const WordCount& entry : tally)
    {
        // Word left-aligned in 32 columns, count right-aligned in 6 columns.
        std::cout << std::setfill(' ') << std::left << std::setw(32) << entry.word
                  << std::setfill(' ') << std::right << std::setw(6) << entry.count << '\n';
    }
    return 0;
}