//By Aba
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <map>
#define FILENAMELENGTH 256
#define TYPES 6 //手动更新
using namespace std;
typedef long long Character;
// Abstract source of decoded characters.
class Reader{
public:
    // Virtual so that a concrete reader can be destroyed through a Reader pointer.
    virtual ~Reader(){}
    // Reads the next character from the input.
    // The readers in this file return -1 once the input is exhausted.
    virtual Character read()=0;
};
// Abstract view of per-character occurrence counts.
class Counter{
public:
    // Virtual so that a concrete counter can be destroyed through a Counter pointer.
    virtual ~Counter(){}
    // Number of times ch appeared in the input.
    virtual int sum(Character ch) const=0;
    // Total occurrences of all characters between ch1 and ch2.
    virtual int sum(Character ch1,Character ch2) const=0;
    // Total number of characters counted.
    virtual int sum() const=0;
    // Steps through the distinct characters one call at a time.
    // The reporters in this file treat a return value of -1 as "no more characters".
    virtual Character next() const=0;
};
// Abstract formatter that writes the statistics held by a Counter to a stream.
class Reporter{
public:
    // Virtual so that a concrete reporter can be destroyed through a Reporter pointer.
    virtual ~Reporter(){}
    // Writes a report of the counts in ct to out.
    virtual void report(const Counter& ct,FILE* out) const=0;
};
// Packs the bytes of a NUL-terminated string into one integer, first byte in
// the most significant position (e.g. "AB" -> 0x4142).
// The accumulation is done in unsigned arithmetic so that long strings or
// bytes with the high bit set cannot trigger signed-overflow undefined
// behaviour; bytes that do not fit in a long are shifted out.
long wcode(const char* st){
    unsigned long code=0;
    for(const char* p=st;*p!='\0';++p){
        code=(code<<8)|static_cast<unsigned char>(*p);
    }
    return static_cast<long>(code);
}
// Reads a UTF-8 byte stream one encoded character at a time.
// The reader closes the FILE it is given: at end of input, on close(), or in
// its destructor. Copying a reader is not safe, because both copies would
// close the same FILE.
class UTF8Reader:public Reader{
private:
    FILE *in;   // input file
    bool isava; // true while `in` is open and may be read
    // Closes the input and yields the end-of-input marker.
    int eof(){
        close();
        return -1;
    }
public:
    UTF8Reader(FILE* in){
        this->in=in;
        // Always initialise the flag; leaving it indeterminate for a NULL
        // stream made close() and read() depend on garbage.
        isava=(NULL!=in);
    }
    ~UTF8Reader(){
        close();
    }
    void close(){
        if(isava){
            fclose(in);
            isava=false;
        }
    }
    // Reads one UTF-8 sequence and returns its bytes packed into a Character,
    // lead byte first (most significant). Returns -1 when the reader is not
    // available or the input ends, including in the middle of a sequence.
    Character read(){
        if(!isava)
            return -1;
        int buf=fgetc(in); // lead byte
        if(EOF==buf)
            return eof();
        const int h=buf;
        Character UTF8ch=h;
        if(h&0x80){
            // Every consecutive 1 bit after the top bit of the lead byte
            // announces one continuation byte.
            for(unsigned char i=0x40;i&h;i>>=1){
                UTF8ch<<=8;
                buf=fgetc(in); // continuation byte
                if(EOF==buf)
                    return eof();
                UTF8ch+=buf;
            }
        }
        return UTF8ch;
    }
};
// Writes one line per distinct character of a UTF-8 input:
// the character itself, its packed byte value in hex, and its count.
class UTF8Reporter:public Reporter{
public:
    // Enumerates ct via next() until it returns -1, skipping values below
    // 0x20, and finishes with the total of all reported counts.
    void report(const Counter& ct,FILE* out) const{
        Character wt;
        unsigned int totally=0;
        while((wt=ct.next())!=-1){
            if(wt<0x20)continue; // skip control characters
            // Emit the packed bytes from most to least significant; zero
            // bytes are the unused high part of the value and are skipped.
            for(int shift=56;shift>=0;shift-=8){
                const unsigned char byte=static_cast<unsigned char>((wt>>shift)&0xff);
                if(byte) fputc(byte,out);
            }
            const int thistime=ct.sum(wt);
            // %llx is the standard spelling for a long long argument; the
            // previous %I64x is MSVC-specific and is misread by other C
            // libraries.
            fprintf(out," (%llx): %d\n",static_cast<unsigned long long>(wt),thistime);
            totally+=thistime;
        }
        fprintf(out,"totally: %u\n",totally);
    }
};
// Reads a GB18030 byte stream one encoded character at a time.
// The reader closes the FILE it is given: at end of input, on close(), or in
// its destructor. Copying a reader is not safe, because both copies would
// close the same FILE.
class GB18030Reader:public Reader{
private:
    FILE *in;   // input file
    bool isava; // true while `in` is open and may be read
    // Closes the input and yields the end-of-input marker.
    int eof(){
        close();
        return -1;
    }
public:
    GB18030Reader(FILE* in){
        this->in=in;
        // Always initialise the flag; leaving it indeterminate for a NULL
        // stream made close() and read() depend on garbage.
        isava=(NULL!=in);
    }
    ~GB18030Reader(){
        close();
    }
    void close(){
        if(isava){
            fclose(in);
            isava=false;
        }
    }
    // Reads one character and returns its bytes packed into a Character,
    // first byte most significant. A first byte <= 0x80 is returned as a
    // single-byte character; otherwise a second byte is read, and if that
    // byte lies in 0x30..0x39 two more bytes are read (four-byte form).
    // Returns -1 when the reader is not available or the input ends,
    // including in the middle of a character.
    Character read(){
        if(!isava)
            return -1;
        int buf=fgetc(in); // first byte
        if(EOF==buf)
            return eof();
        if(buf<=0x80)
            return buf;
        unsigned int GB18030ch=buf;
        buf=fgetc(in); // second byte
        if(EOF==buf)
            return eof();
        GB18030ch=(GB18030ch<<8)+buf;
        // The lower bound used to be written as decimal 30, which wrongly
        // treated second bytes 0x1e..0x2f as four-byte markers as well.
        if(buf>=0x30 && buf<=0x39){
            for(int k=0;k<2;++k){ // third and fourth byte
                buf=fgetc(in);
                if(EOF==buf)
                    return eof();
                GB18030ch=(GB18030ch<<8)+buf;
            }
        }
        return GB18030ch;
    }
};
// Writes one line per distinct character of a GB18030 input (the character,
// its packed byte value in hex, and its count), followed by summary totals.
class GB18030Reporter:public Reporter{
public:
    // Enumerates ct via next() until it returns -1, skipping values below
    // 0x20. Four-byte values (>= 0x81308130) and two-byte values inside
    // 0x8140..0xa0fe or 0xaa40..0xfea0 are tallied as Chinese characters;
    // every reported character contributes to the overall total.
    void report(const Counter& ct,FILE* out) const{
        Character wt;
        unsigned int hanzinum=0,hanzicount=0,totally=0;
        while((wt=ct.next())!=-1){
            if(wt<0x20)continue; // skip control chars
            const int thistime=ct.sum(wt); // looked up once, used for every tally
            if(wt<=0x80){
                fputc(static_cast<int>(wt&0xff),out); // one byte
            }
            else{
                const bool fourbytes=(wt>=0x81308130);
                const bool twobytehanzi=!fourbytes &&
                    ((wt>=0x8140&&wt<=0xa0fe) || (wt>=0xaa40&&wt<=0xfea0));
                if(fourbytes){ // leading half of a four-byte character
                    fputc(static_cast<int>((wt>>24)&0xff),out);
                    fputc(static_cast<int>((wt>>16)&0xff),out);
                }
                // Low two bytes: written for every multi-byte value.
                fputc(static_cast<int>((wt>>8)&0xff),out);
                fputc(static_cast<int>(wt&0xff),out);
                if(fourbytes||twobytehanzi){
                    ++hanzinum;
                    hanzicount+=thistime;
                }
            }
            totally+=thistime;
            // %llx is the standard spelling for a long long argument; the
            // previous %I64x is MSVC-specific and is misread by other C
            // libraries.
            fprintf(out," (%llx): %d\n",static_cast<unsigned long long>(wt),thistime);
        }
        fprintf(out,"不同汉字的个数:%u\n总汉字数(不包括全角标点符号):%u\n",hanzinum,hanzicount);
        fprintf(out,"不包括控制字符的总字数:%u\n",totally);
    }
};
class WordCounter:public Counter{
public:
WordCounter(Reader& in){
Character ch;
ch=in.read();
while(ch!=-1){
map<Character,int>::iterator iter=countmap.find(ch);
if(iter!=countmap.end()){
iter->second=iter->second+1;
}
else{
countmap[ch]=1;
}
ch=in.read();
}
}
~WordCounter(){
countmap.clear();
}
int sum(Character ch) const{
map<Character,int>::const_iterator citer=countmap.find(ch);
if(citer==countmap.end()) return 0;
else return citer->second;
}
int sum(Character ch1,Character ch2) const{
Character i;
int sigma=0;
for(Character i=ch1;i<=ch2;i++){
if(sum(i)!=0)break;
}
for(map<Character,int>::const_iterator citer=countmap.find(i);citer!=countmap.end() && citer->first<=ch2;++citer){
sigma+=citer->second;
}
return sigma;
}
int sum() const{
int sigma=0;
map<Character,int>::const_iterator citer;
for(citer=countmap.begin();citer!=countmap.end();++citer){
sigma+=citer->second;
}
return sigma;
}
Character next() const{
static map<Character,int>::const_iterator citer=countmap.begin();
if(citer!=countmap.end())return (citer++)->first;
else return -1;
};
private:
map<Character,int> countmap;
};
// Byte-order-mark table, indexed by file type.
// Element 0 of each row is the BOM length, followed by the BOM bytes.
const unsigned char types[TYPES][6]={
    {0}                       //ANSI (no BOM)           0
    ,{3,0xef,0xbb,0xbf}       //UTF-8                   1
    ,{2,0xfe,0xff}            //UTF-16 (big-endian)     2
    ,{2,0xff,0xfe}            //UTF-16 (little-endian)  3
    ,{4,0x00,0x00,0xfe,0xff}  //UTF-32 (big-endian)     4  (was FE FF 00 00, which is not a UTF-32 BOM)
    ,{4,0xff,0xfe,0x00,0x00}  //UTF-32 (little-endian)  5
};
int putftmark(int t,FILE *out){ //添加标记
if(NULL==out) return -1;
for(int i=1;i<=types[t][0];++i){
fputc(types[t][i],out);
}
return 0;
}
// Detects the file type from the byte order mark at the start of `in`.
// Returns -1 if in is NULL, otherwise the index into the types table of the
// longest BOM that matches, or 0 when none matches.
// On return the stream is positioned just past the detected BOM (at the very
// start of the file for type 0), so a reader can continue from there.
// Choosing the longest match matters because the UTF-16 little-endian BOM
// (FF FE) is a prefix of the UTF-32 little-endian BOM (FF FE 00 00); taking
// the first match reported every such UTF-32 file as UTF-16.
int getfiletype(FILE* in){
    if(NULL==in) return -1;
    int best=0;
    for(int i=1;i<TYPES;++i){
        const int len=types[i][0];
        if(len<=types[best][0]) continue; // cannot beat the current match
        fseek(in,0,SEEK_SET);
        int j;
        for(j=1;j<=len;++j){
            // EOF (-1) never equals a BOM byte, so short files simply fail to match.
            if(fgetc(in)!=types[i][j])break;
        }
        if(j>len) best=i;
    }
    fseek(in,static_cast<long>(types[best][0]),SEEK_SET);
    return best;
}
// Strips, in place, the double quotes that the system puts around a path
// containing spaces. A string that does not start with '"' is left untouched.
// The closing quote is removed only if it is actually present, so a lone '"'
// becomes the empty string (it used to write before the start of the buffer)
// and an unterminated "\"abc" keeps its last character.
void filenameformulize(char *str)
{
    if(NULL==str || str[0]!='"') return;
    std::size_t len=std::strlen(str);
    // len>=2 guarantees the closing quote is not the opening one.
    if(len>=2 && str[len-1]=='"') --len;
    // Shift the remaining characters over the opening quote.
    std::memmove(str,str+1,len-1);
    str[len-1]='\0';
}
int main(int argc, char *argv[]){
char *filein,*fileout,*sstr;
Reader *gr=NULL;
WordCounter *wc=NULL;
Reporter *grp=NULL;
FILE *in=NULL,*out=NULL;
int filetype;
switch(argc){
case 1:
filein=new char[FILENAMELENGTH];
fileout=new char[FILENAMELENGTH];
printf("字符统计\n输入文本文档地址:");
gets(filein);
printf("输出文本文档地址:");
gets(fileout);
break;
case 3:
filein=argv[1];
fileout=argv[2];
break;
default:
printf("字符统计. 参数: [输入文件 输出文件]\n");
system("pause");
return 0;
}
filenameformulize(filein);
in=fopen(filein,"rb");
filetype=getfiletype(in);
switch (filetype){
case 0: //ANSI
gr=new GB18030Reader(in);
break;
case 1: //UTF-8
gr=new UTF8Reader(in);
break;
case -1:
printf("无法打开%s\n",filein);
system("pause");
return -1;
break;
default:
printf("不支持的编码格式\n");
system("pause");
return filetype;
}
if(!(out=fopen(fileout,"w"))){
printf("无法打开%s\n",fileout);
system("pause");
return -1;
}
wc=new WordCounter(*gr);
putftmark(filetype,out);
switch (filetype){
case 0: //ANSI
fprintf(out,"字符统计:%s\n",filein);
grp=new GB18030Reporter();
break;
case 1: //UTF-8
grp=new UTF8Reporter();
break;
default:
printf("不支持的编码格式\n");
system("pause");
return filetype;
}
grp->report(*wc,out);
fclose(out);
if(3!=argc){ //如果带输入文件和输出文件两个参数,认作批量处理,执行完后就不打开记事本了
sstr=new char[FILENAMELENGTH + 15];
strcpy(sstr,"start notepad ");
strcat(sstr,fileout);
system(sstr);
delete []sstr;
}
delete[] filein;
delete[] fileout;
return 0;
}
// 使用 (usage)