一个字符统计的程序

//By Aba

#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <map>

#define FILENAMELENGTH 256
#define TYPES 6 // number of rows in `types`; must be updated by hand

using namespace std;

// One character of the input, stored as its raw encoded bytes packed
// big-endian into an integer (first byte in the most significant position).
typedef long long Character;

// Source of characters. read() returns -1 once the input is exhausted.
class Reader{
public:
    virtual ~Reader(){}
    virtual Character read()=0; // read one character from the file, -1 at end
};

// Read-only view of per-character occurrence counts.
class Counter{
public:
    virtual ~Counter(){}
    virtual int sum(Character ch) const=0;                  // occurrences of ch
    virtual int sum(Character ch1,Character ch2) const=0;   // occurrences of all characters in [ch1, ch2]
    virtual int sum() const=0;                              // occurrences of all characters
    virtual Character next() const=0;                       // next distinct character, -1 when exhausted
};

// Writes a report about a Counter to an output stream.
class Reporter{
public:
    virtual ~Reporter(){}
    virtual void report(const Counter& ct,FILE* out) const=0;
};

// Packs the bytes of a NUL-terminated string big-endian into a long.
// The arithmetic is done unsigned so that long inputs wrap instead of
// overflowing a signed type.
long wcode(const char* st){
    unsigned long c=0;
    for(int i=0;st[i]!='\0';i++){
        c<<=8;
        c+=static_cast<unsigned char>(st[i]);
    }
    return static_cast<long>(c);
}

// Writes the lowest `count` bytes of ch to out, most significant first.
static void putlowbytes(Character ch,int count,FILE* out){
    for(int shift=(count-1)*8;shift>=0;shift-=8){
        fputc(static_cast<int>((ch>>shift)&0xff),out);
    }
}

// Reads UTF-8 encoded characters from a FILE*.
// The reader closes the stream at end of input and in its destructor, so the
// caller must not fclose() it; the object must not be copied.
class UTF8Reader:public Reader{
private:
    FILE *in;    // input file
    bool isava;  // true while `in` is open and usable

    int eof(){
        close();
        return -1;
    }
public:
    UTF8Reader(FILE* in){
        this->in=in;
        isava=(NULL!=in); // always initialised, also for a NULL stream
    }
    ~UTF8Reader(){
        close();
    }
    void close(){
        if(isava){
            fclose(in);
            isava=false;
        }
    }
    // Returns the next character as its packed bytes, or -1 at end of input.
    // A sequence cut short by end of input is discarded.
    Character read(){
        if(!isava) return -1;
        int buf=fgetc(in); // lead byte
        if(EOF==buf) return eof();
        const int lead=buf;
        Character ch=buf;
        // Lead bytes 0xC0..0xF7 announce 1..3 continuation bytes, one per set
        // bit below the top bit. Bytes 0xF8..0xFF are not valid UTF-8 lead
        // bytes and are returned as single units; treating 0xFF as a lead
        // byte would build an 8-byte value that can collide with the -1
        // end-of-input sentinel.
        if(lead>=0xC0 && lead<0xF8){
            for(int mask=0x40;mask&lead;mask>>=1){
                buf=fgetc(in); // continuation byte
                if(EOF==buf) return eof();
                ch=(ch<<8)+buf;
            }
        }
        return ch;
    }
};

// Reports every counted character at or above 0x20 with its packed code (hex)
// and count, followed by the grand total.
class UTF8Reporter:public Reporter{
public:
    void report(const Counter& ct,FILE* out) const{
        Character wt;
        unsigned int totally=0;
        while((wt=ct.next())!=-1){
            if(wt<0x20) continue; // skip control characters
            // Emit the packed bytes, leaving out zero bytes.
            for(int i=56;i>=0;i-=8){
                const unsigned char buf=static_cast<unsigned char>((wt>>i)&0xff);
                if(buf) fputc(buf,out);
            }
            const int thistime=ct.sum(wt);
            // %llx replaces the MSVC-only %I64x.
            fprintf(out," (%llx): %d\n",static_cast<unsigned long long>(wt),thistime);
            totally+=thistime;
        }
        fprintf(out,"totally: %u\n",totally);
    }
};

// Reads GB18030 encoded characters (1, 2 or 4 bytes) from a FILE*.
// Same stream ownership rules as UTF8Reader.
class GB18030Reader:public Reader{
private:
    FILE *in;    // input file
    bool isava;  // true while `in` is open and usable

    int eof(){
        close();
        return -1;
    }
public:
    GB18030Reader(FILE* in){
        this->in=in;
        isava=(NULL!=in); // always initialised, also for a NULL stream
    }
    ~GB18030Reader(){
        close();
    }
    void close(){
        if(isava){
            fclose(in);
            isava=false;
        }
    }
    // Returns the next character as its packed bytes, or -1 at end of input.
    Character read(){
        if(!isava) return -1;
        int buf=fgetc(in); // first byte
        if(EOF==buf) return eof();
        if(buf<=0x80) return buf; // single-byte character
        unsigned int ch=buf;

        buf=fgetc(in); // second byte
        if(EOF==buf) return eof();
        ch=(ch<<8)+buf;

        // A second byte in 0x30..0x39 marks a four-byte character.
        // (The original compared against decimal 30 instead of 0x30.)
        if(buf>=0x30 && buf<=0x39){
            buf=fgetc(in); // third byte
            if(EOF==buf) return eof();
            ch=(ch<<8)+buf;

            buf=fgetc(in); // fourth byte
            if(EOF==buf) return eof();
            ch=(ch<<8)+buf;
        }
        return ch;
    }
};

// Reports every counted character at or above 0x20 with its packed code (hex)
// and count, then the number of distinct and total Chinese characters and the
// grand total.
class GB18030Reporter:public Reporter{
public:
    void report(const Counter& ct,FILE* out) const{
        Character wt;
        unsigned int hanzinum=0,hanzicount=0,totally=0;
        while((wt=ct.next())!=-1){
            if(wt<0x20) continue; // skip control characters
            const int thistime=ct.sum(wt);
            bool hanzi=false;
            if(wt<=0x80){
                putlowbytes(wt,1,out); // one byte
            }
            else if(wt>=0x81308130LL){
                putlowbytes(wt,4,out); // four-byte character
                hanzi=true;
            }
            else{
                putlowbytes(wt,2,out); // two bytes
                // Two-byte ranges this program counts as Chinese characters;
                // everything else (e.g. full-width punctuation) is not.
                hanzi=(wt>=0x8140 && wt<=0xa0fe) || (wt>=0xaa40 && wt<=0xfea0);
            }
            if(hanzi){
                ++hanzinum;
                hanzicount+=thistime;
            }
            totally+=thistime;
            // %llx replaces the MSVC-only %I64x.
            fprintf(out," (%llx): %d\n",static_cast<unsigned long long>(wt),thistime);
        }
        fprintf(out,"不同汉字的个数:%u\n总汉字数(不包括全角标点符号):%u\n",hanzinum,hanzicount);
        fprintf(out,"不包括控制字符的总字数:%u\n",totally);
    }
};

// Counts how often each character occurs in a Reader's input.
class WordCounter:public Counter{
public:
    // Drains `in` until it returns -1, counting every character.
    WordCounter(Reader& in):started(false),last(0){
        for(Character ch=in.read();ch!=-1;ch=in.read()){
            ++countmap[ch]; // a missing key starts at 0
        }
    }
    int sum(Character ch) const{
        const map<Character,int>::const_iterator citer=countmap.find(ch);
        return citer==countmap.end() ? 0 : citer->second;
    }
    int sum(Character ch1,Character ch2) const{
        // Start at the first key >= ch1. The original searched with an
        // uninitialised variable because its loop index shadowed the outer one.
        int sigma=0;
        for(map<Character,int>::const_iterator citer=countmap.lower_bound(ch1);
            citer!=countmap.end() && citer->first<=ch2;++citer){
            sigma+=citer->second;
        }
        return sigma;
    }
    int sum() const{
        int sigma=0;
        for(map<Character,int>::const_iterator citer=countmap.begin();citer!=countmap.end();++citer){
            sigma+=citer->second;
        }
        return sigma;
    }
    // Returns the distinct characters in ascending order, one per call, then
    // -1 on every further call. The cursor is kept per object as the last key
    // returned (not as a function-local static iterator), so each counter
    // iterates its own map and copies stay valid.
    Character next() const{
        const map<Character,int>::const_iterator citer=
            started ? countmap.upper_bound(last) : countmap.begin();
        if(citer==countmap.end()) return -1;
        started=true;
        last=citer->first;
        return last;
    }
private:
    map<Character,int> countmap;
    mutable bool started;    // whether next() has returned a key yet
    mutable Character last;  // last key returned by next()
};

// Byte-order-mark table: element 0 is the signature length, followed by the
// signature bytes. The row index is the file type code.
const unsigned char types[TYPES][6]={
    {0}                     //ANSI 0
    ,{3,0xef,0xbb,0xbf}     //UTF-8 1
    ,{2,0xfe,0xff}          //UTF-16 (big endian) 2
    ,{2,0xff,0xfe}          //UTF-16 (little endian) 3
    ,{4,0,0,0xfe,0xff}      //UTF-32 (big endian) 4 -- signature is 00 00 FE FF
    ,{4,0xff,0xfe,0,0}      //UTF-32 (little endian) 5
};

// Writes the byte order mark of file type t to out.
// Returns 0 on success, -1 if out is NULL or t is not a valid type code.
int putftmark(int t,FILE *out){
    if(NULL==out || t<0 || t>=TYPES) return -1;
    for(int i=1;i<=types[t][0];++i){
        fputc(types[t][i],out);
    }
    return 0;
}

// Detects the file type from the byte order mark at the start of `in`.
// Returns the type code (0 when no mark matches) and leaves the stream
// positioned just behind the mark, or returns -1 if `in` is NULL.
// The longest matching signature wins, so FF FE 00 00 is reported as
// UTF-32 LE rather than being shadowed by the UTF-16 LE prefix FF FE.
int getfiletype(FILE* in){
    if(NULL==in) return -1;
    int best=0;
    for(int i=1;i<TYPES;++i){
        fseek(in,0,SEEK_SET);
        int j;
        for(j=1;j<=types[i][0];++j){
            if(fgetc(in)!=types[i][j]) break;
        }
        if(j>types[i][0] && types[i][0]>types[best][0]) best=i;
    }
    fseek(in,types[best][0],SEEK_SET);
    return best;
}

// Removes the quotes the system puts around a path that contains spaces.
// A leading '"' is dropped, and a trailing '"' too if there is one.
void filenameformulize(char *str)
{
    if(NULL==str || str[0]!='"') return;
    size_t len=strlen(str);
    memmove(str,str+1,len); // shifts the terminator along with the text
    --len;
    if(len>0 && str[len-1]=='"') str[len-1]='\0';
}

// Reads one line from stdin into buf (at most size-1 characters) and strips
// the line ending. Leaves buf empty when nothing could be read.
static void readline(char *buf,int size){
    if(NULL==fgets(buf,size,stdin)){
        buf[0]='\0';
        return;
    }
    buf[strcspn(buf,"\r\n")]='\0';
}

// Usage: no arguments (interactive prompts) or <input file> <output file>.
int main(int argc, char *argv[]){
    // Stack buffers for the interactive case; nothing here is heap-allocated,
    // so there is no delete[] that could hit argv[] in the two-argument case.
    char inbuf[FILENAMELENGTH],outbuf[FILENAMELENGTH];
    char *filein,*fileout;

    switch(argc){
    case 1:
        printf("字符统计\n输入文本文档地址:");
        readline(inbuf,FILENAMELENGTH);  // bounded replacement for gets()
        printf("输出文本文档地址:");
        readline(outbuf,FILENAMELENGTH);
        filein=inbuf;
        fileout=outbuf;
        break;
    case 3:
        filein=argv[1];
        fileout=argv[2];
        break;
    default:
        printf("字符统计. 参数: [输入文件 输出文件]\n");
        system("pause");
        return 0;
    }

    filenameformulize(filein);
    filenameformulize(fileout); // the output path may be quoted as well

    FILE *in=fopen(filein,"rb");
    const int filetype=getfiletype(in);
    Reader *gr=NULL;
    switch (filetype){
    case 0: //ANSI
        gr=new GB18030Reader(in);
        break;
    case 1: //UTF-8
        gr=new UTF8Reader(in);
        break;
    case -1:
        printf("无法打开%s\n",filein);
        system("pause");
        return -1;
    default:
        printf("不支持的编码格式\n");
        fclose(in);
        system("pause");
        return filetype;
    }

    FILE *out=fopen(fileout,"w");
    if(!out){
        printf("无法打开%s\n",fileout);
        delete gr; // closes the input file
        system("pause");
        return -1;
    }

    const WordCounter wc(*gr);
    delete gr; // input fully consumed; closes the input file

    putftmark(filetype,out);
    // Only types 0 and 1 get past the switch above.
    if(0==filetype){ //ANSI
        fprintf(out,"字符统计:%s\n",filein);
        GB18030Reporter().report(wc,out);
    }
    else{ //UTF-8
        UTF8Reporter().report(wc,out);
    }
    fclose(out);

    // With input and output given as arguments the run is treated as batch
    // processing and Notepad is not opened afterwards.
    if(3!=argc){
        // fileout is at most FILENAMELENGTH-1 characters here (interactive
        // case only), so the buffer cannot overflow.
        // NOTE(review): the path is interpolated into a shell command; it is
        // quoted so that spaces work, but it is otherwise passed unescaped.
        char sstr[FILENAMELENGTH+32];
        sprintf(sstr,"start notepad \"%s\"",fileout);
        system(sstr);
    }
    return 0;
}

使用

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值