/*//
文件名: h2t.c v0.2
作者: 苏晓(suxiaojack)
日期: 2008.7
用途: 转换HTML内容为TXT文本
许可 ( License ): GPL
v0.2
处理Bug
1、修正无法识别&#数字;问题 UNICODE=>GB2312
2、添加&copy;和&reg;处理
3、修正&处理死循环.
v0.1
//*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <locale.h>
UnicodeToGB2312(char* pOut,unsigned short uData)
{
WideCharToMultiByte(CP_ACP,NULL,&uData,1,pOut,sizeof(WCHAR),NULL,NULL);
return;
}
/* Size in bytes of the per-read scratch buffer (2 MiB). Parenthesised so the
   macro expands correctly inside larger expressions such as x / BUFSIZE. */
#define BUFSIZE (1024*1024*2)
/* The whole input file, NUL-terminated (capacity 20 MiB). */
char buf[1024*1024*20];
/* Lower-cased copy of buf, searched by findstart()/findend(). */
char shadowbuf[1024*1024*20];
/* Scratch buffer filled by each fgets() call in read2buf(). */
char buffer[BUFSIZE];
/* Number of bytes stored in buf, not counting the terminating NUL. */
long size;
/* Line-ending style seen in the input:
   0 = none seen yet, 1 = CR, 2 = CR LF, 3 = LF. */
int type=0;
/* Number of strings in toc (entity/replacement pairs, so always even). */
#define tocsize 14
/*
 * HTML entity translation table: even indices hold the entity as it appears
 * in the input, the following odd index holds the text printed in its place.
 * Only the most common entities are listed.
 * The Windows console cannot print the copyright / registered-trademark
 * symbols, so those two are replaced by descriptive text.
 */
char* toc[tocsize]={"&nbsp;"," ","&lt;","<","&gt;",">","&quot;","\"","&amp;","&","&copy;","◎版权","&reg;","◎注册"};
/*
 * Print the command-line help text to stdout.
 *
 * argv - the program's argument vector; only argv[0] (the program name) is
 *        read, to fill in the synopsis line.
 */
void usage(char** argv)
{
    /* Each line ends with a real newline escape ("\n"). */
    char *us="用来转换html =>txt. ver0.2\n"
             "suxiaojack写于2008.7\n";
    char *ue="tstart_in_tag_text:开始的Tag标记中的特征文字,好理解end_in_tag_text了。\n"
             "jump_num:跳过几次开始找到的,默认为0.\n"
             "注意不支持正则式!未曾处理水印文字。\n";

    printf("%s",us);
    printf("使用方法:%s <file> [ <start_in_tag_text> [jump_num] <end_in_tag_text> ] \n",argv[0]);
    printf("%s",ue);
}
/*
 * Bounded strstr: look for f inside the first 15 bytes of s only.
 *
 * s - NUL-terminated text to inspect (only its first 15 bytes are considered).
 * f - NUL-terminated needle.
 *
 * Returns the offset of the first match within that window, or -1 when f
 * does not occur there. Callers test for 0, meaning "s starts with f".
 */
int ministrstr(const char* s,const char* f)
{
    char minibuf[16];
    const char* hit;
    size_t n = 0;

    /* Stop at the terminator so a short string is never read past its end. */
    while (n < sizeof(minibuf) - 1 && s[n] != '\0') {
        minibuf[n] = s[n];
        n++;
    }
    minibuf[n] = '\0';

    hit = strstr(minibuf, f);
    if (hit == NULL) {
        return -1;      /* subtracting from a NULL pointer would be undefined */
    }
    return (int)(hit - minibuf);
}
/*
 * Case-insensitive variant of the bounded strstr: the first 15 bytes of s
 * are copied, ASCII upper-case letters in the copy are lowered, and f is
 * searched for in that copy. f itself is used as given, so callers pass it
 * already in lower case.
 *
 * Returns the offset of the first match within the window, or -1 when f does
 * not occur there. Callers test for 0, meaning "s starts with f".
 */
int ministrstri(const char* s,const char *f)
{
    char minibuf[16];
    const char* hit;
    size_t n = 0;

    /* Copy at most 15 bytes, never reading past the terminator of s. */
    while (n < sizeof(minibuf) - 1 && s[n] != '\0') {
        char c = s[n];
        /* Only A-Z is folded; bytes of multi-byte characters stay untouched. */
        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        }
        minibuf[n] = c;
        n++;
    }
    minibuf[n] = '\0';

    hit = strstr(minibuf, f);
    if (hit == NULL) {
        return -1;      /* subtracting from a NULL pointer would be undefined */
    }
    return (int)(hit - minibuf);
}
/*
 * Translate a named HTML entity (such as the ones listed in toc) found at the
 * start of streamstart.
 *
 * The toc table is walked pair by pair; for the first entity that
 * streamstart begins with, the replacement text is printed and the entity's
 * length is returned. When nothing matches, a literal "&" is printed and 1 is
 * returned so the caller skips just the ampersand.
 *
 * Returns the number of input bytes the caller should advance by.
 */
int isintoc(char* streamstart)
{
    int consumed = 0;
    int idx;

    for (idx = 0; idx < tocsize; idx += 2) {
        if (ministrstr(streamstart, toc[idx]) != 0) {
            continue;               /* input does not start with this entity */
        }
        printf("%s", toc[idx + 1]);
        consumed = strlen(toc[idx]);
        break;
    }

    if (consumed != 0) {
        return consumed;
    }

    /* No translation happened: emit the ampersand unchanged. */
    printf("&");
    return 1;
}
/*
 * Translate a decimal numeric character reference and print the result.
 *
 * numstart - points at the first character after "&#" (the caller has
 *            already skipped those two bytes without printing them).
 *
 * The digits are read as a code unit, converted with UnicodeToGB2312 and
 * printed. A trailing ';' is consumed only when it is actually present.
 * If no digit follows "&#" (for example a hexadecimal reference), the "&#"
 * is printed back literally and nothing is consumed, so the text passes
 * through unchanged. Values above 0xFFFF do not fit in the single code unit
 * UnicodeToGB2312 accepts and are printed as '?'.
 *
 * Returns the number of bytes consumed starting at numstart.
 */
int num2txt(char* numstart)
{
    char* s = numstart;
    unsigned long value = 0;
    char os[3];

    while (*s >= '0' && *s <= '9') {
        /* Stop accumulating once out of range; this also rules out overflow
           on arbitrarily long digit runs. */
        if (value <= 0xFFFFUL) {
            value = value * 10 + (unsigned long)(*s - '0');
        }
        s++;
    }

    if (s == numstart) {
        printf("&#");
        return 0;
    }

    if (value > 0xFFFFUL) {
        value = '?';
    }

    /* Zero the buffer first: the conversion does not NUL-terminate. */
    memset(os, 0, sizeof(os));
    UnicodeToGB2312(os, (unsigned short)value);
    printf("%s", os);

    if (*s == ';') {
        s++;
    }
    return (int)(s - numstart);
}
/*
 * Load the whole stream into the global buf and build its lower-cased twin.
 *
 * fp - stream opened for reading; read until fgets() reports end of file
 *      or an error.
 *
 * On return buf holds the NUL-terminated text, size holds its length, and
 * shadowbuf holds the same text with ASCII letters lower-cased. Input that
 * does not fit in buf is truncated rather than written past the array.
 */
void read2buf(FILE* fp)
{
    const size_t capacity = sizeof(buf) - 1;    /* keep room for the NUL */
    size_t used = 0;
    size_t i;

    buf[0] = 0;

    /* Test fgets() itself: looping on feof() would append the final chunk
       twice, because feof() only turns true after a read has failed. */
    while (used < capacity && fgets(buffer, sizeof(buffer), fp) != NULL) {
        size_t chunk = strlen(buffer);
        if (chunk > capacity - used) {
            chunk = capacity - used;
        }
        memcpy(buf + used, buffer, chunk);
        used += chunk;
    }
    buf[used] = 0;
    size = (long)used;

    memcpy(shadowbuf, buf, used + 1);
    /* Fold only A-Z so bytes of multi-byte characters are left alone. */
    for (i = 0; i < used; i++) {
        char c = shadowbuf[i];
        if (c >= 'A' && c <= 'Z') {
            shadowbuf[i] = (char)(c - 'A' + 'a');
        }
    }
}
/*
 * Locate the tag that contains the text `start` and return the index of the
 * first character after that tag's closing '>'.
 *
 * start - text to look for inside shadowbuf; it is lower-cased IN PLACE
 *         (ASCII letters only) because shadowbuf is lower case.
 * jump  - number of earlier occurrences to skip (0 = use the first one);
 *         negative values are treated as 0.
 *
 * Returns an index into shadowbuf/buf, or -1 when the text (or the '>'
 * following it) cannot be found.
 */
int findstart(char* start,int jump)
{
    char* pos = shadowbuf;
    char* c;

    for (c = start; *c != '\0'; c++) {
        if (*c >= 'A' && *c <= 'Z') {
            *c = (char)(*c - 'A' + 'a');
        }
    }
    if (jump < 0) {
        jump = 0;
    }

    do {
        pos = strstr(pos, start);
        /* Not found, or an empty needle sitting on the terminator. */
        if (pos == NULL || *pos == '\0') {
            return -1;
        }
        pos++;
    } while (jump--);

    /* Walk to the end of the enclosing tag without leaving the buffer. */
    while (*pos != '\0' && *pos != '>') {
        pos++;
    }
    if (*pos == '\0') {
        return -1;
    }
    return (int)(pos + 1 - shadowbuf);
}
/*
 * Locate the tag that contains the text `end`, searching from index `start`,
 * and return the index of the last character BEFORE that tag's opening '<'.
 *
 * end   - text to look for inside shadowbuf; it is lower-cased IN PLACE
 *         (ASCII letters only) because shadowbuf is lower case.
 * start - index in shadowbuf at which the search begins.
 *
 * The returned index is inclusive: it names the last content character, not
 * one past it. Returns -1 when the text is not found or no '<' precedes it.
 */
int findend(char* end,int start)
{
    char* hit;
    char* c;
    long idx;

    for (c = end; *c != '\0'; c++) {
        if (*c >= 'A' && *c <= 'Z') {
            *c = (char)(*c - 'A' + 'a');
        }
    }
    if (start < 0) {
        return -1;
    }

    hit = strstr(shadowbuf + start, end);
    if (hit == NULL) {
        return -1;
    }

    /* Scan backwards by index so the search cannot run off the front of
       the buffer when no '<' precedes the match. */
    idx = (long)(hit - shadowbuf);
    while (idx >= 0 && shadowbuf[idx] != '<') {
        idx--;
    }
    if (idx < 0) {
        return -1;
    }
    return (int)(idx - 1);
}
/*
 * Emit one line break in the style recorded in the global `type`:
 * 1 = CR, 2 = CR LF, 3 = LF. Nothing is printed for any other value
 * (including 0, meaning no line ending has been seen in the input yet).
 */
void printline()
{
    switch(type)
    {
    case 1:
        printf("%c",'\r');
        break;
    case 2:
        printf("%s","\r\n");
        break;
    case 3:
        printf("%c",'\n');
        break;
    default:
        break;
    }
}
/*
 * Skip an element whose content must not be printed (script / style).
 *
 * p         - points at the '<' of the opening tag.
 * end       - one past the last byte that may be processed.
 * close_tag - lower-case prefix of the closing tag, e.g. "</script".
 *
 * Returns a pointer just past the '>' of the closing tag, or `end` when the
 * input runs out first.
 */
static char* h2t_skip_raw(char* p,char* end,char* close_tag)
{
    p++;                                    /* step over the opening '<' */
    for (;;) {
        while (p < end && *p != '<') {
            p++;
        }
        if (p >= end) {
            return end;
        }
        if (ministrstri(p, close_tag) == 0) {
            break;                          /* standing on the closing tag */
        }
        p++;                                /* some other '<': keep looking */
    }
    while (p < end && *p != '>') {
        p++;
    }
    if (p < end) {
        p++;
    }
    return p;
}

/*
 * Convert `len` bytes of HTML starting at `s` to plain text on stdout.
 *
 * - Text outside tags is printed as is.
 * - "&#NNN;" references go through num2txt, other "&..." text through
 *   isintoc.
 * - <script> and <style> elements are dropped together with their content.
 * - "</br>", "</p>" and "<br>" produce a line break via printline(); every
 *   other tag is removed.
 * - The first CR or LF met in the input sets the global `type`, which
 *   printline() then uses as the line-ending style.
 */
void h2t(char* s,int len)
{
    char* ss = s;
    char* end = s + len;

    while (ss < end) {
        /* Remember which line-ending convention the document uses. */
        if (type == 0 && (*ss == '\r' || *ss == '\n')) {
            if (*ss == '\n') {
                type = 3;
            } else if (ss + 1 < end && ss[1] == '\n') {
                type = 2;
            } else {
                type = 1;
            }
        }

        if (*ss != '<') {
            /* Plain text or an entity. */
            if (*ss != '&') {
                printf("%c", *ss);
                ss++;
            } else if (ss + 1 < end && ss[1] == '#') {
                ss += 2;
                ss += num2txt(ss);
            } else {
                int may = isintoc(ss);
                /* Always move forward, even if nothing was consumed. */
                ss += (may > 0) ? may : 1;
            }
            continue;
        }

        /* Standing on a '<': decide what kind of tag this is. */
        if (ministrstri(ss, "<script") == 0) {
            ss = h2t_skip_raw(ss, end, "</script");
        } else if (ministrstri(ss, "<style") == 0) {
            ss = h2t_skip_raw(ss, end, "</style");
        } else if (ministrstri(ss, "</br>") == 0) {
            printline();
            ss += 5;
        } else if (ministrstri(ss, "</p>") == 0) {
            printline();
            ss += 4;
        } else if (ministrstri(ss, "<br>") == 0) {
            printline();
            ss += 4;
        } else {
            /* Ordinary tag: discard everything up to and including '>'. */
            while (ss < end && *ss != '>') {
                ss++;
            }
            if (ss < end) {
                ss++;
            }
        }
    }
}
/*
 * Entry point.
 *
 *   h2t <file>                              convert the whole file
 *   h2t <file> <start_text> <end_text>      convert between two tags
 *   h2t <file> <start_text> <jump> <end_text>
 *                                           same, skipping <jump> earlier
 *                                           occurrences of start_text
 *
 * Returns 0 after converting or after printing the usage text, and 1 when
 * one of the marker texts cannot be found.
 */
int main(int argc,char *argv[])
{
    FILE* fp = NULL;
    char* start_text = NULL;
    char* end_text = NULL;
    int jump = 0;
    int start;
    int end;

    if (argc == 4) {
        start_text = argv[2];
        end_text = argv[3];
    } else if (argc == 5) {
        start_text = argv[2];
        jump = atoi(argv[3]);
        if (jump < 0) {
            jump = 0;               /* a negative skip count is meaningless */
        }
        end_text = argv[4];
    } else if (argc != 2) {
        usage(argv);
        return 0;
    }

    fp = fopen(argv[1], "r");
    if (!fp) {
        usage(argv);
        return 0;
    }
    read2buf(fp);
    /* Everything is in memory now, so the file can be closed on every path. */
    fclose(fp);

    if (start_text == NULL) {
        h2t(buf, size);
        return 0;
    }

    start = findstart(start_text, jump);
    if (start < 0) {
        printf("can't find:%s\n", start_text);
        return 1;
    }
    end = findend(end_text, start);
    if (end < 0) {
        printf("can't find:%s\n", end_text);
        return 1;
    }

    /* findend() returns the index of the last content character (inclusive),
       so the length is end - start + 1; using end - start would drop the
       final character. */
    if (end >= start) {
        h2t(buf + start, end - start + 1);
    } else {
        usage(argv);
    }
    return 0;
}
/*
与noblank联合使用
h2t filename.htm |noblank >out.txt
*/
/* html2txt h2t.c */
/* 最新推荐文章于 2021-07-03 01:53:36 发布 */