最近在做一个类似垂直下载的爬虫系统。下载之后有个解析模块,解析之后要求以统一的编码向后传入索引,于是便遇到了编码转换问题。
1. 编码的识别
推荐使用 libchardet, 可以在这个页面下载,使用说明就算了,直接读头文件吧。
这是一个简单的示例,嘿嘿。
//#include "chardetect.h"
//char out_encode[CHARDET_MAX_ENCODING_NAME]
char* EncodeUtil::GetLocalEncoding(constchar* in_str, unsignedintstr_len,char* out_encode){
chardet_t chardect=NULL;
if(chardet_create(&chardect)==CHARDET_RESULT_OK){
if(chardet_handle_data(chardect, in_str, (unsignedint)str_len) == CHARDET_RESULT_OK)
if(chardet_data_end(chardect) == CHARDET_RESULT_OK)
chardet_get_charset(chardect, out_encode, CHARDET_MAX_ENCODING_NAME);
}
if(chardect)
chardet_destroy(chardect);
returnout_encode;
}
//#include "chardetect.h"
//char out_encode[CHARDET_MAX_ENCODING_NAME]
char * EncodeUtil::GetLocalEncoding(const char* in_str, unsigned int str_len, char* out_encode){
chardet_t chardect=NULL;
if(chardet_create(&chardect)==CHARDET_RESULT_OK){
if(chardet_handle_data(chardect, in_str, (unsigned int)str_len) == CHARDET_RESULT_OK)
if(chardet_data_end(chardect) == CHARDET_RESULT_OK)
chardet_get_charset(chardect, out_encode, CHARDET_MAX_ENCODING_NAME);
}
if(chardect)
chardet_destroy(chardect);
return out_encode;
}
2.编码的转换
编码转换,当然要知道源编码和目的编码了。源编码可以使用第 1 节的方法获取,当然你必须相信它能检测出大部分的编码。对于检测不出来的,你就根据应用来尝试几种编码吧。比如对于我的应用,当检测不到 html 源码的编码时,绝大部分情况下,按 UTF-8 尝试是正确的(这是事先用浏览器验证过的,嘿嘿)。这里呢,有一个问题,就是 Linux 系统自带的 iconv 并不能完美地转码。特别是我发现很多被识别为 UTF-8 的内容竟然用 iconv 转 GB 编码失败,于是乎,找到了 iconv 的升级版 ==> http://www.gnu.org/software/libiconv/ 。对,就是它!
下载、编译、安装、使用,嘿嘿,这个有文档,你慢慢看吧,哈哈。 下面是一个比较好用的使用示例,相信够用了。呵呵
#ifndef ICONV_CONST
# define ICONV_CONST const
#endif
intEncodeUtil::charsetConvert(constchar*from_charset,constchar*to_charset,constchar*src,constintsrclen,char* save,intsavelen) {
if(save==NULL||srclen == 0) {
return-1;
}
save[0] = 0;
if(strcmp(from_charset, to_charset) == 0) {
if(savelen<=srclen)
strncat(save, src, savelen);
else
strncat(save, src, srclen);
returnsavelen>srclen ? srclen : savelen;
}
//convert
iconv_t cd;
intstatus = 0;//result
char*outbuf = save;//iconv outptr begin
ICONV_CONSTchar* inptr = src;
char* outptr = outbuf;
size_tinsize = srclen;
size_toutsize = savelen;
cd = iconv_open(to_charset, from_charset);
if((iconv_t)(-1) == cd){
return-1;
}
iconv(cd, NULL, NULL, NULL, NULL);
while(insize > 0) {
size_tres = iconv(cd, (ICONV_CONSTchar**) &inptr, &insize, &outptr,&outsize);
if(outptr != outbuf) {
outbuf=outptr;
*outbuf=0;
}
if(res == (size_t) (-1)) {
if(errno == EILSEQ) {
intone = 1;
iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &one);
status = -3;
}elseif(errno == EINVAL) {
if(srclen == 0) {
status = -4;
gotodone;
}else{
break;
}
}elseif(errno == E2BIG) {
status = -5;
gotodone;
}else{
status = -6;
gotodone;
}
}
}
status = strlen(save);// === outbuf - save ;
done:
iconv_close(cd);
returnstatus;
}
#ifndef ICONV_CONST
# define ICONV_CONST const
#endif
/**
 * Convert a byte buffer from one charset to another with iconv.
 *
 * The result written to `save` is always NUL-terminated and never exceeds
 * `savelen` bytes including the terminator.
 *
 * @param from_charset  Name of the source encoding.
 * @param to_charset    Name of the target encoding.
 * @param src           Input bytes.
 * @param srclen        Number of input bytes; must be > 0.
 * @param save          Output buffer.
 * @param savelen       Size of `save` in bytes; must be > 0. One byte is
 *                      reserved for the terminating NUL.
 * @return Number of bytes stored in `save` (terminator excluded) on success;
 *         -1 for invalid arguments or when iconv_open fails;
 *         -5 when the output buffer is too small (E2BIG) - `save` then holds
 *            the part converted so far;
 *         -6 for any other iconv error.
 *         An incomplete multibyte sequence at the end of the input (EINVAL)
 *         stops the conversion and the length converted so far is returned.
 *         On EILSEQ, ICONV_SET_DISCARD_ILSEQ is enabled via iconvctl and the
 *         conversion is retried; this is not reported as an error.
 */
int EncodeUtil::charsetConvert(const char *from_charset, const char *to_charset, const char *src, const int srclen, char* save, int savelen) {
    if (save == NULL || src == NULL || from_charset == NULL || to_charset == NULL
            || srclen <= 0 || savelen <= 0) {
        return -1;
    }
    save[0] = 0;
    // Usable payload size: the last byte is kept for the NUL terminator.
    const int capacity = savelen - 1;

    // Identical charsets: plain (possibly truncated) copy.
    if (strcmp(from_charset, to_charset) == 0) {
        // strncat writes up to n characters PLUS a NUL, so n must not exceed capacity.
        const int copylen = srclen < capacity ? srclen : capacity;
        strncat(save, src, copylen);
        return copylen;
    }

    iconv_t cd = iconv_open(to_charset, from_charset);
    if ((iconv_t)(-1) == cd) {
        return -1;
    }

    int status = 0;
    // iconv's input pointer may be declared without const depending on the
    // build; cast explicitly instead of relying on an implicit conversion.
    ICONV_CONST char* inptr = const_cast<ICONV_CONST char*>(src);
    char* outptr = save;
    size_t insize = srclen;
    size_t outsize = capacity;
    bool discard_enabled = false;

    // All-NULL buffers: documented iconv call that resets the conversion state.
    iconv(cd, NULL, NULL, NULL, NULL);
    while (insize > 0) {
        const size_t res = iconv(cd, (ICONV_CONST char**) &inptr, &insize, &outptr, &outsize);
        const int err = errno;  // capture before any other call can overwrite it
        *outptr = 0;            // in bounds: outsize started at savelen - 1
        if (res != (size_t)(-1)) {
            continue;
        }
        if (err == EILSEQ) {
            if (!discard_enabled) {
                // Ask iconv to drop invalid sequences and retry from the same position.
                int one = 1;
                iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &one);
                discard_enabled = true;
            } else {
                // Still failing after discard was requested: step over one
                // byte manually so the loop is guaranteed to terminate.
                ++inptr;
                --insize;
            }
            continue;
        }
        if (err == EINVAL) {
            // Incomplete multibyte sequence at the end of input: keep what we have.
            break;
        }
        status = (err == E2BIG) ? -5 : -6;
        break;
    }
    if (status == 0) {
        // Bytes actually produced; unlike strlen this is exact even if the
        // converted text contains a NUL byte.
        status = static_cast<int>(outptr - save);
    }
    iconv_close(cd);
    return status;
}
需要唠叨一句:不知道为什么,这么底层的接口,src 参数竟然不用 const 修饰,所以我传进来的 const char *src 在函数中实际是有警告的。无奈,忽略吧(也可以用 const_cast 显式转换来消除这个警告)。
3.OK ,结束了。哈哈。贴上一个多余的玩意儿,就是我的系统实际访问的接口。
intEncodeUtil::ConvertToGb(constchar* in_str,intstr_len,char* out_str,intout_str_len){
charencode[CHARDET_MAX_ENCODING_NAME];
encode[0]=0;
GetLocalEncoding(in_str,str_len,encode);
if(encode[0]==0){
//I'll try UTF-8 ,If you think it dosn't matter about undetect encode ,return -1 is ok
sprintf(encode,"%s","UTF-8");
//return -1;
}
returncharsetConvert(encode,"GB18030",in_str,str_len,out_str,out_str_len);
}
int EncodeUtil::ConvertToGb(const char* in_str, int str_len,char* out_str, int out_str_len){
char encode[CHARDET_MAX_ENCODING_NAME];
encode[0]=0;
GetLocalEncoding(in_str,str_len,encode);
if(encode[0]==0){
//I'll try UTF-8 ,If you think it dosn't matter about undetect encode ,return -1 is ok
sprintf(encode,"%s","UTF-8");
//return -1;
}
return charsetConvert(encode,"GB18030",in_str,str_len,out_str,out_str_len);
}