/*
 * Detecting and converting text that mixes UTF-8 and GBK encodings.
 *
 * Source: http://bbs.chinaunix.net/thread-971041-1-1.html
 */


#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <unistd.h>

/*
 * Convert a byte string from one character encoding to another with iconv.
 *
 * Parameters:
 *   from    iconv name of the source encoding.
 *   to      iconv name of the destination encoding.
 *   input   bytes to convert.
 *   inlen   number of bytes in input; 0 means "use strlen(input)".
 *   output  receives a malloc()ed, NUL-terminated buffer holding the
 *           converted bytes (the caller must free() it), or NULL on failure.
 *   outlen  receives the number of converted bytes (terminator excluded),
 *           or -1 on failure.
 *
 * Returns the number of input bytes that iconv consumed.  If iconv stops
 * early (for example on an invalid sequence) the bytes converted so far are
 * still returned in *output and the return value is smaller than inlen.
 * Returns -1 when the arguments are invalid, the input is too large, the
 * converter cannot be opened, or memory cannot be allocated.
 */
static int charconv(char *from, char *to,
                    const char *input, int inlen, char **output, int *outlen)
{
    iconv_t cd;
    char *inbuf;
    char *outbuf;
    size_t inleft;
    size_t outleft;
    size_t capacity;
    size_t result;

    *output = NULL;
    *outlen = -1;

    if (input == NULL || inlen < 0)
        return -1;

    if (inlen == 0) {
        size_t len = strlen(input);

        if (len > (size_t)(INT_MAX / 4))
            return -1;
        inlen = (int)len;
    }

    /* The output budget is 4 bytes per input byte and must fit in an int. */
    if (inlen > INT_MAX / 4)
        return -1;
    capacity = (size_t)inlen * 4;

    cd = iconv_open(to, from);
    if (cd == (iconv_t)(-1))
        return -1;

    /* One extra byte so the terminator always fits, even for empty input. */
    outbuf = malloc(capacity + 1);
    if (outbuf == NULL) {
        iconv_close(cd);
        return -1;
    }
    *output = outbuf;

    inbuf = (char *)input;
    inleft = (size_t)inlen;
    outleft = capacity;

    /* Some iconv implementations declare the input pointer as const char **. */
#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)
    result = iconv(cd, &inbuf, &inleft, &outbuf, &outleft);
#else
    result =
        iconv(cd, (const char **) &inbuf, &inleft, &outbuf, &outleft);
#endif

    /* Emit any pending shift sequence required by a stateful target encoding. */
    if (result != (size_t)(-1))
        (void)iconv(cd, NULL, NULL, &outbuf, &outleft);

    iconv_close(cd);

    *outlen = (int)(capacity - outleft);
    (*output)[*outlen] = '\0';
    return (int)((size_t)inlen - inleft);
}

/*
 * Check whether the buffer starts with a well-formed multi-byte UTF-8
 * sequence.
 *
 * Parameters:
 *   s   bytes to inspect.
 *   ns  number of bytes available at s.
 *
 * Returns the length of the sequence (2, 3 or 4) when s begins with a valid
 * lead byte followed by the required continuation bytes, all within ns.
 * Returns 0 otherwise: for ASCII bytes, stray continuation bytes, truncated
 * sequences, overlong forms, surrogates and values above U+10FFFF.
 */
int isutf8(char *s, size_t ns)
{
    const uint8_t *p = (const uint8_t *)s;
    uint8_t lead;
    uint8_t second_lo = 0x80;
    uint8_t second_hi = 0xbf;
    size_t len;
    size_t k;

    if (s == NULL || ns == 0)
        return 0;

    lead = p[0];
    if (lead >= 0xc2 && lead <= 0xdf) {
        /* 0xC0 and 0xC1 would only encode overlong forms, so they are excluded. */
        len = 2;
    } else if (lead >= 0xe0 && lead <= 0xef) {
        len = 3;
        if (lead == 0xe0)
            second_lo = 0xa0;   /* reject overlong 3-byte forms */
        else if (lead == 0xed)
            second_hi = 0x9f;   /* reject UTF-16 surrogates */
    } else if (lead >= 0xf0 && lead <= 0xf4) {
        len = 4;
        if (lead == 0xf0)
            second_lo = 0x90;   /* reject overlong 4-byte forms */
        else if (lead == 0xf4)
            second_hi = 0x8f;   /* reject code points above U+10FFFF */
    } else {
        return 0;
    }

    if (ns < len)
        return 0;

    if (p[1] < second_lo || p[1] > second_hi)
        return 0;

    /* Remaining bytes must be plain continuation bytes (10xxxxxx). */
    for (k = 2; k < len; k++) {
        if ((p[k] & 0xc0) != 0x80)
            return 0;
    }

    return (int)len;
}

/*
 * Check whether the buffer starts with a two-byte GBK character.
 *
 * Parameters:
 *   s   bytes to inspect.
 *   ns  number of bytes available at s.
 *
 * Returns 1 when at least two bytes are available, the first is a GBK lead
 * byte (0x81-0xFE) and the second is a GBK trail byte (0x40-0x7E or
 * 0x80-0xFE); returns 0 otherwise.
 */
int isgbk(char *s, size_t ns)
{
    uint8_t lead;
    uint8_t trail;

    if (s == NULL || ns < 2)
        return 0;

    lead = (uint8_t)s[0];
    trail = (uint8_t)s[1];

    if (lead < 0x81 || lead > 0xfe)
        return 0;

    /* 0x7F is the single hole in the otherwise contiguous trail range. */
    if (trail < 0x40 || trail > 0xfe || trail == 0x7f)
        return 0;

    return 1;
}

/*
 * Normalise a buffer that mixes two encodings into a single encoding.
 *
 * The input is scanned from left to right.  At each position:
 *   - if isutf8() recognises a multi-byte sequence, it is copied unchanged;
 *   - otherwise, if isgbk() recognises a two-byte character, that pair is
 *     converted with iconv from codefrom to codeto;
 *   - otherwise the single byte is copied unchanged.
 * A pair that iconv rejects is not dropped: its first byte is copied
 * unchanged and scanning resumes at the following byte.
 *
 * Parameters:
 *   src       input bytes.
 *   nsrc      number of input bytes.
 *   dst       pointer to the caller-owned output buffer; the pointer itself
 *             is only read, never reassigned.
 *   ndst      pointer to the capacity of *dst in bytes; only read.
 *   codefrom  iconv name of the encoding used for the converted pairs.
 *   codeto    iconv name of the target encoding.
 *
 * Output stops as soon as the next item would not fit in the remaining
 * capacity.  A terminating NUL is written after the output when at least one
 * byte of capacity is left.  If the converter cannot be opened, the output is
 * set to an empty string and nothing is converted.
 */
void convert(char *src, size_t nsrc, char **dst, int *ndst,
             const char *codefrom, const char *codeto)
{
    char *in;
    char *out;
    size_t inleft;
    size_t outleft;
    iconv_t handler;

    if (src == NULL || dst == NULL || *dst == NULL
        || ndst == NULL || *ndst <= 0)
        return;

    in = src;
    out = *dst;
    inleft = nsrc;
    outleft = (size_t)*ndst;

    handler = iconv_open(codeto, codefrom);
    if (handler == (iconv_t)(-1)) {
        out[0] = '\0';
        return;
    }

    while (inleft > 0 && outleft > 0) {
        size_t seqlen = (size_t)isutf8(in, inleft);

        if (seqlen > 0) {
            /* Already in the target form: pass the whole sequence through. */
            if (seqlen > outleft)
                break;
            memcpy(out, in, seqlen);
            in += seqlen;
            out += seqlen;
            inleft -= seqlen;
            outleft -= seqlen;
            continue;
        }

        if (isgbk(in, inleft)) {
            /* Convert from a private copy so iconv cannot move `in`. */
            char pair[2];
            char *p = pair;
            size_t plen = sizeof pair;

            pair[0] = in[0];
            pair[1] = in[1];
            if (iconv(handler, &p, &plen, &out, &outleft) != (size_t)(-1)) {
                in += 2;
                inleft -= 2;
                continue;
            }
            if (errno == E2BIG)
                break;          /* no room left for the converted character */

            /* Invalid pair: reset the converter and fall back to a raw copy. */
            (void)iconv(handler, NULL, NULL, NULL, NULL);
        }

        *out++ = *in++;
        inleft--;
        outleft--;
    }

    if (outleft > 0)
        *out = '\0';

    iconv_close(handler);
}

#ifdef _DEBUG_UTF8_FILE
/*
 * Debug driver: for every file named on the command line, read its whole
 * contents, run convert() on them with "gbk" as the source encoding and
 * "utf8" as the target, and print the result followed by a newline on stdout.
 *
 * Files that cannot be opened, inspected, allocated for, or read are
 * reported on stderr and skipped; empty files are skipped silently.
 * Returns -1 when no file argument is given, 0 otherwise.
 */
int main(int argc, char **argv)
{
    int i;

    if (argc < 2) {
        fprintf(stderr, "Usage:%s file ...\n", argv[0]);
        return -1;
    }

    /* argv[argc] is NULL, so the loop must stop before it. */
    for (i = 1; i < argc; i++) {
        struct stat st;
        FILE *fp;
        char *inbuffer;
        char *outbuffer;
        size_t size;
        size_t nread = 0;
        size_t got;
        int nout;

        fp = fopen(argv[i], "r");
        if (fp == NULL) {
            fprintf(stderr, "open %s failed, %s\n", argv[i], strerror(errno));
            continue;
        }

        if (stat(argv[i], &st) != 0) {
            fprintf(stderr, "stat %s failed, %s\n", argv[i], strerror(errno));
            fclose(fp);
            continue;
        }
        if (st.st_size <= 0) {
            fclose(fp);
            continue;
        }
        /* convert() takes the output capacity as an int. */
        if (st.st_size > INT_MAX / 4) {
            fprintf(stderr, "%s is too large to convert\n", argv[i]);
            fclose(fp);
            continue;
        }

        size = (size_t)st.st_size;
        inbuffer = calloc(1, size + 1);
        outbuffer = calloc(1, size * 4 + 1);
        if (inbuffer == NULL || outbuffer == NULL) {
            fprintf(stderr, "out of memory while reading %s\n", argv[i]);
            free(inbuffer);
            free(outbuffer);
            fclose(fp);
            continue;
        }
        nout = (int)(size * 4);

        /* Never ask fread for more than the space still free in inbuffer. */
        while (nread < size
               && (got = fread(inbuffer + nread, 1, size - nread, fp)) > 0) {
            nread += got;
        }

        if (nread > 0) {
            convert(inbuffer, nread, &outbuffer, &nout, "gbk", "utf8");
            fprintf(stdout, "%s\n", outbuffer);
        } else {
            fprintf(stderr, "read %s %d bytes failed, %s\n",
                    argv[i], (int)nread, strerror(errno));
        }

        free(inbuffer);
        free(outbuffer);
        fclose(fp);
    }

    return 0;
}
#endif

#ifdef _DEBUG_UTF8_STRING
/*
 * Debug driver: classify each command-line argument by looking at the start
 * of the string.  isutf8() is tried first and, when it reports a positive
 * length, that length is printed; otherwise isgbk() is tried; otherwise the
 * argument is reported as an unknown charset.  Exits with status -1 when no
 * argument is given.
 */
int main(int argc, char **argv)
{
    char **arg;
    size_t len;
    int nbytes;

    if (argc < 2) {
        fprintf(stderr, "Usage:%s string ...\n", argv[0]);
        _exit(-1);
    }

    for (arg = argv + 1; arg < argv + argc; arg++) {
        len = strlen(*arg);

        nbytes = isutf8(*arg, len);
        if (nbytes > 0) {
            fprintf(stdout, "\"%s\" is %d bytes UTF8 charset\n", *arg, nbytes);
            continue;
        }

        if (isgbk(*arg, len)) {
            fprintf(stdout, "\"%s\" is GBK charset\n", *arg);
            continue;
        }

        fprintf(stdout, "\"%s\" is unknown charset\n", *arg);
    }

    return 0;
}
#endif
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值