// 截取出来的字符串中文乱码 — repairing garbled Chinese text in strings obtained by substring extraction
#if 0
// Disabled draft (compiled out by `#if 0`). It reads `string` and `self` and ends
// with `return`, so it appears to be the body of a method whose signature is not
// part of this fragment. It classifies the bytes obtained from `string` and picks
// one of the repair helpers below.
NSString *retStr;
// 1. UTF-8 encoded
// `dec` is the encoding used to turn `string` back into bytes; `enc` is the
// encoding those bytes are then decoded with.
NSStringEncoding dec = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingMacJapanese);
NSStringEncoding enc = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
const char * sc = [string cStringUsingEncoding:dec];
// NOTE(review): `sc` is passed to a `%s` format here, before the NULL check below.
DLog(@"sc = %s",sc);
if (sc == NULL) {// "normal encoding" (original note); taken when no C string could be produced in `dec`
DLog(@"null");
NSData *data=[string dataUsingEncoding:dec];
// NOTE(review): this result is only logged; `retStr` is overwritten two lines below.
retStr = [[NSString alloc] initWithData:data encoding:enc];
DLog(@"retStr = %@",retStr);
retStr = [self handleCharacterSet:string withEncoding:enc];
}else if (IsTextUTF8(sc, strlen(sc))) {
// The bytes form UTF-8 sequences: decode the `dec` bytes as UTF-8.
DLog(@"utf-8");
NSData *data=[string dataUsingEncoding:dec];
retStr = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
}else if (isgbk(sc, strlen(sc))){
// The first two bytes look like a GBK character.
DLog(@"gbk");
retStr = [self gb_2312ToUTF_8WithString:string];
}else{
// Fallback: same conversion as the GBK branch, only the log text differs.
DLog(@"normal");
retStr = [self gb_2312ToUTF_8WithString:string];
}
return retStr;
#endif
/// Re-interprets the bytes of `string` as GB 18030 text.
///
/// The string is encoded to bytes with the Mac Japanese encoding and those bytes
/// are then decoded as GB 18030-2000. Presumably this undoes an earlier step that
/// decoded GB bytes as Mac Japanese — confirm against the caller.
///
/// @param string The (possibly mis-decoded) text to repair.
/// @return The re-decoded string; the result of the decode is returned as-is,
///         including when the decode does not succeed.
- (NSString *)gb_2312ToUTF_8WithString:(NSString *)string
{
    // Step 1: recover the raw bytes by encoding with Mac Japanese.
    NSData *rawBytes =
        [string dataUsingEncoding:CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingMacJapanese)];

    // Step 2: read those bytes back as GB 18030.
    NSStringEncoding gb18030 = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
    NSString *decoded = [[NSString alloc] initWithData:rawBytes encoding:gb18030];

    DLog(@"retStr = %@",decoded);
    return decoded;
}
/// Rebuilds a string from the low byte of every UTF-16 unit of `string` and decodes
/// the resulting byte sequence with `enc`.
///
/// Each UTF-16 code unit is narrowed to one `char` (its high byte is discarded), so
/// this is only meaningful when every unit of `string` actually carries a single raw
/// byte — assumed to be the case for the mis-decoded input this is used on; confirm
/// against the caller.
///
/// @param string Text whose code units hold raw bytes.
/// @param enc    Encoding used to decode the collected bytes.
/// @return The decoded string, or `string` itself when decoding fails or the scratch
///         buffer cannot be allocated.
- (NSString *)handleCharacterSet:(NSString *)string withEncoding:(NSStringEncoding)enc
{
    NSUInteger unitCount = [string length];
    char *narrowed = malloc(unitCount + 1);
    if (narrowed == NULL)
    {
        // Allocation failed: fall back to the input, exactly as a failed decode does.
        return string;
    }

    // The index has the same width as the length, so it cannot wrap on long strings.
    for (NSUInteger i = 0; i < unitCount; i++)
    {
        narrowed[i] = (char)[string characterAtIndex:i];
    }
    narrowed[unitCount] = '\0';

    // The buffer is consumed as a C string, so a unit whose low byte is 0 ends the
    // decoded text at that position.
    NSString *decoded = [NSString stringWithCString:narrowed encoding:enc];
    free(narrowed);

    return (decoded != nil) ? decoded : string;
}
/// Returns YES when the `count` bytes starting at `sequence` decode as GB 18030-2000.
static BOOL GB18030SequenceIsDecodable(const Byte *sequence, CFIndex count)
{
    CFStringRef decoded = CFStringCreateWithBytes(kCFAllocatorDefault, sequence, count,
                                                  kCFStringEncodingGB_18030_2000, false);
    if (decoded == NULL) {
        return NO;
    }
    CFRelease(decoded);
    return YES;
}

/// Produces a copy of `sender` in which every byte sequence that is not a valid
/// GB 18030 character has been repaired, so the result can be decoded as GB 18030.
///
/// The input is scanned character by character:
///  - 0x00–0x7f is a single-byte character and is kept.
///  - A lead byte 0x81–0xfe followed by a trail byte 0x40–0x7e / 0x80–0xfe is a
///    two-byte candidate; a lead byte followed by 0x30–0x39, 0x81–0xfe, 0x30–0x39 is
///    a four-byte candidate. A candidate is kept only if Core Foundation can decode it.
///  - When a candidate is rejected, only its lead byte is replaced by "?"; scanning
///    resumes at the very next byte because it may start a valid character itself.
///  - Bytes 0x80 and 0xff in lead position are dropped without a replacement.
///  - If the input ends in the middle of a two- or four-byte sequence, the
///    unfinished tail (from its lead byte onwards) is dropped.
///
/// @param sender Bytes that are expected to be GB 18030 text.
/// @return `sender` itself when it is empty (or nil); otherwise a new data object
///         holding the repaired bytes.
- (NSData *)dataByHealingGB18030Stream:(NSData *)sender {
    NSUInteger length = [sender length];
    if (length == 0) {
        return sender;
    }

    NSStringEncoding gb18030 = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
    NSData *replacement = [@"?" dataUsingEncoding:gb18030];
    NSMutableData *healed = [NSMutableData dataWithCapacity:length];
    const Byte *bytes = [sender bytes];

    // Accepted bytes are copied verbatim, so the output is made of contiguous runs of
    // the input. `runStart` marks the first accepted byte not yet copied to `healed`.
    NSUInteger runStart = 0;
    NSUInteger index = 0;

    while (index < length) {
        const Byte lead = bytes[index];
        const NSUInteger available = length - index;
        NSUInteger accepted = 0;   // byte length of the valid character at `index`, 0 if none
        BOOL substitute = NO;      // whether the rejected lead byte is replaced by "?"

        if (lead <= 0x7f) {
            accepted = 1;
        } else if (lead >= 0x81 && lead <= 0xfe) {
            if (available < 2) {
                // The lead byte announces more bytes than the input has left.
                break;
            }
            const Byte second = bytes[index + 1];
            if (second >= 0x40 && second <= 0xfe && second != 0x7f) {
                // Two-byte candidate; the range test must look at the trail byte.
                if (GB18030SequenceIsDecodable(bytes + index, 2)) {
                    accepted = 2;
                } else {
                    substitute = YES;
                }
            } else if (second >= 0x30 && second <= 0x39) {
                if (available < 4) {
                    // Unfinished four-byte sequence at the end of the input.
                    break;
                }
                const Byte third = bytes[index + 2];
                const Byte fourth = bytes[index + 3];
                if (third >= 0x81 && third <= 0xfe &&
                    fourth >= 0x30 && fourth <= 0x39 &&
                    GB18030SequenceIsDecodable(bytes + index, 4)) {
                    accepted = 4;
                } else {
                    substitute = YES;
                }
            } else {
                // Not a legal second byte, but it may be the first byte of the next character.
                substitute = YES;
            }
        }
        // Any other lead byte (0x80, 0xff) is rejected with `substitute` left at NO.

        if (accepted > 0) {
            index += accepted;
            continue;
        }

        // Reject bytes[index]: close the current run, optionally emit "?", and
        // resume with the byte right after the rejected one.
        [healed appendBytes:bytes + runStart length:index - runStart];
        if (substitute) {
            [healed appendData:replacement];
        }
        index += 1;
        runStart = index;
    }

    // Copy the last run. After a truncation `index` still points at the unfinished
    // lead byte, so the tail is left out; otherwise `index` equals `length`.
    [healed appendBytes:bytes + runStart length:index - runStart];
    return healed;
}
/**
 * Reports whether the buffer starts with a byte pair in the GBK double-byte range.
 *
 * Only the first two bytes are inspected: the lead byte must be 0x81–0xfe and the
 * trail byte must be 0x40–0x7e or 0x80–0xfe.
 *
 * @param s  Buffer to inspect; may be NULL.
 * @param ns Number of bytes available in `s`.
 * @return 1 if the first two bytes form a GBK double-byte pattern, 0 otherwise
 *         (including when `s` is NULL or holds fewer than two bytes).
 */
int isgbk(const char *s, size_t ns)
{
    if (s == NULL || ns < 2) {
        return 0;
    }

    const uint8_t lead = (uint8_t)s[0];
    const uint8_t trail = (uint8_t)s[1];

    if (lead < 0x81 || lead > 0xfe) {
        return 0;
    }
    // Trail bytes cover 0x40–0xfe with the single gap 0x7f.
    if (trail < 0x40 || trail > 0xfe || trail == 0x7f) {
        return 0;
    }
    return 1;
}
/**
 * Heuristically decides whether a byte buffer is UTF-8 encoded text.
 *
 * Every multi-byte sequence must consist of a lead byte followed by the right number
 * of continuation bytes (10xxxxxx). The legacy 5- and 6-byte forms (lead bytes
 * 0xF8–0xFD) are still accepted; 0xFE and 0xFF, which never occur in UTF-8, are
 * rejected. Overlong or out-of-range code points are not checked.
 *
 * A buffer made only of ASCII bytes yields 0: it gives no evidence of UTF-8.
 *
 * @param str    Buffer to inspect; may be NULL.
 * @param length Number of bytes to inspect.
 * @return 1 if the buffer contains at least one non-ASCII byte and all sequences are
 *         well formed and complete, 0 otherwise.
 */
int IsTextUTF8(const char* str,long length)
{
    if (str == NULL) {
        return 0;
    }

    long pending = 0;       /* continuation bytes still expected for the current character */
    int sawNonAscii = 0;    /* set once any byte has its high bit set */

    for (long i = 0; i < length; i++) {
        const unsigned char chr = (unsigned char)str[i];

        if ((chr & 0x80) != 0) {
            sawNonAscii = 1;
        }

        if (pending > 0) {
            /* Inside a multi-byte character: only 10xxxxxx is allowed. */
            if ((chr & 0xC0) != 0x80) {
                return 0;
            }
            pending--;
            continue;
        }

        if (chr < 0x80) {
            continue;       /* plain ASCII */
        }

        /* Lead byte: derive the number of continuation bytes that must follow. */
        if (chr >= 0xFE) {
            return 0;       /* 0xFE / 0xFF are never part of UTF-8 */
        } else if (chr >= 0xFC) {
            pending = 5;
        } else if (chr >= 0xF8) {
            pending = 4;
        } else if (chr >= 0xF0) {
            pending = 3;
        } else if (chr >= 0xE0) {
            pending = 2;
        } else if (chr >= 0xC0) {
            pending = 1;
        } else {
            return 0;       /* stray continuation byte in lead position */
        }
    }

    if (pending > 0) {
        return 0;           /* buffer ended in the middle of a character */
    }
    return sawNonAscii ? 1 : 0;
}