场景:
1.在没有HTML库可用的情况下(谁有好用的C/C++ HTML库可以介绍一下?tinyXML?),以SAX方式解析HTML时会读到HTML转义序列(例如 &amp;amp; 或 &amp;#8736;),需要先把它们还原(反转义)成正常字符才能使用。
2.耗时,4-6小时(被打扰)。
3.std::replace虽然好用,但每替换一种字符都要完整遍历一次字符串,效率应该不如只遍历一次就完成全部替换高.
#include <algorithm>
void replace( iterator start, iterator end, const TYPE& old_value, const TYPE& new_value );
4.所有转义字符的网址:
http://114.xixik.com/character/
文件1:test_htmlescape.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
using namespace std;
// Reports the byte order of the machine the program is running on.
//
// Returns 1 when the lowest-addressed byte of an int holding the value 1 is
// itself 1 (little-endian layout), and 0 otherwise.
int IsLittleEndian()
{
    const int probe = 1;
    const char* lowest_address_byte = reinterpret_cast<const char*>(&probe);
    return (lowest_address_byte[0] == 1) ? 1 : 0;
}
// Converts a decimal HTML entity code (for example "8658") into one 16-bit
// code unit.
//
// entity_code: NUL-terminated decimal text, parsed with atoi().
// html_char:   receives exactly two bytes, least-significant byte first.
//              No terminator is written; the caller owns the buffer.
//
// Only the low 16 bits of the parsed number are kept. If either pointer is
// NULL the call does nothing.
void HtmEscapeEntityCodeToUnicode(const char* entity_code,char* html_char)
{
    if (!entity_code || !html_char)
    {
        return;
    }

    // The bytes are extracted with shifts instead of by inspecting the int's
    // memory. The previous big-endian branch read the first two bytes of the
    // int, which on a 4-byte big-endian int are the two MOST significant
    // bytes (always zero for 16-bit codes). Shifting is byte-order
    // independent and yields the same result the little-endian branch did.
    const unsigned int code_unit =
        static_cast<unsigned int>(atoi(entity_code)) & 0xFFFFu;
    html_char[0] = static_cast<char>(code_unit & 0xFFu);
    html_char[1] = static_cast<char>((code_unit >> 8) & 0xFFu);
}
// Encodes a single Unicode code point as UTF-8.
//
// unicode_char:        the code point as raw bytes, least-significant byte
//                      first.
// unicode_char_length: number of bytes available in unicode_char.
// utf_char:            receives 1 to 4 bytes. No terminator is written, so a
//                      caller that wants a C string must zero the buffer
//                      beforehand and make it at least 5 bytes long.
//
// Example: U+0192 (binary 1 1001 0010) becomes 0xC6 0x92
// (binary 11000110 10010010).
//
// A value above 0x10FFFF is reported on stderr and trips assert(0); nothing
// is written to utf_char in that case. NULL pointers are ignored.
void OneUnicode2UTF8(const char* unicode_char,size_t unicode_char_length,
                     char* utf_char)
{
    if (!unicode_char || !utf_char)
    {
        return;
    }

    // Assemble the value byte by byte. The previous memcpy() into a 4-byte
    // int overflowed it whenever unicode_char_length exceeded sizeof(int) and
    // produced a host-byte-order dependent result.
    unsigned long value = 0;
    bool too_big = false;
    for (size_t i = 0; i < unicode_char_length; ++i)
    {
        const unsigned long byte = static_cast<unsigned char>(unicode_char[i]);
        if (i < 4)
        {
            value |= byte << (8 * i);
        }
        else if (byte != 0)
        {
            // Anything set beyond 32 bits cannot be a valid code point.
            too_big = true;
        }
    }

    if (too_big || value > 0x10FFFFul)
    {
        std::cerr << "value too big." << std::endl;
        assert(0);
        return;
    }

    if (value <= 0x7Ful)
    {
        utf_char[0] = static_cast<char>(value);
    }
    else if (value <= 0x7FFul)
    {
        utf_char[0] = static_cast<char>((value >> 6) | 0xC0);
        utf_char[1] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else if (value <= 0xFFFFul)
    {
        utf_char[0] = static_cast<char>((value >> 12) | 0xE0);
        utf_char[1] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>((value & 0x3F) | 0x80);
    }
    else
    {
        utf_char[0] = static_cast<char>((value >> 18) | 0xF0);
        utf_char[1] = static_cast<char>(((value >> 12) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[3] = static_cast<char>((value & 0x3F) | 0x80);
    }
}
// Flat list of (entity name, decimal code) string pairs: every even index
// holds an entity name and the odd index right after it holds that entity's
// decimal code as text.
static const char* kEntityNameToEntityCodeMap[] =
{
"oelig","339","amp","38","rArr","8658","fnof","402"
};

// Returns the decimal code string for an HTML entity name (the text between
// '&' and ';', e.g. "amp" -> "38"), or NULL when the name is NULL or not in
// the table. The comparison is case-sensitive. The returned pointer refers
// to the static table and must not be freed.
//
// NOTE: this is a linear scan with strcmp(); there is a lot of room for
// optimisation here.
const char* HtmEscapeEntityNameToEntityCode(const char* entity_name)
{
    // strcmp() on a NULL pointer is undefined behaviour, so reject it here.
    if (!entity_name)
    {
        return NULL;
    }

    const size_t slot_count =
        sizeof(kEntityNameToEntityCodeMap) / sizeof(kEntityNameToEntityCodeMap[0]);

    // "slot + 1 < slot_count" keeps the code lookup in bounds even if the
    // table is ever left with a dangling name that has no code after it.
    for (size_t slot = 0; slot + 1 < slot_count; slot += 2)
    {
        if (strcmp(entity_name, kEntityNameToEntityCodeMap[slot]) == 0)
        {
            return kEntityNameToEntityCodeMap[slot + 1];
        }
    }
    return NULL;
}
string UnescapeUTFHTMLContent(const char* str)
{
string temp;
char* pos_amp = NULL;
char* pos_semicolon = (char*)str;
const char* start_amp = str;
int entity_length = 0;
char entity_code[5];
const int kMaxEntityLength = 4;
char entity_name[20];
const int kMaxEntityNameLength = 18;
char unicode[3];
char utf8[4];
while(true)
{
if(!start_amp || !(*start_amp))
{
break;
}
pos_amp = strchr(start_amp,'&');
if(!pos_amp)
{
temp.append(start_amp);
break;
}
int pos_no = pos_amp - pos_semicolon;
if(pos_no > 0)
{
temp.append(start_amp,pos_no);
start_amp = pos_amp;
}
char* pos_amp1 = pos_amp+1;
if(!pos_amp1 || !(*pos_amp1))
{
string t2(start_amp);
temp.append(start_amp);
break;
}
if(isalpha(*pos_amp1))
{
pos_semicolon = strchr(pos_amp1,';');
if(pos_semicolon)
{
//调用 HtmEscapeEntityNameToEntityCode
memset(entity_name,0,sizeof(entity_name));
entity_length = ((pos_semicolon - pos_amp1) >
kMaxEntityNameLength)?kMaxEntityNameLength:
(pos_semicolon - pos_amp1);
strncpy(entity_name,pos_amp1,entity_length);
const char* entity_code_c =
HtmEscapeEntityNameToEntityCode(entity_name);
if(entity_code_c)
{
memset(unicode,0,sizeof(unicode));
memset(utf8,0,sizeof(utf8));
HtmEscapeEntityCodeToUnicode(entity_code_c,unicode);
OneUnicode2UTF8(unicode,2,utf8);
temp.append(utf8);
}else
{
temp.append(entity_name);
}
//1.entity_name转换为entity_code之后再转换为utf8字符.
start_amp = pos_semicolon + 1;
pos_semicolon+=1;
}else
{
start_amp = pos_amp1;
}
}else if(*pos_amp1 =='#')
{
char* pos_digit = (pos_amp1+1);
if(!pos_digit)
{
break;
}
if(isdigit(*pos_digit))
{
//1.需要判断数值小于10000.
pos_semicolon = strchr(start_amp,';');
if(pos_semicolon)
{
memset(entity_code,0,sizeof(entity_code));
entity_length = ((pos_semicolon - pos_digit) >
kMaxEntityLength)?kMaxEntityLength:
(pos_semicolon - pos_digit);
strncpy(entity_code,pos_digit,entity_length);
memset(unicode,0,sizeof(unicode));
memset(utf8,0,sizeof(utf8));
HtmEscapeEntityCodeToUnicode(entity_code,unicode);
OneUnicode2UTF8(unicode,2,utf8);
temp.append(utf8);
start_amp = pos_semicolon + 1;
pos_semicolon+=1;
}else
{
start_amp = pos_digit;
}
}
}else
{
string sa(start_amp,pos_amp1 - start_amp);
temp.append(sa);
start_amp = pos_amp1;
}
}
return temp;
}
// Demo driver: prints a greeting, then runs UnescapeUTFHTMLContent() over a
// fixed list of sample strings and prints each result after its label.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // One row per sample: the label printed in front of the decoded text and
    // the input handed to the decoder.
    struct Sample
    {
        const char* label;
        const char* html;
    };
    static const Sample kSamples[] =
    {
        {"str: ", "ΖabcdΕhello©<a>⇒⇒"
                  "ƒ…</a>"
                  "asfas‡dfeΥΨΩ<img>n↓n⋅nωmmm</img>1jh"},
        {"str: ", "<td>&#8736;</td>"},
        {"str: ", "<td>&#8736;</td>;;#8736;"},
        {"str: ", "<td>&#8736;</td>;&"},
        {"str: ", "<td>&#8736;</td>;&"},
        {"str1: ", "<td>&#8736;</td>;&"},
        {"ab: ", "ab"}
    };
    const size_t sample_count = sizeof(kSamples) / sizeof(kSamples[0]);

    printf("Hello, world\n");
    for (size_t index = 0; index < sample_count; ++index)
    {
        const std::string decoded = UnescapeUTFHTMLContent(kSamples[index].html);
        std::cout << kSamples[index].label << decoded << std::endl;
    }
    return 0;
}
str: ΖabcdΕhello©<a>⇒⇒ƒ…</a>asfas‡dfeΥPsiΩ<img>n↓nsdotnωmmm</img>1jh
str: <td>∠</td>
str: <td>∠</td>;;#8736;
str: <td>∠</td>;&
str: <td>∠</td>;amp
str1: <td>∠</td>;&
ab: ab