Unicode 码
在我写爬虫的过程中,经常遇到一些网站把中文输出成 \uXXXX 形式的 Unicode 转义序列。在对网页进行解析时,需要把这些转义序列还原为对应的中文字符。
例如,
\u5317\u4eac\u767e\u5ea6\u7f51\u8baf\u79d1\u6280\u6709\u9650\u516c\u53f8
对应的中文是
北京百度网讯科技有限公司
爬虫中的Unicode转中文
package navi.main;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Small crawler demo: downloads a page whose Chinese text is served as
 * backslash-u escape sequences (a backslash, the letter u, then four hex
 * digits) and prints the page with those sequences decoded.
 */
public class JsonTest2 {

    /** Message used whenever a backslash-u escape is incomplete or has a non-hex digit. */
    private static final String MALFORMED_ESCAPE = "Malformed \\uxxxx encoding.";

    /**
     * Fetches the hard-coded URL with Jsoup and prints the decoded document.
     *
     * @param args unused
     * @throws Exception if fetching the page fails or the page contains a malformed escape
     */
    public static void main(String[] args) throws Exception {
        // The Chinese text on this page is served in escaped form.
        String url = "http://bbs.paidai.com/api.php?act=ajax_get_more_olist&topicid=1132970&start=50&offset=50";
        // Fetch the page's HTML.
        Document document1 = Jsoup.connect(url).timeout(50000).userAgent("bbbb").get();
        // Print the HTML after decoding the escape sequences.
        System.out.println(convertUnicode(document1.toString()));
    }

    /**
     * Decodes backslash escapes in {@code ori}.
     *
     * <ul>
     *   <li>A backslash followed by {@code u} and four ASCII hex digits becomes the
     *       UTF-16 code unit with that value.</li>
     *   <li>A backslash followed by {@code t}, {@code r}, {@code n} or {@code f} becomes
     *       tab, carriage return, line feed or form feed.</li>
     *   <li>A backslash followed by any other character yields that character
     *       (the backslash is dropped).</li>
     *   <li>A lone backslash at the very end of the input is kept as-is.</li>
     * </ul>
     *
     * @param ori the text to decode; must not be {@code null}
     * @return the decoded text
     * @throws IllegalArgumentException if a backslash-u escape is truncated or contains
     *         a character that is not an ASCII hex digit
     * @throws NullPointerException if {@code ori} is {@code null}
     */
    public static String convertUnicode(String ori) {
        final int len = ori.length();
        final StringBuilder decoded = new StringBuilder(len);
        int pos = 0;
        while (pos < len) {
            final char current = ori.charAt(pos++);
            if (current != '\\') {
                decoded.append(current);
                continue;
            }
            if (pos >= len) {
                // Nothing follows the backslash: keep it rather than reading past the end.
                decoded.append(current);
                break;
            }
            final char escape = ori.charAt(pos++);
            if (escape != 'u') {
                decoded.append(unescapeSimple(escape));
                continue;
            }
            if (len - pos < 4) {
                // Fewer than four characters remain, so the escape cannot be complete.
                throw new IllegalArgumentException(MALFORMED_ESCAPE);
            }
            int value = 0;
            for (int i = 0; i < 4; i++) {
                final int digit = hexDigitValue(ori.charAt(pos++));
                if (digit < 0) {
                    throw new IllegalArgumentException(MALFORMED_ESCAPE);
                }
                value = (value << 4) + digit;
            }
            decoded.append((char) value);
        }
        return decoded.toString();
    }

    /** Returns the value of an ASCII hex digit (0-9, a-f, A-F), or -1 for any other character. */
    private static int hexDigitValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return 10 + c - 'a';
        }
        if (c >= 'A' && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /** Maps the character after a backslash to the control character it names, or to itself. */
    private static char unescapeSimple(char c) {
        switch (c) {
            case 't':
                return '\t';
            case 'r':
                return '\r';
            case 'n':
                return '\n';
            case 'f':
                return '\f';
            default:
                return c;
        }
    }
}