httpclient自动获取页面编码设置进行字符编码,使httpclient适用所有网页抓取不乱码
链接: http://blog.csdn.net/yangbobo1992/article/details/8560512
/**
 * Reads the response body of an HTTP method and decodes it to a string.
 *
 * <p>The charset is chosen in this order: the charset declared inside the page
 * (as reported by {@code getCharSetByBody}), then the charset reported by the
 * HTTP response ({@code getResponseCharSet()}), then UTF-8. A candidate is only
 * used when the running JVM actually supports it, so a missing, blank or
 * misspelled charset declaration no longer makes the decoding fail.
 *
 * @param method the HTTP method whose response body is read
 * @return the decoded response body, or an empty string when the method
 *         reports no response body
 * @throws Exception if the response body cannot be read or decoded
 */
private static String readInputStream(HttpMethod method) throws Exception{
    // Fallback used when the method type exposes no response charset.
    String charset = "UTF-8";
    if(method instanceof PostMethod){
        charset = ((PostMethod)method).getResponseCharSet();
    }else if(method instanceof GetMethod){
        // Checked explicitly: an unconditional cast would throw
        // ClassCastException for any other HttpMethod implementation.
        charset = ((GetMethod)method).getResponseCharSet();
    }
    byte[] bytes = method.getResponseBody();
    if(bytes == null){
        // No body available; nothing to decode.
        return "";
    }
    // First pass: decode only to locate the charset declared in the markup.
    // The declaration itself is ASCII, so it survives a wrong first guess.
    String body = new String(bytes,"UTF-8");
    String declared = getCharSetByBody(body,charset);
    String effective;
    if(isUsableCharsetName(declared)){
        effective = declared.trim();
    }else if(isUsableCharsetName(charset)){
        effective = charset.trim();
    }else{
        effective = "UTF-8";
    }
    // Second pass: decode the raw bytes with the charset finally chosen.
    return new String(bytes,effective);
}

/**
 * Tells whether a charset name is non-blank, legal and supported by this JVM.
 *
 * @param name candidate charset name, may be null
 * @return true only when the name can safely be used to decode bytes
 */
private static boolean isUsableCharsetName(String name){
    if(name == null || name.trim().length() == 0){
        return false;
    }
    try{
        return java.nio.charset.Charset.isSupported(name.trim());
    }catch(IllegalArgumentException e){
        // Raised for syntactically illegal charset names.
        return false;
    }
}
/**
 * Looks for a charset declaration in the {@code <meta>} elements of a page.
 *
 * <p>Two forms are recognised: the {@code charset} attribute
 * ({@code <meta charset="utf-8">}) and the {@code content} attribute of a
 * {@code <meta http-equiv="Content-Type">} element. The first non-blank
 * declaration found wins. A content-type meta without a charset is skipped
 * instead of wiping out the fallback value.
 *
 * @param html    page markup to inspect
 * @param charset fallback charset returned when the page declares none
 * @return the declared charset with surrounding whitespace removed, or
 *         {@code charset} when no declaration is found
 */
private static String getCharSetByBody(String html,String charset){
    Document document = parseJSoupDocumentFromHtml(html, Constants.parseBaseUri);
    Elements elements = document.select("meta");
    for(Element metaElement : elements){
        if(metaElement == null){
            continue;
        }
        // <meta charset="..."> form.
        String declared = metaElement.attr("charset");
        if(!StringUtils.isNotBlank(declared)
                && "content-type".equalsIgnoreCase(metaElement.attr("http-equiv"))){
            // <meta http-equiv="Content-Type" content="...; charset=..."> form.
            declared = getCharSet(metaElement.attr("content"));
        }
        if(StringUtils.isNotBlank(declared)){
            return declared.trim();
        }
    }
    // Nothing declared in the markup: keep the caller's fallback.
    return charset;
}
/**
 * Matches a {@code charset=value} parameter, ignoring case, allowing optional
 * whitespace around {@code =} and optional quotes around the value.
 * Compiled once because the pattern never changes.
 */
private static final Pattern CHARSET_PATTERN = Pattern.compile(
        "charset\\s*=\\s*[\"']?([^\\s;\"'>]+)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from a content-type style string such as
 * {@code "text/html; charset=utf-8"}.
 *
 * @param content the text to search, may be null
 * @return the charset value without surrounding whitespace or quotes, or
 *         null when {@code content} is null or contains no non-empty
 *         charset parameter
 */
private static String getCharSet(String content){
    if(content == null){
        return null;
    }
    Matcher matcher = CHARSET_PATTERN.matcher(content);
    if(matcher.find()){
        return matcher.group(1);
    }
    return null;
}
链接: http://blog.csdn.net/yangbobo1992/article/details/8560512