package com.tl.spider.download;
import com.tl.spider.utils.StaticValue;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * @ClassName CharsetDetectorUtil
 * @Description Given a URL, detects the character encoding of the page it serves.
 * @Author Administrator
 * @Date 2019/5/19 13:57
 * @Version 1.0
 **/
public class CharsetDetectorUtil {

    /**
     * Matches a {@code charset=value} declaration as it appears in a Content-Type header value
     * or in an HTML {@code <meta>} tag. The value may be bare or wrapped in single or double
     * quotes; group 1 is the charset name without the quotes.
     */
    private static final Pattern CHARSET_DECLARATION =
            Pattern.compile("charset\\s*=\\s*[\"']?\\s*([\\w.:+-]+)", Pattern.CASE_INSENSITIVE);

    /** Lower-cased closing tag of the document head; scanning stops once it has been seen. */
    private static final String HEAD_END = "</head>";

    /**
     * Detects the character encoding declared for the resource behind {@code url}.
     *
     * <p>The {@code Content-Type} response header is consulted first. If it carries no
     * {@code charset} parameter, the document head is scanned line by line for a
     * {@code charset=...} declaration, using the same connection.
     *
     * @param url address of the page to inspect
     * @return the declared charset name (as sent when it comes from the header, lower-cased
     *         when it comes from the markup), or {@code null} if no declaration was found
     * @throws Exception if the URL is malformed or the resource cannot be read
     */
    public static String getCharset(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        try {
            // getContentType() performs the header lookup for us, so the result does not
            // depend on how the server capitalised the header name.
            String charset = extractCharset(connection.getContentType());
            if (charset == null) {
                charset = findCharsetInHead(connection);
            }
            return charset;
        } finally {
            // Release the HTTP connection even when the body was never read.
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
    }

    /**
     * Extracts the charset name from a single line of markup.
     *
     * @param line one line of page source; may be {@code null}
     * @return the charset name without surrounding quotes, or {@code null} if the line holds
     *         no {@code charset=...} declaration
     */
    public static String getCharSetValue4Line(String line) {
        return extractCharset(line);
    }

    /**
     * Prints the detected charset of the URL given as the first argument, or of a sample
     * page when no argument is supplied.
     *
     * @param args optional: the URL to inspect
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        // Other pages that were used to exercise the detector:
        //   https://www.baidu.com/
        //   https://hao.360.com/?s0001
        String url = args.length > 0 ? args[0] : "http://news.youth.cn/";
        System.out.println(getCharset(url));
    }

    /** Returns the value of the first {@code charset=...} declaration in {@code text}, or null. */
    private static String extractCharset(String text) {
        if (text == null) {
            return null;
        }
        Matcher matcher = CHARSET_DECLARATION.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Scans the response body up to the end of the head for a charset declaration, or null. */
    private static String findCharsetInHead(URLConnection connection) throws IOException {
        // The declaration sits in the first few lines of the page, so the whole document is
        // never downloaded. Only the ASCII declaration is inspected, so the bytes are decoded
        // with the byte-transparent ISO-8859-1 charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.ISO_8859_1))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the lower-casing independent of the default locale.
                String lowered = line.toLowerCase(Locale.ROOT);
                String charset = extractCharset(lowered);
                if (charset != null) {
                    return charset;
                }
                if (lowered.contains(HEAD_END)) {
                    break;
                }
            }
        }
        return null;
    }
}
其中 getBR 函数(即上文调用的 WebPageDownLoadUtil.getBR)为:
/**
 * Opens the given URL and returns a buffered reader that decodes its content.
 *
 * @param url     address to open
 * @param charset name of the charset used to decode the bytes read from the URL
 * @return a reader over the URL's content; the caller is responsible for closing it
 * @throws Exception if the URL is malformed, the stream cannot be opened, or
 *                   {@code charset} is null or not a supported charset name
 */
public static BufferedReader getBR(String url, String charset) throws Exception {
    InputStream inputStream = new URL(url).openStream();
    try {
        return new BufferedReader(new InputStreamReader(inputStream, charset));
    } catch (Exception e) {
        // No reader was handed out, so nothing else can close the already-open stream;
        // close it here before propagating the original failure.
        try {
            inputStream.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}