// 首先读取头部信息，读到字符编码就停止；设定字符编码后继续读取网页内容。
package test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Locale;
/**
 * Fetches a web page and prints it to standard output, decoded with the
 * character encoding the page declares.
 *
 * <p>The page is downloaded twice. A first pass scans the raw text for the
 * first line that mentions {@code charset} and matches it against
 * {@link #coding}. A second pass re-downloads the page and decodes it with the
 * encoding that was found. If the page does not name a supported encoding, the
 * HTTP {@code Content-Type} header is consulted, and finally
 * {@link #DEFAULT_CODING} is used.
 */
public class Main {
    /** Encoding names recognised in a charset declaration, tried in this order. */
    static String[] coding = {"utf-8", "gb2312", "gbk", "unicode"};

    /** Keyword that marks a line (or header value) as a charset declaration. */
    private static final String CHARSET_KEYWORD = "charset";

    /** Encoding used when neither the page nor the Content-Type header names a supported one. */
    private static final String DEFAULT_CODING = "utf-8";

    /**
     * Downloads the hard-coded URL and prints its content line by line.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched or read
     */
    public static void main(String[] args) throws IOException {
        String urlStr = "http://www.sina.com.cn";
        URL url = new URL(urlStr);
        URLConnection connection = url.openConnection();
        String ss = connection.getContentType();
        System.out.println(ss);

        String pageCoding = null;
        // Pass 1: reuse the connection that was already opened for the header
        // lookup. ISO-8859-1 maps every byte to exactly one char, so the ASCII
        // text "charset=..." is readable whatever the real page encoding or
        // the platform default charset is.
        try (InputStream in = connection.getInputStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "ISO-8859-1"))) {
            System.out.println("build connection");
            String s;
            while ((s = br.readLine()) != null) {
                if (!mentionsCharset(s)) {
                    continue;
                }
                // Only the first line mentioning "charset" is considered.
                pageCoding = detectCharset(s);
                if (pageCoding == null) {
                    System.out.println("error" + s);
                }
                break;
            }
        }

        // Fall back to the HTTP header, then to a fixed default, so the page is
        // always printed even when it carries no usable declaration.
        if (pageCoding == null) {
            pageCoding = detectCharset(ss);
        }
        if (pageCoding == null) {
            pageCoding = DEFAULT_CODING;
        }
        System.out.println(pageCoding);

        // Pass 2: fetch the page again from the start and decode it properly.
        try (InputStream in = url.openStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, pageCoding))) {
            System.out.println("build connection2");
            String s;
            while ((s = br.readLine()) != null) {
                System.out.println(s);
            }
        }
    }

    /**
     * Tells whether the text contains the word {@code charset}, ignoring case.
     *
     * @param line text to inspect; may be {@code null}
     * @return {@code true} if the keyword is present
     */
    static boolean mentionsCharset(String line) {
        return line != null && line.toLowerCase(Locale.ROOT).contains(CHARSET_KEYWORD);
    }

    /**
     * Extracts a supported encoding name from a charset declaration.
     *
     * <p>Matching is case-insensitive, so {@code charset="UTF-8"} is recognised.
     * Entries of {@link #coding} are tried in array order and the first one
     * that occurs anywhere in the text wins.
     *
     * @param line an HTML line or a Content-Type header value; may be {@code null}
     * @return the matching entry of {@link #coding}, or {@code null} if the
     *         text does not mention {@code charset} or names no supported encoding
     */
    static String detectCharset(String line) {
        if (!mentionsCharset(line)) {
            return null;
        }
        String lowered = line.toLowerCase(Locale.ROOT);
        for (String candidate : coding) {
            if (lowered.contains(candidate.toLowerCase(Locale.ROOT))) {
                return candidate;
            }
        }
        return null;
    }
}