// java抓取网页的乱码问题(通用) — 2010-12-22 13:34
// 解决抓取页面的乱码问题
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
/**
 * Fetches a web page and saves it to a local file without garbling its text.
 *
 * <p>The page is decoded with the charset announced in the HTTP
 * {@code Content-Type} header (falling back to {@code gb2312} when the header
 * carries none) and the file is written back in that same charset, so the
 * saved bytes stay consistent with any encoding the page declares for itself.
 */
public class DownPage {

    /** Address of the page to fetch. */
    private static final String PAGE_URL = "http://www.baidu.com";

    /** Location the fetched page is saved to. */
    private static final String OUTPUT_PATH = "d:/a.html";

    /** Charset name used when the response does not announce a usable one. */
    private static final String FALLBACK_CHARSET = "gb2312";

    /** Name of the Content-Type parameter that carries the charset. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Downloads {@link #PAGE_URL}, echoes the Content-Type header and every
     * line of the page to standard output, and writes the page to
     * {@link #OUTPUT_PATH}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be read or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL(PAGE_URL);
        URLConnection connection = url.openConnection();

        // Reading the header connects; the body is then read from this same
        // connection instead of issuing a second request via url.openStream().
        String contentType = connection.getContentType();
        System.out.println(contentType);

        Charset charset = charsetFromContentType(contentType, Charset.forName(FALLBACK_CHARSET));
        File file = new File(OUTPUT_PATH);

        // try-with-resources closes both streams even when the copy fails midway.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(connection.getInputStream(), charset));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(file), charset))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                bw.write(line);
                // readLine() strips the terminator; put one back so the saved
                // page keeps its line structure.
                bw.newLine();
            }
        }
    }

    /**
     * Extracts the charset from an HTTP {@code Content-Type} header value such as
     * {@code text/html; charset=UTF-8}.
     *
     * <p>Package-private so it can be exercised without network access.
     *
     * @param contentType the header value; may be {@code null}
     * @param fallback    charset returned when the header is {@code null}, has no
     *                    {@code charset} parameter, or names a charset this JVM
     *                    does not support
     * @return the announced charset, or {@code fallback}
     */
    static Charset charsetFromContentType(String contentType, Charset fallback) {
        if (contentType == null) {
            return fallback;
        }
        for (String rawParam : contentType.split(";")) {
            String param = rawParam.trim();
            // Case-insensitive prefix match that does not depend on the default locale.
            if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String name = param.substring(CHARSET_PARAM.length()).trim();
            // The value may be a quoted string: charset="utf-8".
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                if (Charset.isSupported(name)) {
                    return Charset.forName(name);
                }
            } catch (IllegalArgumentException ignored) {
                // Syntactically illegal charset name in the header; use the fallback.
            }
        }
        return fallback;
    }
}
分享到:
2011-03-17 23:59
浏览 6395
评论