Java下载网页的方法主要有两种:Java自带的HttpURLConnection类和HttpClient类库,两种方法各有优点。另外,对于中文乱码的处理,本文在代码中做了详细的体现和比较,能够很好地消除中文乱码问题,供大家参考。下面就让我们在代码中领悟吧!
方法1:HttpURLConnection的两种不同解码方式
package com.learn.http.impl;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import com.learn.http.Http;
import com.learn.util.SingleMatch;
public class HttpURLConnectionImp1 implements Http {
/**
* 采用Java自带的HttpURLConnection,优点:方便,不用导入其他包
* 缺点:该方法虽然对编码进行了转换,但由于缓冲区大小固定(如1024字节)且每次读取的字节块被单独解码,一个多字节字符可能恰好被切分到相邻两次读取中,从而导致部分中文乱码
* @param pageUrl
* @param encoding
* @return
*/
public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
URL url = null;
HttpURLConnection conn = null;
InputStream in = null;
StringBuffer sb = null;
try {
url = new URL(pageUrl);
conn = (HttpURLConnection) url.openConnection();
sb = new StringBuffer();
if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
in = conn.getInputStream();
byte[] buf = new byte[1024];
int len = 0;
while ((len=in.read(buf)) != -1)
sb.append(new String(buf, 0, len, encoding));
in.close();
}
else System.err.println("访问网络失败!"+conn.getResponseCode());
} catch (MalformedURLException e) {
System.err.println("url格式不规范:"+e.getMessage());
} catch (IOException e) {
System.err.println("IO操作错误:"+e.getMessage())