package com.gesoft.html;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;
/**
 * Static helpers for downloading page source over a URL, resolving the final
 * URL of a request, and decoding backslash escape sequences in text.
 *
 * <p>All download helpers report failure by returning {@code null} rather than
 * by throwing.
 */
public class DownloadUtil {

    /** Terminator appended after every line collected by {@link #getHtml}. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Charset used by {@link #getURLContent} when the caller supplies none. */
    private static final String DEFAULT_FALLBACK_ENCODING = "gb2312";

    /**
     * Downloads the source of a web page.
     *
     * <p>For HTTP(S) URLs the request is sent with a fixed set of browser-like
     * headers, the optional cookie header and the given timeout; the body is
     * read line by line (each line terminated with CR LF) and then passed
     * through {@link #decodeUnicode(String)}. If the URL is not an HTTP URL, or
     * the HTTP attempt fails for any reason, the page is fetched again with a
     * plain read equivalent to {@link #getURLContent(String, String)}; that
     * fallback result is returned as read (no line normalisation, no escape
     * decoding) and does not use the proxy.
     *
     * @param strUrl      address of the page; {@code null} or empty yields {@code null}
     * @param timeout     connect and read timeout in milliseconds
     * @param strEnCoding charset name used to decode the body; {@code null} or the
     *                    literal string {@code "null"} selects the platform default,
     *                    as does an unsupported charset name
     * @param cookies     value for the {@code Cookie} request header, or {@code null}
     * @param proxy       proxy to connect through, or {@code null} for none
     * @return the page source, or {@code null} if it could not be downloaded
     */
    public static String getHtml(String strUrl, int timeout, String strEnCoding, String cookies, Proxy proxy) {
        if (strUrl == null || strUrl.length() == 0) {
            return null;
        }
        HttpURLConnection httpConnection = null;
        InputStream urlStream = null;
        BufferedReader br = null;
        String html = null;
        try {
            URL url = new URL(strUrl);
            URLConnection connection = (proxy != null) ? url.openConnection(proxy) : url.openConnection();
            // Non-HTTP URLs (e.g. file:) cannot take the HTTP settings below;
            // they are served by the plain fallback read after this block.
            if (connection instanceof HttpURLConnection) {
                httpConnection = (HttpURLConnection) connection;
                httpConnection.addRequestProperty("User-Agent", "Mozilla/4.0");
                httpConnection.addRequestProperty("Accept", "www/source; text/html; image/gif; */*");
                httpConnection.addRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
                // Per-connection setting; the static setFollowRedirects would
                // change the default for every connection in the JVM.
                httpConnection.setInstanceFollowRedirects(true);
                if (proxy == null && strEnCoding != null) {
                    httpConnection.addRequestProperty("Accept-Charset", strEnCoding);
                }
                if (cookies != null) {
                    httpConnection.setRequestProperty("Cookie", cookies);
                }
                httpConnection.setConnectTimeout(timeout);
                httpConnection.setReadTimeout(timeout);

                urlStream = httpConnection.getInputStream();
                br = new BufferedReader(newReader(new BufferedInputStream(urlStream), strEnCoding));
                StringBuilder page = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    page.append(line).append(LINE_SEPARATOR);
                }
                // Assigned only after a complete read so that a partial page
                // is never returned.
                html = page.toString();
            }
        } catch (OutOfMemoryError out) {
            out.printStackTrace();
            System.out.println(out.getClass() + "下载网页" + strUrl + "失败");
            // Retrying the download right after running out of memory would
            // most likely fail the same way, so give up here.
            return null;
        } catch (Exception e) {
            System.err.println(e);
            html = null;
        } finally {
            // Close the streams first, then release the connection.
            closeQuietly(br);
            closeQuietly(urlStream);
            if (httpConnection != null) {
                httpConnection.disconnect();
            }
        }
        if (html == null) {
            return readUrl(strUrl, strEnCoding, timeout);
        }
        return decodeUnicode(html);
    }

    /**
     * Reads the complete content behind a URL using the default connection
     * settings (no custom headers, no timeout).
     *
     * @param url      address to read; {@code null} or blank yields {@code null}
     * @param encoding charset name used to decode the content; {@code null}
     *                 selects {@code gb2312}
     * @return the decoded content, or {@code null} if the URL is malformed, the
     *         charset is unsupported or an I/O error occurs (the error is
     *         printed to {@code System.err})
     */
    public static String getURLContent(String url, String encoding) {
        if (url == null || "".equals(url.trim())) {
            return null;
        }
        return readUrl(url, encoding, 0);
    }

    /**
     * Resolves the URL a request ends up at after redirects have been followed.
     *
     * @param urlStr address to resolve
     * @return the final URL reported by the HTTP connection once the response
     *         has been received; {@code urlStr} itself if it is malformed, is
     *         not an HTTP URL, or the request fails
     */
    public static String getUrlTrue(String urlStr) {
        HttpURLConnection http = null;
        try {
            URLConnection connection = new URL(urlStr).openConnection();
            if (!(connection instanceof HttpURLConnection)) {
                // Only HTTP connections can be redirected.
                return urlStr;
            }
            http = (HttpURLConnection) connection;
            // Asking for the status code sends the request and follows any
            // redirects, after which getURL() reports the final location.
            http.getResponseCode();
            return http.getURL().toString();
        } catch (IOException e) {
            return urlStr;
        } finally {
            if (http != null) {
                http.disconnect();
            }
        }
    }

    /**
     * Decodes backslash escape sequences in a string: a backslash followed by
     * {@code u} and four hexadecimal digits becomes the character with that
     * code; backslash-t, backslash-r, backslash-n and backslash-f become tab,
     * carriage return, line feed and form feed; for any other escaped
     * character the backslash is dropped and the character kept.
     *
     * @param theString text to decode; may be {@code null}
     * @return the decoded text; the input unchanged if it contains a malformed
     *         or truncated escape (including a trailing backslash);
     *         {@code null} if the input is {@code null}
     */
    public static String decodeUnicode(String theString) {
        if (theString == null) {
            return null;
        }
        int len = theString.length();
        StringBuilder outBuffer = new StringBuilder(len);
        int pos = 0;
        while (pos < len) {
            char current = theString.charAt(pos++);
            if (current != '\\') {
                outBuffer.append(current);
                continue;
            }
            if (pos >= len) {
                // Dangling backslash at the end of the input.
                return theString;
            }
            current = theString.charAt(pos++);
            if (current != 'u') {
                outBuffer.append(unescape(current));
                continue;
            }
            if (pos + 4 > len) {
                // Fewer than four characters left for the hex digits.
                return theString;
            }
            int value = 0;
            for (int i = 0; i < 4; i++) {
                int digit = hexValue(theString.charAt(pos++));
                if (digit < 0) {
                    return theString;
                }
                value = (value << 4) + digit;
            }
            outBuffer.append((char) value);
        }
        return outBuffer.toString();
    }

    /**
     * Demo: opens a connection object for a fixed address and prints it.
     *
     * <p>The object returned by {@code openConnection()} for an http URL is an
     * {@link HttpURLConnection}; casting it gives access to the HTTP-specific
     * API.
     *
     * @throws IOException if the connection object cannot be created
     */
    public void getHtml1() throws IOException {
        URL url = new URL("http://www.baidu.com");
        URLConnection urlConnection = url.openConnection();
        HttpURLConnection httpUrlConnection = (HttpURLConnection) urlConnection;
        System.out.println(httpUrlConnection);
    }

    /**
     * Placeholder with no behaviour. The original sketch (kept below) posts a
     * URL-encoded form with Apache HttpClient, which this file does not import.
     *
     * @throws IOException never thrown by the current empty body
     */
    public void getHtml2() throws IOException {
        // List<NameValuePair> formparams = new ArrayList<NameValuePair>();
        // formparams.add(new BasicNameValuePair("param1", "value1"));
        // formparams.add(new BasicNameValuePair("param2", "value2"));
        // UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");
        // HttpPost httppost = new HttpPost("http://localhost/handler.do");
        // httppost.setEntity(entity);
    }

    /**
     * Demo entry point: downloads a fixed page with a 400 ms timeout and
     * prints the result ({@code null} when the download fails).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = DownloadUtil.getHtml("http://www.baidu.com", 400, "utf-8", null, null);
        System.out.println(html);
    }

    /**
     * Reads everything behind {@code url}, applying {@code timeoutMillis} as
     * connect and read timeout when it is positive. Returns {@code null} and
     * prints the error to {@code System.err} on any I/O failure.
     */
    private static String readUrl(String url, String encoding, int timeoutMillis) {
        InputStream in = null;
        Reader reader = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            if (timeoutMillis > 0) {
                connection.setConnectTimeout(timeoutMillis);
                connection.setReadTimeout(timeoutMillis);
            }
            in = new BufferedInputStream(connection.getInputStream());
            reader = new InputStreamReader(in, encoding != null ? encoding : DEFAULT_FALLBACK_ENCODING);
            StringBuilder content = new StringBuilder();
            char[] chunk = new char[4096];
            int read;
            while ((read = reader.read(chunk)) != -1) {
                content.append(chunk, 0, read);
            }
            return content.toString();
        } catch (IOException e) {
            // Covers malformed URLs and unsupported charsets as well; both
            // exception types are subclasses of IOException.
            System.err.println(e);
            return null;
        } finally {
            closeQuietly(reader);
            closeQuietly(in);
        }
    }

    /**
     * Wraps {@code in} in a reader for {@code encoding}, using the platform
     * default charset when the encoding is {@code null}, the literal string
     * {@code "null"}, or not supported.
     */
    private static Reader newReader(InputStream in, String encoding) {
        if (encoding != null && !"null".equals(encoding)) {
            try {
                return new InputStreamReader(in, encoding);
            } catch (UnsupportedEncodingException ignored) {
                // Unknown charset name: fall through to the platform default.
            }
        }
        return new InputStreamReader(in);
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails; the data has
            // already been read or the read has already failed.
        }
    }

    /** Returns the numeric value of an ASCII hexadecimal digit, or -1 if {@code c} is not one. */
    private static int hexValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return 10 + c - 'a';
        }
        if (c >= 'A' && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /** Maps the character following a backslash to the character it stands for. */
    private static char unescape(char c) {
        switch (c) {
            case 't':
                return '\t';
            case 'r':
                return '\r';
            case 'n':
                return '\n';
            case 'f':
                return '\f';
            default:
                return c;
        }
    }
}
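// A method for collecting data by wrapping HTTP client calls.
// (Footer from the source article page: "Latest recommended article published on 2021-07-07 15:33:42".)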