基于 HttpURLConnection 封装的网页数据采集(下载网页源代码)工具方法。

package com.gesoft.html;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;


/**
 * Static helpers for downloading page source over HTTP and for post-processing
 * the downloaded text.
 *
 * <p>All download helpers are best-effort: they report failure by returning
 * {@code null} (or the input URL, for {@link #getUrlTrue(String)}) instead of
 * throwing.
 */
public class DownloadUtil {

	/** Encoding used by {@link #getURLContent(String, String)} when the caller passes none. */
	private static final String FALLBACK_ENCODING = "gb2312";

	/** Line terminator appended after every line read by {@link #getHtml}. */
	private static final String LINE_END = "\r\n";

	/**
	 * Downloads the source of a web page.
	 *
	 * <p>The page is fetched with {@link HttpURLConnection}; every line of the
	 * response is terminated with CRLF and backslash-u escapes in the text are
	 * decoded with {@link #decodeUnicode(String)}. If that attempt fails for any
	 * reason (including a non-HTTP URL), the method falls back to
	 * {@link #getURLContent(String, String)}, whose result is returned as-is
	 * (no CRLF normalisation, no escape decoding).
	 *
	 * @param strUrl      URL to download; {@code null} or empty yields {@code null}
	 * @param timeout     connect and read timeout in milliseconds
	 * @param strEnCoding charset name used to decode the response; {@code null}
	 *                    or the literal string {@code "null"} selects the platform
	 *                    default, as does an unsupported charset name
	 * @param cookies     value for the {@code Cookie} request header, or {@code null}
	 * @param proxy       proxy to connect through, or {@code null} for a direct connection
	 * @return the page source, or {@code null} if it could not be downloaded or
	 *         the JVM ran out of memory while reading it
	 */
	public static String getHtml(String strUrl, int timeout, String strEnCoding, String cookies, Proxy proxy) {
		if (strUrl == null || strUrl.length() == 0) {
			return null;
		}
		String html = null;
		HttpURLConnection httpConnection = null;
		try {
			URL url = new URL(strUrl);
			// Open the connection exactly once, through the proxy when one is given.
			URLConnection connection = (proxy != null) ? url.openConnection(proxy) : url.openConnection();
			// A non-HTTP URL fails this cast and is handled by the fallback below.
			httpConnection = (HttpURLConnection) connection;

			httpConnection.addRequestProperty("User-Agent", "Mozilla/4.0");
			httpConnection.addRequestProperty("Accept", "www/source; text/html; image/gif; */*");
			httpConnection.addRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
			// Per-connection setting; the static setFollowRedirects would change the whole JVM.
			httpConnection.setInstanceFollowRedirects(true);
			if (proxy == null && strEnCoding != null) {
				httpConnection.addRequestProperty("Accept-Charset", strEnCoding);
			}
			if (cookies != null) {
				httpConnection.setRequestProperty("Cookie", cookies);
			}
			httpConnection.setConnectTimeout(timeout);
			httpConnection.setReadTimeout(timeout);

			html = readLines(httpConnection.getInputStream(), strEnCoding);
		} catch (OutOfMemoryError out) {
			out.printStackTrace();
			System.out.println(out.getClass() + "下载网页" + strUrl + "失败");
			// Retrying through the fallback would only exhaust memory again.
			return null;
		} catch (Exception e) {
			// Best-effort: report the cause and let the fallback downloader try.
			System.err.println(e);
			html = null;
		} finally {
			if (httpConnection != null) {
				httpConnection.disconnect();
			}
		}
		if (html == null) {
			return getURLContent(strUrl, strEnCoding);
		}
		return decodeUnicode(html);
	}

	/**
	 * Reads the whole stream line by line, terminating every line with CRLF.
	 * The stream is always closed before this method returns.
	 */
	private static String readLines(InputStream in, String encoding) throws IOException {
		StringBuilder text = new StringBuilder();
		try (BufferedReader reader = new BufferedReader(openReader(in, encoding))) {
			String line;
			while ((line = reader.readLine()) != null) {
				text.append(line).append(LINE_END);
			}
		}
		return text.toString();
	}

	/**
	 * Wraps the stream in a reader for the given charset, using the platform
	 * default when no usable charset name is supplied.
	 */
	private static Reader openReader(InputStream in, String encoding) {
		if (encoding == null || "null".equals(encoding)) {
			return new InputStreamReader(in);
		}
		try {
			return new InputStreamReader(in, encoding);
		} catch (UnsupportedEncodingException e) {
			return new InputStreamReader(in);
		}
	}

	/**
	 * Downloads the content behind a URL using {@link URL#openStream()}.
	 *
	 * <p>No timeouts are applied and the text is returned exactly as decoded.
	 *
	 * @param url      URL to read; {@code null} or blank yields {@code null}
	 * @param encoding charset name used to decode the content; {@code null}
	 *                 selects {@code gb2312}
	 * @return the decoded content, or {@code null} if the URL is malformed or an
	 *         I/O error (including an unsupported charset name) occurs
	 */
	public static String getURLContent(String url, String encoding) {
		if (url == null || "".equals(url.trim())) {
			return null;
		}
		String charsetName = (encoding != null) ? encoding : FALLBACK_ENCODING;
		StringBuilder content = new StringBuilder();
		try (InputStream in = new BufferedInputStream(new URL(url).openStream());
				Reader reader = new InputStreamReader(in, charsetName)) {
			char[] chunk = new char[4096];
			int count;
			while ((count = reader.read(chunk)) != -1) {
				content.append(chunk, 0, count);
			}
		} catch (IOException e) {
			// Also covers MalformedURLException, which is an IOException.
			System.err.println(e);
			return null;
		}
		return content.toString();
	}

	/**
	 * Resolves the URL that a request ends up at after redirects are followed.
	 *
	 * @param urlStr URL to resolve
	 * @return the connection's URL after the response has been received;
	 *         {@code urlStr} itself if it is malformed or the request fails;
	 *         an empty string if the URL connects but is not an HTTP(S) URL
	 */
	public static String getUrlTrue(String urlStr) {
		URL url;
		try {
			url = new URL(urlStr);
		} catch (MalformedURLException e) {
			return urlStr;
		}
		String resolved = "";
		HttpURLConnection http = null;
		try {
			URLConnection connection = url.openConnection();
			connection.connect();
			if (connection instanceof HttpURLConnection) {
				http = (HttpURLConnection) connection;
				// Forces the response to be read, which is when redirects are followed.
				http.getResponseCode();
				resolved = http.getURL().toString();
			}
		} catch (IOException e) {
			return urlStr;
		} finally {
			if (http != null) {
				http.disconnect();
			}
		}
		return resolved;
	}

	/**
	 * Decodes backslash escapes in a string: a backslash followed by
	 * {@code u} and four ASCII hex digits becomes the corresponding character,
	 * the escapes for tab, carriage return, newline and form feed
	 * ({@code t}, {@code r}, {@code n}, {@code f}) become those control
	 * characters, and any other escaped character is kept without its backslash.
	 *
	 * @param theString text to decode; may be {@code null}
	 * @return the decoded text; the input unchanged if it contains a malformed
	 *         or truncated escape; {@code null} if the input is {@code null}
	 */
	public static String decodeUnicode(String theString) {
		if (theString == null) {
			return null;
		}
		if (theString.indexOf('\\') < 0) {
			// Nothing to decode.
			return theString;
		}
		int len = theString.length();
		StringBuilder decoded = new StringBuilder(len);
		int pos = 0;
		while (pos < len) {
			char current = theString.charAt(pos++);
			if (current != '\\') {
				decoded.append(current);
				continue;
			}
			if (pos >= len) {
				// Dangling backslash at the end of the input.
				return theString;
			}
			char escape = theString.charAt(pos++);
			if (escape == 'u') {
				if (pos + 4 > len) {
					// Fewer than four characters left for the hex digits.
					return theString;
				}
				int value = 0;
				for (int i = 0; i < 4; i++) {
					int digit = hexValue(theString.charAt(pos++));
					if (digit < 0) {
						return theString;
					}
					value = (value << 4) + digit;
				}
				decoded.append((char) value);
			} else {
				decoded.append(unescape(escape));
			}
		}
		return decoded.toString();
	}

	/** Returns the value of an ASCII hex digit, or -1 if the character is not one. */
	private static int hexValue(char c) {
		if (c >= '0' && c <= '9') {
			return c - '0';
		}
		if (c >= 'a' && c <= 'f') {
			return 10 + c - 'a';
		}
		if (c >= 'A' && c <= 'F') {
			return 10 + c - 'A';
		}
		return -1;
	}

	/** Maps the character following a backslash to the character it stands for. */
	private static char unescape(char escape) {
		switch (escape) {
		case 't':
			return '\t';
		case 'r':
			return '\r';
		case 'n':
			return '\n';
		case 'f':
			return '\f';
		default:
			return escape;
		}
	}

	/**
	 * Demo: shows that the {@link URLConnection} opened for an http URL can be
	 * cast to {@link HttpURLConnection} to reach the HTTP-specific API, and
	 * prints the connection object.
	 *
	 * @throws IOException if the connection object cannot be created
	 */
	public void getHtml1() throws IOException {
		URL url = new URL("http://www.baidu.com");
		URLConnection urlConnection = url.openConnection();
		HttpURLConnection httpUrlConnection = (HttpURLConnection) urlConnection;
		System.out.println(httpUrlConnection);
	}

	/**
	 * Placeholder that currently does nothing. The original sketch for it was a
	 * form POST built with Apache HttpClient types (HttpPost,
	 * UrlEncodedFormEntity), which this file does not import.
	 *
	 * @throws IOException never thrown at present; kept for interface compatibility
	 */
	public void getHtml2() throws IOException {
		// Intentionally empty.
	}

	/**
	 * Manual smoke test: downloads a page and prints its source.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String html = DownloadUtil.getHtml("http://www.baidu.com", 400, "utf-8", null, null);
		System.out.println(html);
	}
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值