HttpClient之简单爬取页面的实现

    HTTP 协议可能是现在 Internet 上使用得最多、最重要的协议了,越来越多的 Java 应用程序需要直接通过 HTTP 协议来访问网络资源。 Commons-httpclient项目就是专门设计来简化HTTP客户端与服务器进行各种通讯编程。HttpClient是一个代码级的Http客户端工具,可以使用它模拟浏览器向Http服务器发送请求。

    在学习HttpClient相关知识前,我们先给出一个简单爬取页面的实例,直观地了解一下HttpClient的页面爬取过程。

    HttpClient官方网站:http://hc.apache.org/index.html

    使用HttpClient需要引入以下Jar包(commons-httpclient,以及它所依赖的commons-logging、commons-codec):

    

    注:由于代码中已有较清晰的注释,下文不再另作具体说明。

一、首先封装一个URL简易类,用于参数的基本设置

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.HttpURL;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;

/**
 * Small mutable holder for the pieces of an HTTP URL.
 *
 * <p>Running example used in the comments below:
 * {@code http://www.baidu.com:80/OS/java/s?ie=utf8&oe=utf8&wd=java&tn=98010089_dg&ch=1}
 *
 * <p>The URL handed to a constructor is parsed with {@link HttpURL}; its path
 * becomes {@code basePath}. An optional {@code relativePath} is appended to
 * that base whenever a full address is built.
 */
public class SimpleHttpUrl {
	private String scheme = "http";     // protocol scheme (e.g. http)
	private String host = null;         // server host (e.g. www.baidu.com)
	private int port;                   // port as reported by HttpURL.getPort()
	private String basePath = null;     // path parsed from the base URL (e.g. /OS/java)
	private String relativePath = null; // path appended to basePath (e.g. s)
	private String query = null;        // query string without '?' (e.g. ie=utf8&oe=utf8)

	private HttpURL httpUrl = null;     // last HttpURL built by getHttpUrl()

	/**
	 * Creates an instance from an absolute URL.
	 *
	 * @param absoluteUrl absolute URL, e.g.
	 *        {@code http://www.baidu.com:80/OS/java/s?ie=utf8&wd=java}
	 * @throws URIException if {@link HttpURL} rejects the URL
	 */
	public SimpleHttpUrl(String absoluteUrl) throws URIException {
		this(absoluteUrl, null);
	}

	/**
	 * Creates an instance from a base URL plus a path relative to it.
	 *
	 * @param baseUrl      base URL (e.g. {@code http://www.baidu.com:80}); when it
	 *                     carries no scheme, {@code http://} is prepended
	 * @param relativePath path appended to the base URL's path; may be {@code null}
	 * @throws URIException if {@link HttpURL} rejects the base URL
	 */
	public SimpleHttpUrl(String baseUrl, String relativePath) throws URIException {
		parseUrl(baseUrl);
		this.relativePath = relativePath;
	}

	/**
	 * Rebuilds the {@link HttpURL} from the current state when both a host and a
	 * non-empty base path are known; otherwise returns the previously built
	 * value (initially {@code null}).
	 */
	public HttpURL getHttpUrl() throws URIException {
		if (host != null && basePath != null && !basePath.equals("")) {
			httpUrl = new HttpURL(this.getAbsoluteUrl());
		}
		return httpUrl;
	}

	public String getHost() {
		return host;
	}

	public int getPort() {
		return port;
	}

	public void setRelativePath(String relativePath) {
		this.relativePath = relativePath;
	}

	public String getQuery() {
		return query;
	}

	public void setQuery(String query) {
		this.query = query;
	}

	/**
	 * Returns {@code scheme://host:port} followed by the base path
	 * (e.g. {@code http://www.baidu.com:80/OS/java}).
	 *
	 * @return the base URL, or {@code null} when no host is known
	 */
	public String getBaseUrl() {
		if (host == null || host.equals("")) {
			return null;
		}
		String root = scheme + "://" + host + ":" + port;
		// A missing base path must not leak into the URL as the text "null".
		return (basePath == null) ? root : root + basePath;
	}

	/**
	 * Returns the request path without scheme, host or query
	 * (e.g. {@code /OS/java/s}).
	 *
	 * @return base path joined with the relative path; just the base path
	 *         (possibly {@code null}) when no relative path is set
	 */
	public String getPath() {
		if (relativePath == null) {
			return basePath;
		}
		return joinPath(basePath, relativePath);
	}

	/**
	 * Returns the request address without the query string
	 * (e.g. {@code http://www.baidu.com:80/OS/java/s}).
	 *
	 * @return the address, or {@code null} when no host is known
	 */
	public String getAllPath() {
		String baseUrl = getBaseUrl();
		if (baseUrl == null) {
			return null;
		}
		return joinPath(baseUrl, relativePath);
	}

	/**
	 * Returns the full URL including the query string
	 * (e.g. {@code http://www.baidu.com:80/OS/java/s?ie=utf8&wd=java}).
	 *
	 * @return the absolute URL, or {@code null} when no host is known
	 */
	public String getAbsoluteUrl() {
		String allPath = getAllPath();
		if (allPath == null || allPath.equals("")) {
			return null;
		}
		if (query != null && !query.equals("")) {
			return allPath + "?" + query;
		}
		return allPath;
	}

	/**
	 * Replaces the query string with {@code key=value} pairs joined by
	 * {@code &}. Keys and values are written as given (no URL encoding).
	 *
	 * @param paraQueryMap request parameters; {@code null} is treated like an
	 *                     empty map and yields an empty query string
	 */
	public void setQueryFromParaMap(Map<String, String> paraQueryMap) {
		StringBuilder queryBuilder = new StringBuilder();
		if (paraQueryMap != null) {
			for (Map.Entry<String, String> entry : paraQueryMap.entrySet()) {
				if (queryBuilder.length() != 0) {
					queryBuilder.append('&');
				}
				queryBuilder.append(entry.getKey()).append('=').append(entry.getValue());
			}
		}
		query = queryBuilder.toString();
	}

	/**
	 * Splits the current query string into a parameter map.
	 *
	 * <p>Each {@code &}-separated item is split at its first {@code =}, so a
	 * value may itself contain {@code =}. An item without {@code =} (or with
	 * nothing after it) maps to the empty string; empty items are skipped.
	 *
	 * @return a new map of parameter names to values; never {@code null}
	 */
	public Map<String, String> getQueryMapFromQuery() {
		Map<String, String> paraQueryMap = new HashMap<String, String>();
		if (query == null || query.equals("")) {
			return paraQueryMap;
		}
		for (String pair : query.split("&")) {
			if (pair.length() == 0) {
				continue;
			}
			int eq = pair.indexOf('=');
			if (eq < 0) {
				paraQueryMap.put(pair, "");
			} else {
				paraQueryMap.put(pair.substring(0, eq), pair.substring(eq + 1));
			}
		}
		return paraQueryMap;
	}

	/**
	 * Parses {@code url} with {@link HttpURL} and copies scheme, host, port,
	 * path and query into this object.
	 */
	private void parseUrl(String url) throws URIException {
		if (url == null) {
			throw new IllegalArgumentException("url must not be null");
		}
		// Only prepend the default scheme when the text really has none; a plain
		// startsWith("http") test would misjudge hosts such as "httpbin.org".
		if (!hasScheme(url)) {
			url = scheme + "://" + url;
		}
		HttpURL urlTemp = new HttpURL(url);
		scheme = urlTemp.getScheme();
		host = urlTemp.getHost();
		port = urlTemp.getPort();
		basePath = urlTemp.getPath();
		query = urlTemp.getQuery();
	}

	/** Tells whether {@code url} begins with a {@code scheme://} prefix. */
	private static boolean hasScheme(String url) {
		int sep = url.indexOf("://");
		if (sep <= 0 || !Character.isLetter(url.charAt(0))) {
			return false;
		}
		for (int i = 1; i < sep; i++) {
			char c = url.charAt(i);
			if (!Character.isLetterOrDigit(c) && c != '+' && c != '-' && c != '.') {
				return false;
			}
		}
		return true;
	}

	/**
	 * Joins two path fragments with exactly one {@code /} between them.
	 * A {@code null} base counts as empty; a {@code null} relative part
	 * returns the base unchanged.
	 */
	private static String joinPath(String base, String relative) {
		String left = (base == null) ? "" : base;
		if (relative == null) {
			return left;
		}
		boolean leftSlash = left.endsWith("/");
		boolean rightSlash = relative.startsWith("/");
		if (leftSlash && rightSlash) {
			return left + relative.substring(1);
		}
		if (leftSlash || rightSlash) {
			return left + relative;
		}
		return left + "/" + relative;
	}
}

 

 

二、对HttpClient类进行简易的封装,实现直观便捷的调用。其中请求方式暂时实现了Get、Post。

    在爬取页面时,常常涉及到页面的重定向问题,可以通过HttpMethod.getStatusCode()方法判断返回值是否为下表中的某个值来判断是否需要跳转。如果已经确认需要进行页面跳转了,那么可以通过读取HTTP头中的location属性来获取新的地址。例如下面实例中的getHtml函数就很好地演示了如何处理页面的重定向问题。

| 状态码 | 对应HttpServletResponse的常量 | 详细描述 |
| --- | --- | --- |
| 301 | SC_MOVED_PERMANENTLY | 页面已经永久移到另外一个新地址 |
| 302 | SC_MOVED_TEMPORARILY | 页面暂时移动到另外一个新的地址 |
| 303 | SC_SEE_OTHER | 客户端请求的地址必须通过另外的URL来访问 |
| 307 | SC_TEMPORARY_REDIRECT | 同SC_MOVED_TEMPORARILY |

 

 

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

import com.haiwi.util.HttpClientUtil;

/**
 * Thin convenience wrapper around Commons HttpClient that downloads the page
 * behind a {@link SimpleHttpUrl} with GET or POST and follows one level of
 * HTTP redirects (301, 302, 303, 307).
 */
public class SimpleHttpClient {

	// Charset used for the POST request body.
	private String charset = "UTF-8";

	// Method used by the most recent getHtml() call; released by close().
	private HttpMethod httpMethod = null;
	// Underlying Commons HttpClient instance.
	private HttpClient httpClient = new HttpClient();
	// URL this client was created for.
	private SimpleHttpUrl simpleHttpUrl = null;

	public SimpleHttpClient(SimpleHttpUrl simpleHttpUrl) {
		this.simpleHttpUrl = simpleHttpUrl;
	}

	public String getCharset() {
		return charset;
	}

	public void setCharset(String charset) {
		this.charset = charset;
	}

	public SimpleHttpUrl getSimpleHttpUrl() {
		return simpleHttpUrl;
	}

	public void setHttpMethod(HttpMethod httpMethod) {
		this.httpMethod = httpMethod;
	}

	public HttpClient getHttpClient() {
		return httpClient;
	}

	public HttpMethod getHttpMethod() {
		return httpMethod;
	}

	/** Releases the connection held by the current method, if there is one. */
	public void close() {
		if (httpMethod != null) {
			httpMethod.releaseConnection();
		}
	}

	/**
	 * Downloads a single page without any redirect handling.
	 *
	 * @param httpClient    client used to execute the request
	 * @param simpleHttpUrl address and query parameters of the page
	 * @param httpMethodWay GET or POST
	 * @return the response body, or an empty string when the response has none
	 */
	public String loadHtml(HttpClient httpClient, SimpleHttpUrl simpleHttpUrl, EnumData.HttpMethod httpMethodWay) throws HttpException, IOException {
		HttpMethod method = getHttpMethod(simpleHttpUrl, httpMethodWay);
		try {
			httpClient.executeMethod(method);
			String body = method.getResponseBodyAsString();
			return (body == null) ? "" : body;
		} finally {
			// Release the connection even when the request fails.
			method.releaseConnection();
		}
	}

	/**
	 * Downloads the page and returns only its HTML.
	 *
	 * @return one of the values produced by {@link #getHtml}, or {@code null}
	 *         when that map is empty
	 */
	public String getOneHtml(EnumData.HttpMethod httpMethodWay, Map<String, String> paraMap) throws HttpException, IOException, CloneNotSupportedException {
		String pageHtml = null;
		for (String html : getHtml(httpMethodWay, paraMap).values()) {
			pageHtml = html;
		}
		return pageHtml;
	}

	/**
	 * Downloads the page of this client's URL, following a redirect response.
	 *
	 * <p>On status 200 the map holds the response body under the request
	 * address. On a redirect status each {@code Location} header is fetched
	 * with {@link #loadHtml}; the first result is stored under the request
	 * address, and any further {@code Location} targets are stored under their
	 * own address so they do not overwrite each other. Any other status yields
	 * an empty map.
	 *
	 * @param httpMethodWay GET or POST
	 * @param paraMap       parameters to use as the query of a redirect target;
	 *                      when {@code null} the target keeps its own query
	 * @return map from page address (without query string) to HTML
	 */
	public Map<String, String> getHtml(EnumData.HttpMethod httpMethodWay, Map<String, String> paraMap) throws HttpException, IOException, CloneNotSupportedException {
		Map<String, String> htmlMap = new HashMap<String, String>();
		httpMethod = getHttpMethod(simpleHttpUrl, httpMethodWay);

		int statusCode;
		String body = null;
		Header[] locations;
		try {
			httpClient.executeMethod(httpMethod);
			statusCode = httpMethod.getStatusCode();
			if (statusCode == HttpStatus.SC_OK) {
				body = httpMethod.getResponseBodyAsString();
			}
			locations = httpMethod.getResponseHeaders("location");
		} finally {
			// Release the connection even when the request fails.
			this.close();
		}

		String requestKey = this.getSimpleHttpUrl().getAllPath();
		if (statusCode == HttpStatus.SC_OK) {
			htmlMap.put(requestKey, (body == null) ? "" : body);
		} else if (isRedirect(statusCode)) {
			for (Header location : locations) {
				String newUrl = location.getValue();
				if (newUrl == null || newUrl.equals("")) {
					newUrl = "/";
				}
				SimpleHttpUrl target;
				if (newUrl.startsWith("http")) {
					target = new SimpleHttpUrl(newUrl);
				} else {
					target = new SimpleHttpUrl(simpleHttpUrl.getBaseUrl(), newUrl);
				}
				// Without caller-supplied parameters the redirect target keeps
				// whatever query string it came with.
				if (paraMap != null) {
					target.setQueryFromParaMap(paraMap);
				}
				String key = htmlMap.containsKey(requestKey) ? target.getAllPath() : requestKey;
				htmlMap.put(key, loadHtml(this.httpClient, target, httpMethodWay));
			}
		}
		return htmlMap;
	}

	/** Tells whether the status code is one of the redirects handled here. */
	private static boolean isRedirect(int statusCode) {
		return statusCode == HttpStatus.SC_MOVED_TEMPORARILY
				|| statusCode == HttpStatus.SC_MOVED_PERMANENTLY
				|| statusCode == HttpStatus.SC_SEE_OTHER
				|| statusCode == HttpStatus.SC_TEMPORARY_REDIRECT;
	}

	/**
	 * Builds a GET request.
	 *
	 * @param allPath address without the query string, e.g.
	 *                {@code http://www.baidu.com:80/OS/java/s}
	 * @param paraMap query parameters; {@code null} means none
	 */
	public GetMethod getGetMethod(String allPath, Map<String, String> paraMap) throws URIException {
		if (paraMap == null) {
			paraMap = new HashMap<String, String>();
		}
		GetMethod get = new GetMethod();
		get.setURI(new URI(allPath));
		get.setQueryString(HttpClientUtil.getNameValuePairs(paraMap));
		return get;
	}

	/**
	 * Builds a POST request whose body carries the parameters as form fields.
	 *
	 * @param allPath address without the query string
	 * @param paraMap form parameters; {@code null} means none
	 */
	public PostMethod getPostMethod(String allPath, Map<String, String> paraMap) throws URIException {
		if (paraMap == null) {
			paraMap = new HashMap<String, String>();
		}
		PostMethod post = new PostMethod(allPath);
		post.setRequestBody(HttpClientUtil.getNameValuePairs(paraMap));
		post.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, charset);
		// The body consists of name/value pairs, i.e. form data; labelling it
		// text/html would stop servers from reading the parameters.
		post.addRequestHeader("Content-Type", "application/x-www-form-urlencoded;charset=" + charset);
		return post;
	}

	/**
	 * Factory for the request object matching the requested method.
	 *
	 * @param simpleHttpUrl supplies the request address and query parameters
	 * @param httpMethodWay GET or POST
	 * @return the request, or {@code null} for any other value
	 */
	public HttpMethod getHttpMethod(SimpleHttpUrl simpleHttpUrl, EnumData.HttpMethod httpMethodWay) throws URIException {
		HttpMethod method = null;
		if (httpMethodWay == EnumData.HttpMethod.HTTPMETHOD_GET) {
			method = getGetMethod(simpleHttpUrl.getAllPath(), simpleHttpUrl.getQueryMapFromQuery());
		} else if (httpMethodWay == EnumData.HttpMethod.HTTPMETHOD_POST) {
			method = getPostMethod(simpleHttpUrl.getAllPath(), simpleHttpUrl.getQueryMapFromQuery());
		}
		return method;
	}
}

 

三、工具类

/** Holder class for the enumerations shared by the HttpClient wrapper classes. */
public class EnumData {

	/** Request methods that {@code SimpleHttpClient} knows how to build. */
	public enum HttpMethod {
		/** Submit the request with a {@code GetMethod}. */
		HTTPMETHOD_GET,

		/** Submit the request with a {@code PostMethod}. */
		HTTPMETHOD_POST
	}
}

 

import java.util.Map;

import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/** Helper methods shared by the HttpClient wrapper classes. */
public class HttpClientUtil {
	/**
	 * Converts a parameter map into the {@link NameValuePair} array expected by
	 * the Commons HttpClient request methods.
	 *
	 * @param paraMap parameter names mapped to values; {@code null} is treated
	 *                like an empty map
	 * @return one pair per map entry, in the map's iteration order; never
	 *         {@code null}
	 */
	public static NameValuePair[] getNameValuePairs(Map<String,String> paraMap){
		if(paraMap==null){
			return new NameValuePair[0];
		}
		NameValuePair[] nameValues=new NameValuePair[paraMap.size()];
		int i=0;
		for(Map.Entry<String, String> entry : paraMap.entrySet()){
			nameValues[i++]=new NameValuePair(entry.getKey(),entry.getValue());
		}
		return nameValues;
	}
}


 四、下面我们就可以测试一下上面的封装类了。

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.NameValuePair;

import com.haiwi.httpclient.EnumData;
import com.haiwi.httpclient.SimpleHttpClient;
import com.haiwi.httpclient.SimpleHttpUrl;

/** Manual demo that fetches a page through the wrapper classes and prints it. */
public class Test {
	public static void main(String[] args) throws HttpException, IOException, CloneNotSupportedException {
		System.out.println("Start!!!!");

		// Request using POST.
		Map<String,String> queryParams=new HashMap<String, String>();
		queryParams.put("tn","98010089_dg");
		SimpleHttpUrl targetUrl = new SimpleHttpUrl("http://www.baidu.com","");
		targetUrl.setQueryFromParaMap(queryParams);

		SimpleHttpClient client=new SimpleHttpClient(targetUrl);
		client.setCharset("gbk");
		printPages(client.getHtml(EnumData.HttpMethod.HTTPMETHOD_POST, null));

		// Request using GET (alternative to the POST variant above):
//		String url="http://www.baidu.com/?tn=98010089_dg";
//		SimpleHttpUrl targetUrl = new SimpleHttpUrl(url);
//		SimpleHttpClient client=new SimpleHttpClient(targetUrl);
//		printPages(client.getHtml(EnumData.HttpMethod.HTTPMETHOD_GET, null));

		System.out.println("End!!!!");
	}

	/** Prints each page address and its HTML, followed by a blank line. */
	private static void printPages(Map<String,String> pages) {
		for(String address : pages.keySet()){
			System.out.println(address);
			System.out.println(pages.get(address));
			System.out.println();
		}
	}
}


 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
http工具类:package com.tpl.util; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; /** * */ public class HttpClientUtil { public static void main(String arg[]) throws Exception { String url = "http://xxx/project/getxxx.action"; JSONObject params= new JSONObject(); List res=new ArrayList(); JSONObject params1 = new JSONObject(); // params1.put("code", "200"); // params1.put("phone", "13240186028"); res.add(params1); params.put("result", res); String ret = doPost(url, params).toString(); System.out.println(ret); } /** httpClient的get请求方式2 * @return * @throws Exception */ public static String doGet(String url, String charset) throws Exception { /* * 使用 GetMethod 来访问一个 URL 对应的网页,实现步骤: 1:生成一个 HttpClinet 对象并设置相应的参数。 * 2:生成一个 GetMethod 对象并设置响应的参数。 3:用 HttpClinet 生成的对象来执行 GetMethod 生成的Get * 方法。 4:处理响应状态码。 5:若响应正常,处理 HTTP 响应内容。 6:释放连接。 */ /* 1 生成 HttpClinet 对象并设置参数 */ HttpClient httpClient = new HttpClient(); // 设置 Http 连接超时为5秒 httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000); /* 2 生成 GetMethod 对象并设置参数 */ GetMethod getMethod = new GetMethod(url); // 设置 get 请求超时为 5 秒
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值