HTTP 协议可能是现在 Internet 上使用得最多、最重要的协议了,越来越多的 Java 应用程序需要直接通过 HTTP 协议来访问网络资源。Commons-HttpClient 项目就是专门为简化 HTTP 客户端与服务器之间的各种通讯编程而设计的。HttpClient 是一个代码级的 HTTP 客户端工具,可以使用它模拟浏览器向 HTTP 服务器发送请求。
在学习HttpClient相关知识前,我们先给出一个简单的页面爬取实例,直观地了解一下HttpClient的页面爬取过程。
HttpClient官方网站:http://hc.apache.org/index.html
使用HttpClient需要引入以下Jar包:commons-httpclient,以及它所依赖的 commons-logging 和 commons-codec。
注:由于代码中已有较清晰的注释,这里不再另作具体说明。
一、首先封装一个URL简易类,用于参数的基本设置
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.httpclient.HttpURL;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
/**
 * Simple mutable holder for the parts of an HTTP URL (scheme, host, port,
 * base path, relative path and query string).
 *
 * <p>Example URL used in the comments below:
 * {@code http://www.baidu.com:80/OS/java/s?ie=utf8&oe=utf8&wd=java&tn=98010089_dg&ch=1}
 *
 * <p>The full request path is {@code basePath + "/" + relativePath}. Exactly one
 * {@code '/'} is placed between the two parts, regardless of whether the base
 * path ends with a slash or the relative path starts with one.
 *
 * <p>Query strings are stored and returned verbatim; this class performs no
 * URL-encoding or decoding of parameter names or values.
 */
public class SimpleHttpUrl {

    /** Protocol scheme (for example {@code http}). */
    private String scheme = "http";

    /** Server host (for example {@code www.baidu.com}). */
    private String host = null;

    /** Server port (for example {@code 80}). */
    private int port;

    /** Path taken from the URL given to the constructor (for example {@code /OS/java}). */
    private String basePath = null;

    /** Path appended after {@link #basePath} (for example {@code s}); may be {@code null}. */
    private String relativePath = null;

    /** Query string without the leading {@code '?'} (for example {@code ie=utf8&oe=utf8}). */
    private String query = null;

    /** Most recent {@link HttpURL} built by {@link #getHttpUrl()}. */
    private HttpURL httpUrl = null;

    /**
     * Creates an instance from an absolute URL.
     *
     * @param absoluteUrl absolute URL, e.g.
     *        {@code http://www.baidu.com:80/OS/java/s?ie=utf8&wd=java}
     * @throws URIException if the URL cannot be parsed by {@link HttpURL}
     */
    public SimpleHttpUrl(String absoluteUrl) throws URIException {
        this(absoluteUrl, null);
    }

    /**
     * Creates an instance from a base URL and a path relative to it.
     *
     * @param baseUrl      base URL of the site, e.g. {@code http://www.baidu.com:80}
     * @param relativePath path appended to the base URL; may be {@code null}
     * @throws URIException if {@code baseUrl} cannot be parsed by {@link HttpURL}
     */
    public SimpleHttpUrl(String baseUrl, String relativePath) throws URIException {
        parseUrl(baseUrl);
        this.relativePath = relativePath;
    }

    /**
     * Builds an {@link HttpURL} from {@link #getAbsoluteUrl()} when both the host
     * and a non-empty base path are known; otherwise returns the previously
     * built instance (which may be {@code null}).
     *
     * @return the {@link HttpURL} for this URL, or {@code null} if none was built
     * @throws URIException if the absolute URL cannot be parsed
     */
    public HttpURL getHttpUrl() throws URIException {
        boolean hasBasePath = basePath != null && !basePath.equals("");
        if (host != null && hasBasePath) {
            httpUrl = new HttpURL(this.getAbsoluteUrl());
        }
        return httpUrl;
    }

    /** @return the server host, or {@code null} if not set */
    public String getHost() {
        return host;
    }

    /** @return the server port as reported by {@link HttpURL#getPort()} at parse time */
    public int getPort() {
        return port;
    }

    /** @param relativePath path appended after the base path; may be {@code null} */
    public void setRelativePath(String relativePath) {
        this.relativePath = relativePath;
    }

    /** @return the query string without the leading {@code '?'}; may be {@code null} or empty */
    public String getQuery() {
        return query;
    }

    /** @param query the query string without the leading {@code '?'} */
    public void setQuery(String query) {
        this.query = query;
    }

    /**
     * Returns scheme, host, port and base path as one string
     * (for example {@code http://www.baidu.com:80/OS/java}).
     *
     * @return the base URL, or {@code null} when no host is set
     */
    public String getBaseUrl() {
        if (host == null || host.equals("")) {
            return null;
        }
        StringBuilder url = new StringBuilder();
        url.append(scheme).append("://").append(host).append(':').append(port);
        // A missing base path must not show up as the literal text "null".
        if (basePath != null) {
            url.append(basePath);
        }
        return url.toString();
    }

    /**
     * Returns the request path without scheme, host, port or query
     * (for example {@code /OS/java/s}).
     *
     * @return the base path when no relative path is set, otherwise the base
     *         path and relative path joined by a single {@code '/'}
     */
    public String getPath() {
        if (relativePath == null) {
            return basePath;
        }
        return joinPath(basePath, relativePath);
    }

    /**
     * Returns the request address without the query string
     * (for example {@code http://www.baidu.com:80/OS/java/s}).
     *
     * @return the address, or {@code null} when no host is set
     */
    public String getAllPath() {
        String baseUrl = getBaseUrl();
        if (baseUrl == null) {
            return null;
        }
        if (relativePath == null) {
            return baseUrl;
        }
        return joinPath(baseUrl, relativePath);
    }

    /**
     * Returns the absolute URL including the query string (for example
     * {@code http://www.baidu.com:80/OS/java/s?ie=utf8&oe=utf8&wd=java}).
     *
     * @return the absolute URL, or {@code null} when no host is set
     */
    public String getAbsoluteUrl() {
        String allPath = getAllPath();
        if (allPath == null || allPath.equals("")) {
            return null;
        }
        if (query == null || query.equals("")) {
            return allPath;
        }
        return allPath + "?" + query;
    }

    /**
     * Replaces the query string with {@code key=value} pairs joined by
     * {@code '&'}, in the iteration order of the map. Keys and values are
     * written verbatim (no URL-encoding).
     *
     * @param paraQueryMap request parameters; {@code null} is treated like an
     *        empty map and results in an empty query string
     */
    public void setQueryFromParaMap(Map<String, String> paraQueryMap) {
        StringBuilder pairs = new StringBuilder();
        if (paraQueryMap != null) {
            for (Map.Entry<String, String> entry : paraQueryMap.entrySet()) {
                if (pairs.length() > 0) {
                    pairs.append('&');
                }
                pairs.append(entry.getKey()).append('=').append(entry.getValue());
            }
        }
        query = pairs.toString();
    }

    /**
     * Splits the query string into a parameter map.
     *
     * <p>Each {@code '&'}-separated segment is split at its first {@code '='},
     * so values may themselves contain {@code '='}. A segment without a value
     * ({@code "a="} or {@code "a"}) maps to the empty string; empty segments
     * are skipped.
     *
     * @return a new map of parameter names to values; empty when there is no query
     */
    public Map<String, String> getQueryMapFromQuery() {
        Map<String, String> paraQueryMap = new HashMap<String, String>();
        if (query == null || query.equals("")) {
            return paraQueryMap;
        }
        for (String pair : query.split("&")) {
            if (pair.length() == 0) {
                continue;
            }
            int separator = pair.indexOf('=');
            if (separator < 0) {
                paraQueryMap.put(pair, "");
            } else {
                paraQueryMap.put(pair.substring(0, separator), pair.substring(separator + 1));
            }
        }
        return paraQueryMap;
    }

    /**
     * Parses {@code url} with {@link HttpURL} and copies scheme, host, port,
     * path and query into this object. The scheme is prepended when the text
     * does not already begin with {@code http://} or {@code https://}.
     */
    private void parseUrl(String url) throws URIException {
        // Test for the full "scheme://" prefix: a bare host such as
        // "httpbin.org" also starts with "http" but still needs the scheme.
        if (!hasHttpScheme(url)) {
            url = scheme + "://" + url;
        }
        HttpURL parsed = new HttpURL(url);
        scheme = parsed.getScheme();
        host = parsed.getHost();
        port = parsed.getPort();
        basePath = parsed.getPath();
        query = parsed.getQuery();
    }

    /** Tells whether {@code url} starts with {@code http://} or {@code https://} (case-insensitive). */
    private static boolean hasHttpScheme(String url) {
        return url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8);
    }

    /** Joins two path fragments with exactly one {@code '/'} between them. */
    private static String joinPath(String base, String relative) {
        String left = (base == null) ? "" : base;
        int end = left.length();
        while (end > 0 && left.charAt(end - 1) == '/') {
            end--;
        }
        int start = 0;
        while (start < relative.length() && relative.charAt(start) == '/') {
            start++;
        }
        return left.substring(0, end) + "/" + relative.substring(start);
    }
}
二、对HttpClient类进行简易的封装,实现直观便捷的调用。其中请求方式暂时实现了Get、Post。
在爬取页面时,常常会遇到页面重定向的问题。可以通过HttpMethod.getStatusCode()方法取得返回的状态码,如果它是下表中的某个值,就说明需要跳转。确认需要进行页面跳转之后,可以通过读取HTTP头中的location属性来获取新的地址。实例中的getHtml函数很好地演示了如何处理页面的重定向问题。
| 状态码 | 对应HttpServletResponse的常量 | 详细描述 |
| --- | --- | --- |
| 301 | SC_MOVED_PERMANENTLY | 页面已经永久移到另外一个新地址 |
| 302 | SC_MOVED_TEMPORARILY | 页面暂时移动到另外一个新的地址 |
| 303 | SC_SEE_OTHER | 客户端请求的地址必须通过另外的URL来访问 |
| 307 | SC_TEMPORARY_REDIRECT | 同SC_MOVED_TEMPORARILY |
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import com.haiwi.util.HttpClientUtil;
/**
 * Thin convenience wrapper around Commons HttpClient that downloads the page
 * described by a {@link SimpleHttpUrl} using GET or POST and handles the
 * redirect status codes 301, 302, 303 and 307 by fetching the address(es)
 * named in the {@code location} response header.
 */
public class SimpleHttpClient {

    /** Character set used for the request data. */
    private String charset = "UTF-8";

    /** Method object of the most recent request issued by {@link #getHtml}. */
    private HttpMethod httpMethod = null;

    /** Underlying HttpClient instance. */
    private HttpClient httpClient = new HttpClient();

    /** URL this client requests. */
    private SimpleHttpUrl simpleHttpUrl = null;

    /**
     * @param simpleHttpUrl URL (path and query) that {@link #getHtml} requests
     */
    public SimpleHttpClient(SimpleHttpUrl simpleHttpUrl) {
        this.simpleHttpUrl = simpleHttpUrl;
    }

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public SimpleHttpUrl getSimpleHttpUrl() {
        return simpleHttpUrl;
    }

    public void setHttpMethod(HttpMethod httpMethod) {
        this.httpMethod = httpMethod;
    }

    public HttpClient getHttpClient() {
        return httpClient;
    }

    public HttpMethod getHttpMethod() {
        return httpMethod;
    }

    /** Releases the connection held by the current request method, if there is one. */
    public void close() {
        if (httpMethod != null) {
            httpMethod.releaseConnection();
        }
    }

    /**
     * Downloads one page and returns its body.
     *
     * @param httpClient    client used to execute the request
     * @param simpleHttpUrl URL to request
     * @param httpMethodWay request method (GET or POST)
     * @return the response body, or an empty string when the response has no body
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public String loadHtml(HttpClient httpClient, SimpleHttpUrl simpleHttpUrl, EnumData.HttpMethod httpMethodWay) throws HttpException, IOException {
        HttpMethod method = getHttpMethod(simpleHttpUrl, httpMethodWay);
        try {
            httpClient.executeMethod(method);
            String body = method.getResponseBodyAsString();
            return (body == null) ? "" : body;
        } finally {
            // Release the connection even when the request fails.
            if (method != null) {
                method.releaseConnection();
            }
        }
    }

    /**
     * Downloads a page that is not expected to redirect.
     *
     * @return a body from the map produced by {@link #getHtml} (the last one in
     *         the map's iteration order), or {@code null} when the map is empty
     */
    public String getOneHtml(EnumData.HttpMethod httpMethodWay, Map<String, String> paraMap) throws HttpException, IOException, CloneNotSupportedException {
        String pageHtml = null;
        for (String html : getHtml(httpMethodWay, paraMap).values()) {
            pageHtml = html;
        }
        return pageHtml;
    }

    /**
     * Downloads the configured page, following one level of redirect.
     *
     * <p>On status 200 the result has one entry for the configured URL. On a
     * redirect status (301, 302, 303, 307) every {@code location} header is
     * fetched with {@link #loadHtml} and stored under the redirect target's
     * address. Any other status yields an empty map.
     *
     * @param httpMethodWay request method; GET and POST are implemented
     * @param paraMap       parameters used as the query of redirect targets;
     *        when {@code null} or empty the redirect address keeps its own query.
     *        The initial request always uses the query of the configured URL.
     * @return map from page address (without query string) to page body
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public Map<String, String> getHtml(EnumData.HttpMethod httpMethodWay, Map<String, String> paraMap) throws HttpException, IOException, CloneNotSupportedException {
        Map<String, String> htmlMap = new HashMap<String, String>();
        httpMethod = getHttpMethod(simpleHttpUrl, httpMethodWay);

        int statusCode;
        String body = null;
        Header[] locations;
        try {
            httpClient.executeMethod(httpMethod);
            statusCode = httpMethod.getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                body = httpMethod.getResponseBodyAsString();
            }
            locations = httpMethod.getResponseHeaders("location");
        } finally {
            // Everything needed from the response has been copied out (or the
            // request failed); release the connection in either case.
            this.close();
        }

        if (statusCode == HttpStatus.SC_OK) {
            htmlMap.put(simpleHttpUrl.getAllPath(), (body == null) ? "" : body);
        } else if (isRedirect(statusCode)) {
            for (Header location : locations) {
                String newUrl = location.getValue();
                if (newUrl == null || newUrl.equals("")) {
                    newUrl = "/";
                }
                SimpleHttpUrl target;
                if (isAbsoluteHttpUrl(newUrl)) {
                    target = new SimpleHttpUrl(newUrl);
                } else {
                    // NOTE(review): a relative location is resolved against the
                    // base URL including its base path, as in the original code.
                    target = new SimpleHttpUrl(simpleHttpUrl.getBaseUrl(), newUrl);
                }
                if (paraMap != null && !paraMap.isEmpty()) {
                    target.setQueryFromParaMap(paraMap);
                }
                // Key by the redirect target so that several targets do not
                // overwrite each other.
                htmlMap.put(target.getAllPath(), loadHtml(httpClient, target, httpMethodWay));
            }
        }
        return htmlMap;
    }

    /**
     * Builds a GET request.
     *
     * @param allPath address without query string, e.g. for
     *        {@code http://www.baidu.com:80/OS/java/s?ie=utf8&wd=java} it is
     *        {@code http://www.baidu.com:80/OS/java/s}
     * @param paraMap query parameters; {@code null} is treated as no parameters
     * @throws URIException if {@code allPath} is not a valid URI
     */
    public GetMethod getGetMethod(String allPath, Map<String, String> paraMap) throws URIException {
        Map<String, String> params = (paraMap == null) ? new HashMap<String, String>() : paraMap;
        GetMethod get = new GetMethod();
        get.setURI(new URI(allPath));
        get.setQueryString(HttpClientUtil.getNameValuePairs(params));
        return get;
    }

    /**
     * Builds a POST request whose body carries the parameters as form fields.
     *
     * @param allPath address without query string
     * @param paraMap form parameters; {@code null} is treated as no parameters
     * @throws URIException declared for symmetry with {@link #getGetMethod}
     */
    public PostMethod getPostMethod(String allPath, Map<String, String> paraMap) throws URIException {
        Map<String, String> params = (paraMap == null) ? new HashMap<String, String>() : paraMap;
        PostMethod post = new PostMethod(allPath);
        post.setRequestBody(HttpClientUtil.getNameValuePairs(params));
        post.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, charset);
        // The body is URL-encoded form data, so it must be labelled as such;
        // "text/html" would make servers ignore the submitted fields.
        post.addRequestHeader("Content-Type", "application/x-www-form-urlencoded;charset=" + charset);
        return post;
    }

    /**
     * Factory for the request method object.
     *
     * @param simpleHttpUrl URL supplying the request path and query parameters
     * @param httpMethodWay request method to build
     * @return a {@link GetMethod} or {@link PostMethod}, or {@code null} when
     *         {@code httpMethodWay} is neither GET nor POST
     * @throws URIException if the request address is not a valid URI
     */
    public HttpMethod getHttpMethod(SimpleHttpUrl simpleHttpUrl, EnumData.HttpMethod httpMethodWay) throws URIException {
        if (httpMethodWay == EnumData.HttpMethod.HTTPMETHOD_GET) {
            return getGetMethod(simpleHttpUrl.getAllPath(), simpleHttpUrl.getQueryMapFromQuery());
        }
        if (httpMethodWay == EnumData.HttpMethod.HTTPMETHOD_POST) {
            return getPostMethod(simpleHttpUrl.getAllPath(), simpleHttpUrl.getQueryMapFromQuery());
        }
        return null;
    }

    /** Tells whether the status code is one of the handled redirect codes. */
    private static boolean isRedirect(int statusCode) {
        return statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                || statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                || statusCode == HttpStatus.SC_SEE_OTHER
                || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT;
    }

    /** Tells whether {@code url} starts with {@code http://} or {@code https://} (case-insensitive). */
    private static boolean isAbsoluteHttpUrl(String url) {
        return url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8);
    }
}
三、工具类
/**
 * Holder class for the enumerations used by the HTTP helper classes.
 */
public class EnumData {

    /**
     * Request methods that can be selected when downloading a page.
     */
    public enum HttpMethod {

        /** Submit the request with a GET method. */
        HTTPMETHOD_GET,

        /** Submit the request with a POST method. */
        HTTPMETHOD_POST
    }
}
import java.util.Map;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * Helper methods shared by the HttpClient wrapper classes.
 */
public class HttpClientUtil {

    /**
     * Converts a parameter map into an array of {@link NameValuePair}, one
     * pair per map entry, in the iteration order of the map.
     *
     * @param paraMap parameter names mapped to their values; {@code null} is
     *        treated as an empty map
     * @return a new array with one pair per entry; empty when there are no
     *         parameters, never {@code null}
     */
    public static NameValuePair[] getNameValuePairs(Map<String, String> paraMap) {
        if (paraMap == null) {
            return new NameValuePair[0];
        }
        NameValuePair[] nameValues = new NameValuePair[paraMap.size()];
        int index = 0;
        for (Map.Entry<String, String> entry : paraMap.entrySet()) {
            nameValues[index++] = new NameValuePair(entry.getKey(), entry.getValue());
        }
        return nameValues;
    }
}
四、下面我们就可以测试一下上面的封装类了。
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.NameValuePair;
import com.haiwi.httpclient.EnumData;
import com.haiwi.httpclient.SimpleHttpClient;
import com.haiwi.httpclient.SimpleHttpUrl;
/**
 * Small demo that downloads a page with {@link SimpleHttpClient} and prints
 * every address/body pair that comes back.
 */
public class Test {

    public static void main(String[] args) throws HttpException, IOException, CloneNotSupportedException {
        System.out.println("Start!!!!");

        // POST request: the parameters are attached to the URL as its query.
        Map<String, String> queryParams = new HashMap<String, String>();
        queryParams.put("tn", "98010089_dg");

        SimpleHttpUrl targetUrl = new SimpleHttpUrl("http://www.baidu.com", "");
        targetUrl.setQueryFromParaMap(queryParams);

        SimpleHttpClient client = new SimpleHttpClient(targetUrl);
        client.setCharset("gbk");
        printPages(client.getHtml(EnumData.HttpMethod.HTTPMETHOD_POST, null));

        // GET request variant, kept for reference:
        //   SimpleHttpUrl targetUrl = new SimpleHttpUrl("http://www.baidu.com/?tn=98010089_dg");
        //   SimpleHttpClient client = new SimpleHttpClient(targetUrl);
        //   printPages(client.getHtml(EnumData.HttpMethod.HTTPMETHOD_GET, null));

        System.out.println("End!!!!");
    }

    /** Prints each page address followed by its body and a blank line. */
    private static void printPages(Map<String, String> pages) {
        for (Map.Entry<String, String> page : pages.entrySet()) {
            System.out.println(page.getKey());
            System.out.println(page.getValue());
            System.out.println();
        }
    }
}