转载请注明出处:http://blog.csdn.net/xiaojimanman/article/details/40891791
基于HttpClient4.5实现网络爬虫请访问这里:http://blog.csdn.net/xiaojimanman/article/details/53178307
在以前的工作中,实现过简单的网络爬虫,没有系统地介绍过,这篇博客就系统地介绍一下如何使用java的HttpClient实现网络爬虫。
关于网络爬虫的一些理论知识、实现思想以及策略问题,可以参考百度百科“网络爬虫”,那里已经介绍的十分详细,这里也不再啰嗦,下面就主要介绍如何去实现。
http请求:
代码开始之前,还是首先介绍一下如何通过浏览器获取http请求信息,这一步是分析网站资源的第一步。在浏览器界面右键有“审查元素”这一功能(如果没找到,F12一样可以的),谷歌浏览器效果如下:
点击“审查元素”之后会出现如下界面:
其中的Network栏目是做爬虫应该重点关注的,打开会看到当前网页所有的http请求信息,如下图:
单击每个信息,可以看到http请求的详细信息,如下图所示:
通过程序伪装成浏览器请求的时候,就需要多关注Request Headers里面的信息,还有一些需要登录的网站也是需要关注这些的。Response里面的信息就是服务器返回的内容,这里只做对文本信息的处理,对图片、音频、视频等信息不做介绍。
Response里面就包含着我们爬虫想获取的信息内容。如果里面的格式不好看的话,可以在浏览器中输入该http请求的url地址,然后右键-->查看网页源代码的形式查看相关信息。通过分析网页源代码中的字符串,总结出统一的规则,提取相应的文本信息。
代码实现:
CrawlBase类,模拟http请求的基类
/**
*@Description: 获取网页信息基类
*/
package com.lulei.crawl;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import com.lulei.util.CharsetUtil;
public abstract class CrawlBase {
private static Logger log = Logger.getLogger(CrawlBase.class);
// Source code (body text) of the most recently fetched page
private String pageSourceCode = "";
// Response headers returned by the server for the last request
private Header[] responseHeaders = null;
// Connection timeout in milliseconds
private static int connectTimeout = 3500;
// Socket read (SO) timeout in milliseconds
private static int readTimeout = 3500;
// Default maximum number of request attempts before giving up
private static int maxConnectTimes = 3;
// Default character encoding assumed for pages
private static String charsetName = "iso-8859-1";
// Shared HttpClient instance used by all requests of this class
private static HttpClient httpClient = new HttpClient();
static {
// Apply the connect/read timeouts to the shared client once at class load
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
}
/**
 * Fetches a page using the given HTTP method name.
 *
 * @param urlStr      target URL
 * @param charsetName character encoding to decode the page with
 * @param method      HTTP method name, e.g. "GET" or "POST" (case-insensitive)
 * @param params      request parameters
 * @return whether the page was fetched successfully
 * @throws HttpException
 * @throws IOException
 * @Author: lulei
 * @Description: fetch the page using the requested method
 */
public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
	// equalsIgnoreCase also accepts mixed-case values such as "Post",
	// which the previous "post"/"POST" checks silently routed to GET.
	if ("post".equalsIgnoreCase(method)) {
		return readPageByPost(urlStr, charsetName, params);
	}
	// Anything that is not POST falls back to GET, as before.
	return readPageByGet(urlStr, charsetName, params);
}
/**
 * Fetches a page with an HTTP GET request.
 *
 * @param urlStr      target URL
 * @param charsetName character encoding to decode the page with
 * @param params      query parameters appended to the request
 * @return whether the page was fetched successfully
 * @throws HttpException
 * @throws IOException
 * @Author: lulei
 * @Description: fetch the page via GET
 */
public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
	// Build the GET method and delegate to the shared page reader.
	return readPage(createGetMethod(urlStr, params), charsetName, urlStr);
}
/**
 * Fetches a page with an HTTP POST request.
 *
 * @param urlStr      target URL
 * @param charsetName character encoding to decode the page with
 * @param params      form parameters sent in the request body
 * @return whether the page was fetched successfully
 * @throws HttpException
 * @throws IOException
 * @Author: lulei
 * @Description: fetch the page via POST
 */
public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
	// Build the POST method and delegate to the shared page reader.
	return readPage(createPostMethod(urlStr, params), charsetName, urlStr);
}
/**
* @param method
* @param defaultCharset
* @param urlStr
* @return 访问是否成功
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: 读取页面信息和头信息
*/
private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
int n = maxConnectTimes;
while (n > 0) {
try {
if (httpClient.executeMethod(method) != HttpStatus.SC_OK){
log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + httpClient.executeMethod(method));
n--;
} else {
//获取头信息
responseHeaders = method.getResponseHeaders();
//获取页面源代码
InputStream inputStream = method.getResponseBodyAsStream();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
StringBuffer stringBuffer = new StringBuffer();
String lineString = null;
while ((lineString = bufferedReader.readLine()) != null){
stringBuffer.append(lineString);
stringBuffer.append("\n");
}
pageSourceCode = stringBuffer.toString();
InputStream in =new ByteArrayInputStream(pageSourceCode.getB