In past work I implemented a few simple web crawlers, but never wrote them up systematically. This post gives a systematic introduction to building a web crawler with Java's HttpClient.
For the theory behind web crawlers, along with design ideas and crawling strategies, see the Baidu Baike entry on "网络爬虫" (web crawler), which already covers them in detail. There is no need to repeat that here; this post focuses on how to implement one.
HTTP requests:
Before starting on the code, let's look at how to inspect HTTP request information in the browser; this is the first step in analyzing a site's resources. The browser's right-click menu has an "Inspect Element" option (if you can't find it, F12 does the same). In Chrome it looks like this:
Clicking "Inspect Element" opens a panel like the following:
The Network tab is the one a crawler author should focus on. Open it and you can see every HTTP request made by the current page, as in the figure below:
Click an entry to see the details of that HTTP request, as shown below:
When a program impersonates a browser, the fields under Request Headers are what you mainly need to pay attention to; sites that require login also rely on these headers. The content under Response is what the server returned. This post only handles text; images, audio, video, and other media are not covered.
Response contains the content our crawler wants. If its formatting is hard to read there, enter the request's URL in the browser's address bar and use right-click --> View Page Source to inspect it.
By analyzing the strings in the page source, you can work out consistent extraction rules and pull out the text you need.
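To make those two steps concrete before the full class is shown, here is a minimal, self-contained sketch (not from the original post; the URL, User-Agent string, and regex are illustrative placeholders) that impersonates a browser by setting a Request Header and then applies a simple extraction rule to the returned source:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class HeaderDemo {
    public static void main(String[] args) throws Exception {
        HttpClient client = new HttpClient();
        // placeholder URL; substitute the page you analyzed in the Network tab
        GetMethod get = new GetMethod("http://www.example.com");
        // impersonate a browser with a Request Header copied from the browser
        get.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36");
        try {
            if (client.executeMethod(get) == HttpStatus.SC_OK) {
                String html = get.getResponseBodyAsString();
                // a simple extraction rule: pull the <title> text out of the source
                Matcher m = Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE).matcher(html);
                if (m.find()) {
                    System.out.println(m.group(1));
                }
            }
        } finally {
            get.releaseConnection();
        }
    }
}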
Code implementation:
The CrawlBase class, the base class that issues the simulated HTTP requests:
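Since CrawlBase is abstract, a concrete crawler extends it and calls readPageByGet or readPageByPost. Before the full listing, a hedged usage sketch: the subclass name and URL are made up, and it assumes a getPageSourceCode() accessor defined further down in the class (the field itself is private).

public class MyCrawl extends CrawlBase {
    public static void main(String[] args) throws Exception {
        MyCrawl crawl = new MyCrawl();
        // fetch via GET; "utf-8" is only the fallback charset for detection
        if (crawl.readPageByGet("http://www.example.com", "utf-8", null)) {
            // getPageSourceCode() is assumed to exist later in CrawlBase
            System.out.println(crawl.getPageSourceCode());
        }
    }
}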
/**
*@Description: base class for fetching web page content
*/
package com.lulei.crawl;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import com.lulei.util.CharsetUtil;
public abstract class CrawlBase {
private static Logger log = Logger.getLogger(CrawlBase.class);
// source code of the fetched page
private String pageSourceCode = "";
// response headers returned by the server
private Header[] responseHeaders = null;
// connection timeout in milliseconds
private static int connectTimeout = 3500;
// socket read timeout in milliseconds
private static int readTimeout = 3500;
// default maximum number of fetch attempts
private static int maxConnectTimes = 3;
// default page character encoding
private static String charsetName = "iso-8859-1";
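// a single shared HttpClient (Commons HttpClient 3.x) reused for every request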
private static HttpClient httpClient = new HttpClient();
static {
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
}
/**
* @param urlStr
* @param charsetName
* @param method
* @param params
* @return
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page using the HTTP method named in the method argument
*/
public boolean readPage(String urlStr, String charsetName, String method, HashMap params) throws HttpException, IOException {
if ("post".equals(method) || "POST".equals(method)) {
return readPageByPost(urlStr, charsetName, params);
} else {
return readPageByGet(urlStr, charsetName, params);
}
}
/**
* @param urlStr
* @param charsetName
* @param params
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page via GET
*/
public boolean readPageByGet(String urlStr, String charsetName, HashMap params) throws HttpException, IOException {
GetMethod getMethod = createGetMethod(urlStr, params);
return readPage(getMethod, charsetName, urlStr);
}
/**
* @param urlStr
* @param charsetName
* @param params
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page via POST
*/
public boolean readPageByPost(String urlStr, String charsetName, HashMap params) throws HttpException, IOException{
PostMethod postMethod = createPostMethod(urlStr, params);
return readPage(postMethod, charsetName, urlStr);
}
/**
* @param method
* @param defaultCharset
* @param urlStr
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: read the page body and response headers
*/
private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
int n = maxConnectTimes;
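// retry until the fetch succeeds or the attempt budget is exhausted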
while (n > 0) {
try {
int statusCode = httpClient.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + statusCode);
n--;
} else {
// capture the response headers
responseHeaders = method.getResponseHeaders();
// read the page body as a stream
InputStream inputStream = method.getResponseBodyAsStream();
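// decode with iso-8859-1 first: it maps bytes to chars one-to-one,
// so the original bytes can be recovered once the real charset is known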
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
StringBuffer stringBuffer = new StringBuffer();
String lineString = null;
while ((lineString = bufferedReader.readLine()) != null){
stringBuffer.append(lineString);
stringBuffer.append("\n");
}
pageSourceCode = stringBuffer.toString();
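// re-encode with the same single-byte charset to recover the raw bytes, then detect the page's real charset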
InputStream in = new ByteArrayInputStream(pageSourceCode.getBytes(charsetName));
String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
// this special-case check was added specifically for an IP geolocation lookup site
if ("Big5".equals(charset)) {
charset = "gbk";
}
if (!charsetName.equalsIgnoreCase(charset)) {
pageSourceCode = new String(pageSourceCode.getBytes(charsetName), charset);
}
return true;
}
} catch (Exception e) {
log.error(urlStr + " -- can't connect " + (maxConnectTimes - n + 1), e);
n--;
}
}
return false;
}
/**
* @param urlStr
* @param params
* @return GetMethod
* @Author: lulei
* @Description: build a GetMethod and apply params as request headers
*/
@SuppressWarnings("rawtypes")
private GetMethod createGetMethod(String urlStr, HashMap params){
GetMethod getMethod = new GetMethod(urlStr);
if (params == null){
return getMethod;
}
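// note: the entries in params are sent as HTTP request headers (e.g. User-Agent, Cookie), not as query-string parameters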
Iterator iter = params.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry entry = (Map.Entry) iter.next();
String key = (String) entry.getKey();
String val = (String) entry.getValue();
getMethod.setRequestHeader(key, val);
}
return getMethod;
}
/**
* @param urlStr
* @param params