Web Crawler Summary
Preparation
- JDK 1.7
- Eclipse (or STS)
- httpclient 4.3 or later
- jsoup (HTML parser)
- Chrome (or any browser with developer tools)
Request Analysis
Analyze the requests that the target site issues.
The main goal is to capture the request headers, the cookies, and the URL of each request.
For requests whose URL cannot be read directly, such as form submissions or AJAX calls, inspect the page source to find the URL the data is submitted to.
The parameters of a POST request can be inspected with a packet-capture tool such as Fiddler.
Writing the Crawler Code
Common crawler methods and configuration
First, define the configuration constants:
//request timeout in milliseconds
private static final int TIMEOUT = 20000;
//maximum total connections in the pool
private static final int MAX_TOTAL = 200;
//default maximum connections per route
private static final int MAX_PERROUTE = 2;
//CookieStore that holds the cookies returned in responses
private static CookieStore cookieStore = new BasicCookieStore();
//request configuration shared by every request
private static RequestConfig config = RequestConfig.custom()
.setSocketTimeout(TIMEOUT)
.setConnectTimeout(TIMEOUT).setConnectionRequestTimeout(TIMEOUT)
.setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY)
.build();
Next, a method that builds the httpclient object:
/**
 * Build an httpclient object backed by a pooled connection manager.
 * @param tryTimes number of retry attempts
 * @return CloseableHttpClient
 */
public static CloseableHttpClient getHttpClient(int tryTimes) {
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
StandardHttpRequestRetryHandler retryHandler = new StandardHttpRequestRetryHandler(tryTimes, false);
cm.setMaxTotal(MAX_TOTAL);
cm.setDefaultMaxPerRoute(MAX_PERROUTE);
CloseableHttpClient httpClient = HttpClients
.custom()
.setDefaultCookieStore(cookieStore)
.setRetryHandler(retryHandler)
.setConnectionManager(cm)
.build();
return httpClient;
}
Set the request headers; header fields that never change can be kept in a Map.
Retrieve the cookies from cookieStore and append them to the request headers:
List<Cookie> cookies = cookieStore.getCookies();
// prepend any fixed cookie strings here if the site requires them
StringBuilder cookielist = new StringBuilder();
for (Cookie cookie : cookies) {
    cookielist.append(cookie.getName()).append("=").append(cookie.getValue()).append("; ");
}
// strip the trailing "; " before setting the header
headers.put("Cookie", cookielist.substring(0, cookielist.length() - 2));
/**
 * Execute an HTTP GET request and return the response object.
 * @param httpClient the client that executes the request
 * @param url the request URL
 * @param headers the request headers
 * @return CloseableHttpResponse
 */
public static CloseableHttpResponse doHttpGet(CloseableHttpClient httpClient,
String url, HashMap<String, String> headers) {
CloseableHttpResponse response = null;
HttpGet httpGet = new HttpGet(url);
httpGet.setConfig(config);
for (String key : headers.keySet()) {
httpGet.setHeader(key, headers.get(key));
}
try {
response = httpClient.execute(httpGet);
} catch (IOException e) {
e.printStackTrace();
}
return response;
}
For a POST request, the form parameters must be set as well:
List<NameValuePair> formparams = new ArrayList<NameValuePair>();
formparams.add(new BasicNameValuePair("key", "value"));
UrlEncodedFormEntity uentity = new UrlEncodedFormEntity(formparams,
Consts.UTF_8);
httpPost.setEntity(uentity);
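The flow below also calls a doHttpPost method that is not listed in these notes; a minimal sketch mirroring doHttpGet, assuming the form parameters are passed in as a List<NameValuePair>:
public static CloseableHttpResponse doHttpPost(CloseableHttpClient httpClient,
        String url, HashMap<String, String> headers, List<NameValuePair> formparams) {
    CloseableHttpResponse response = null;
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(config);
    for (String key : headers.keySet()) {
        httpPost.setHeader(key, headers.get(key));
    }
    // encode the form parameters as the request body
    httpPost.setEntity(new UrlEncodedFormEntity(formparams, Consts.UTF_8));
    try {
        response = httpClient.execute(httpPost);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return response;
}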
Response handling:
/**
 * Handle the response according to its status code.
 * @param response the response to check and close
 */
public static void responseProcess(CloseableHttpResponse response) {
    if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
        log.info("Successful response! " + response.getStatusLine());
    } else {
        log.info(response.getStatusLine());
    }
    try {
        response.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Extracting the response entity:
HttpEntity entity = response.getEntity();
if (entity != null) {
String responseBody = EntityUtils.toString(entity, "utf-8");
EntityUtils.consume(entity);
response.close();
}
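The firstResponseOut/secondResponseOut/thirdResponseOut helpers used in the flow below are not listed in these notes either. A sketch of thirdResponseOut, assuming it combines the entity extraction above with cleanup and returns the body:
public static String thirdResponseOut(CloseableHttpResponse response) {
    String responseBody = null;
    try {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            // read the body before the response is closed
            responseBody = EntityUtils.toString(entity, "utf-8");
            EntityUtils.consume(entity);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return responseBody;
}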
Crawl Flow Analysis
/**
 * Execute the crawl: three requests with three response-handling steps;
 * returns the body of the third response.
 * @return the response body of the third request
 */
public static String searchFlightInfo(int tryTimes, String org,
String dst, String orgCode, String dstCode, String oDate,
String dDate){
CloseableHttpClient httpClient = getHttpClient(tryTimes);
HashMap<String, String> headers = getHeaders();
CloseableHttpResponse response = doHttpGet(httpClient, INDEX_URL, headers);
firstResponseOut(response);
headers = getCookieHeaders();
response = doHttpPost(httpClient, SEARCH_URL, headers, getFormParams(org, dst, orgCode, dstCode, oDate, dDate));
secondResponseOut(response);
response = doHttpGet(httpClient, RESULT_URL, headers);
String responseBody = thirdResponseOut(response);
return responseBody;
}
HTML Page Parsing
Document document = Jsoup.parse(html);
// the selectors below are specific to the result page's structure
Element route = document.select("div.route-resume").first();
Elements cities = route.select("span.city");
String orgCity = cities.get(0).text();
String dstCity = cities.get(1).text();
After the response page has been parsed, wrap the extracted data in an entity class, as sketched below.
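The entity class itself is not shown in these notes; a minimal sketch, assuming a hypothetical FlightInfo class that holds the two parsed cities:
public class FlightInfo {
    private String orgCity;  // departure city parsed from the page
    private String dstCity;  // destination city parsed from the page

    public String getOrgCity() { return orgCity; }
    public void setOrgCity(String orgCity) { this.orgCity = orgCity; }
    public String getDstCity() { return dstCity; }
    public void setDstCity(String dstCity) { this.dstCity = dstCity; }
}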
Finally, write a test method to drive the whole flow.
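A minimal sketch of such a test, assuming the URL constants are already configured and the search arguments below are placeholders to be replaced with real values:
public static void main(String[] args) {
    // placeholder cities, codes, and dates; replace with real search values
    String html = searchFlightInfo(3, "Beijing", "Shanghai", "PEK", "SHA",
            "2015-01-01", "2015-01-05");
    System.out.println(html);
}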