现在大概的思路已经有了,剩下的就是实现问题。这里我推荐一个用Java爬取页面的好工具:[weblink url="https://github.com/code4craft/webmagic"]webmagic[/weblink]
webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 web爬虫是一种技术,webmagic致力于将这种技术的实现成本降低,但是出于对资源提供者的尊重,webmagic不会做反封锁的事情,包括:验证码破解、代理切换、自动登录等。
下面是利用这个工具爬取页面的代码:
package com.wbdb.action.baidu;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.http.HttpHost;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.MultiPagePipeline;
/**
 * Page model for Baidu "site:www.xxku.net" search result pages.
 *
 * <p>Each result page yields the Baidu redirect links it contains
 * ({@code http://www.baidu.com/link?url=...}). {@link #main(String[])} crawls the
 * result pages, resolves every redirect link, and prints the final URLs that
 * answer with HTTP 404.
 *
 * @author www.xxku.net<br>
 */
@TargetUrl(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8")
public class Search implements MultiPageModel {

    /** Baidu redirect links found on the page. */
    @ExtractBy(value = "href=\"(http://www\\.baidu\\.com/link\\?url=.*?)\"", type = ExtractBy.Type.Regex)
    private List<String> baiduUrl;

    // NOTE(review): this selects the current-page marker of the pager; if the value
    // differs from page to page, pages that belong together will not share a key —
    // confirm against the live markup.
    @ExtractBy(value = "//p[@id='page']/strong/span[@class='pc']", type = ExtractBy.Type.XPath)
    private String pageKey;

    /** Current page number as extracted from the page; may be {@code null}. */
    @ExtractBy(value = "<span class=\"current\">(\\d+)</span>", type = ExtractBy.Type.Regex)
    private String page;

    /** Links to the other result pages taken from the pager; may be {@code null}. */
    @ComboExtract(value = {
            @ExtractBy("//p[@id='page']/a"),
            @ExtractBy(value = "http://www.baidu.com/s\\?wd=site%3Awww.xxku.net&pn=\\d+&ie=utf-8", type = ExtractBy.Type.Regex) }, multi = true, notNull = false)
    private List<String> otherPage;

    @Override
    public String getPageKey() {
        return pageKey;
    }

    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /**
     * Returns the extracted page number, defaulting to {@code "1"} when none was extracted.
     */
    @Override
    public String getPage() {
        if (page == null) {
            return "1";
        }
        return page;
    }

    /**
     * Merges this page with another page of the same result set.
     *
     * <p>The returned model carries this page's key, page number and pager links,
     * and the redirect links of this page followed by those of the other page.
     * Neither input model is modified, so calling this repeatedly cannot
     * duplicate links in the receiver.
     *
     * @param multiPageModel the other page; must be a {@code Search} (or {@code null})
     * @return a new {@code Search} holding the links of both pages
     */
    @Override
    public MultiPageModel combine(MultiPageModel multiPageModel) {
        Search other = (Search) multiPageModel;

        Search merged = new Search();
        merged.pageKey = this.pageKey;
        merged.page = this.page;
        merged.otherPage = this.otherPage;
        merged.baiduUrl = new ArrayList<String>();
        // Either list may be null when the extractor matched nothing on that page.
        if (this.baiduUrl != null) {
            merged.baiduUrl.addAll(this.baiduUrl);
        }
        if (other != null && other.baiduUrl != null) {
            merged.baiduUrl.addAll(other.baiduUrl);
        }
        return merged;
    }

    public List<String> getBaiduUrl() {
        return baiduUrl;
    }

    public void setBaiduUrl(List<String> baiduUrl) {
        this.baiduUrl = baiduUrl;
    }

    @Override
    public String toString() {
        return "Search [baiduUrl=" + baiduUrl + ", pageKey=" + pageKey + ", page=" + page + ", otherPage=" + otherPage
                + "]";
    }

    /**
     * Crawls the Baidu result pages, resolves every collected redirect link and
     * prints each final URL that responds with HTTP 404, one per line.
     *
     * @param args unused
     * @throws IOException kept for signature compatibility; per-URL I/O failures
     *         are reported on standard error and do not stop the run
     */
    public static void main(String[] args) throws IOException {
        OOSpider spider = OOSpider.create(
                Site.me().addStartUrl("http://www.baidu.com/s?wd=site%3Awww.xxku.net&pn=0&ie=utf-8"), Search.class);
        spider.addPipeline(new MultiPagePipeline());
        spider.addPipeline(new SearchPipeline());
        spider.run();

        List<String> baiduUrlList = SearchPipeline.getBaiduUrlList();
        List<String> realUrl404 = new ArrayList<String>();

        // Collect the URLs that answer with 404.
        if (baiduUrlList != null) {
            for (String link : baiduUrlList) {
                try {
                    String url404 = getRealUrl(link);
                    if (url404 != null) {
                        realUrl404.add(url404);
                    }
                } catch (IOException e) {
                    // One unreachable host must not abort the whole check.
                    System.err.println("Failed to check " + link + ": " + e);
                }
            }
        }

        // Building the sitemap XML would be straightforward with dom4j; here the
        // links are simply printed so they can be fed to an online sitemap generator.
        for (String url : realUrl404) {
            System.out.println(url);
        }
    }

    /**
     * Requests the given link and, if the final response is HTTP 404, returns the
     * URL that was actually requested last (target host plus request URI).
     *
     * @param url the link to request
     * @return the final URL when the response status is 404; {@code null} when the
     *         status is anything else or the final request cannot be read from the
     *         execution context
     * @throws IOException if the request fails or a resource cannot be closed
     */
    private static String getRealUrl(String url) throws IOException {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            HttpContext localContext = new BasicHttpContext();
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url), localContext);
            try {
                if (response.getStatusLine().getStatusCode() != 404) {
                    return null;
                }
                Object target = localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
                Object request = localContext.getAttribute(ExecutionContext.HTTP_REQUEST);
                if (!(target instanceof HttpHost) || !(request instanceof HttpUriRequest)) {
                    return null;
                }
                return target.toString() + ((HttpUriRequest) request).getURI();
            } finally {
                response.close();
            }
        } finally {
            // The client owns a connection pool; release it once the check is done.
            httpclient.close();
        }
    }
}