webmagic 文档地址: http://webmagic.io/docs/zh/ webmagic github 地址: https://github.com/code4craft/webmagic
注意:目前使用过 0.6.1 版本 和 0.7.x 版本
pom依赖如下:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
- 模拟登录 其中 0.6.1 版本对于 post 方法的支持不太好,如果需要模拟登录,可以先采用 httpclient 登录,登录后使用 cookieStore 获取 cookie,再通过 webmagic 的 site 设置。 0.7.x 版本好像对这种设置 cookie 的方式不太兼容,不确定是使用方式不对,还是该版本的 bug。 示例代码:
public class SpiderLogin{
public void login(url) {
HttpGet httpGet = new HttpGet(url);
CookieStore cookieStore = new BasicCookieStore();
CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(httpGet);
String result = EntityUtils.toString(response.getEntity(), "utf-8");
if(...){
// 登录成功的逻辑
List<Cookie> cookies = cookiestore.getCookies(); // 从cookiesstore 中获取cookie
this.saveCookie(cookies); // 遍历cookie保存到cookieMap中
}
}
public void saveCookie(List<Cookie> cookies) {
for (int i = 0; i < cookies.size(); i++) {
if (cookies.get(i).getName().equals("PHPSESSID")) {
cookieMap.put("PHPSESSID", cookies.get(i).getValue());
}
}
}
}
Spider 示例代码:
/**
 * PageProcessor that reuses the session cookies captured by SpiderLogin,
 * so authenticated pages can be crawled.
 */
public class JDParser implements PageProcessor {

    // Crawl configuration; declared here (the original referenced an undeclared 'site').
    private Site site = Site.me();

    /**
     * Builds the Site for this crawl, copying every cookie captured at login.
     *
     * @return the configured Site (cookies + User-Agent header)
     */
    public Site getSite() {
        // Typed enhanced-for instead of a raw Iterator with casts and toString().
        for (Map.Entry<String, String> entry : SpiderLogin.cookieMap.entrySet()) {
            site.addCookie(entry.getKey(), entry.getValue());
        }
        return site.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
    }

    public void process(Page page) {
        // Page-handling logic goes here.
    }
}
- 设置代理 0.6.x 版本对代理设置的兼容性不太好,没有用成功。github 上有人提到(https://github.com/code4craft/webmagic/pull/290)0.7.0 是稳定版本,但更改版本并按照文档编写后,依然无法使用代理,所以只能重写 Downloader。
代码如下:
/**
 * Custom Downloader that routes every request through an authenticated HTTP proxy.
 * Written because webmagic 0.6.x/0.7.x built-in proxy support did not work
 * reliably (see https://github.com/code4craft/webmagic/pull/290).
 */
public class ProxyDownloader implements Downloader, Closeable {

    private static Logger logger = Logger.getLogger(ProxyDownloader.class);

    // TODO: replace these placeholders with your real proxy endpoint and credentials
    // (the original referenced host/port/username/password without declaring them).
    private static String host = "127.0.0.1";
    private static int port = 8080;
    private static String username = "";
    private static String password = "";

    // Kept as a field so close() can release the last client built by download().
    CloseableHttpClient httpClient;

    public ProxyDownloader() {
    }

    @Override
    public void close() throws IOException {
        // Guard: download() may never have been called.
        if (httpClient != null) {
            httpClient.close();
        }
    }

    /**
     * Downloads one page through the proxy (core entry point).
     *
     * @param request the request to fetch
     * @param task    owning task; supplies the Site configuration (may be null)
     * @return the Page on success, a cycle-retry Page on recoverable failure, else null
     */
    @Override
    public Page download(Request request, Task task) {
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = WMCollections.newHashSet(200);
        }
        int statusCode = 0;
        logger.info("downloading page " + request.getUrl());
        httpClient = getHttpClientWithProxy(SpiderLogin.cookieStore);
        HttpGet httpGet = new HttpGet(request.getUrl());
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
                + " (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
        // Fixed: the Site headers were read but never applied to the request.
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                httpGet.setHeader(header.getKey(), header.getValue());
            }
        }
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            statusCode = response.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                // handleResponse fully reads the entity, so closing the
                // response afterwards (in finally) is safe.
                Page page = handleResponse(request, charset, response, task);
                onSuccess(request);
                return page;
            } else {
                logger.warn("get page " + request.getUrl() + " error, status code " + statusCode);
                return null;
            }
        } catch (IOException e1) {
            logger.warn("download page " + request.getUrl() + " error " + e1.getMessage());
            // Fixed: NPE when task/site is null (the null-check above made site nullable).
            if (site != null && site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            onError(request);
            logger.info(e1.getMessage());
        } finally {
            // Fixed: the response was never closed, leaking the underlying connection.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful to do here
                }
            }
        }
        return null;
    }

    /**
     * Success callback, invoked after a page is downloaded and accepted.
     *
     * @param request the request that succeeded
     */
    protected void onSuccess(Request request) {
    }

    /**
     * Failure callback, invoked when the download fails and is not retried.
     *
     * @param request the request that failed
     */
    protected void onError(Request request) {
    }

    /**
     * Wraps an HTTP response into a webmagic Page.
     *
     * @param request      originating request
     * @param charset      charset from the Site config, or null to auto-detect
     * @param httpResponse raw HTTP response
     * @param task         owning task (unused here, kept for interface parity)
     * @return the populated Page
     * @throws IOException if the entity cannot be read
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task)
            throws IOException {
        String content = getContent(charset, httpResponse);
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        return page;
    }

    /**
     * Decodes the response body, auto-detecting the charset when none is configured.
     *
     * @param charset      configured charset, or null
     * @param httpResponse raw HTTP response
     * @return the decoded body
     * @throws IOException if the entity cannot be read
     */
    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
        if (charset == null) {
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
            if (htmlCharset != null) {
                return new String(contentBytes, htmlCharset);
            } else {
                logger.warn("Charset autodetect failed, use " + Charset.defaultCharset()
                        + " as charset. Please specify charset in Site.setCharset()");
                return new String(contentBytes);
            }
        } else {
            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
        }
    }

    /**
     * Detects the page charset: first from the Content-Type header, then from
     * the HTML meta tags.
     *
     * @param httpResponse raw HTTP response
     * @param contentBytes raw response body
     * @return the detected charset name, or null/blank when detection fails
     * @throws IOException if the body cannot be parsed
     */
    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
        String charset = null;
        // 1. Content-Type header — fixed: getContentType() may be null (NPE before).
        if (httpResponse.getEntity().getContentType() != null) {
            String value = httpResponse.getEntity().getContentType().getValue();
            charset = UrlUtils.getCharset(value);
            if (StringUtils.isNotBlank(charset)) {
                logger.debug("Auto get charset:" + charset);
                return charset;
            }
        }
        // 2. Meta tags, decoded with the platform default charset as a best effort.
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset.name());
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements links = document.select("meta");
            for (Element link : links) {
                // 2.1. HTML 4: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                String metaContent = link.attr("content");
                String metaCharset = link.attr("charset");
                if (metaContent.indexOf("charset") != -1) {
                    metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
                    charset = metaContent.split("=")[1];
                    break;
                }
                // 2.2. HTML 5: <meta charset="UTF-8" />
                else if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: " + charset);
        return charset;
    }

    /**
     * Cycle-retry support: re-enqueues the request until the Site's retry
     * limit is reached.
     *
     * @param request the failed request
     * @param site    site configuration supplying the retry limit
     * @return a Page carrying the retry request, or null once retries are exhausted
     */
    protected Page addToCycleRetry(Request request, Site site) {
        Page page = new Page();
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            if (cycleTriedTimes >= site.getCycleRetryTimes()) {
                return null; // retry budget exhausted
            }
            page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
        }
        page.setNeedCycleRetry(true);
        return page;
    }

    /**
     * Whether the status code is acceptable for this crawl.
     *
     * @param acceptStatCode accepted status codes
     * @param statusCode     actual status code
     * @return true if the code is accepted
     */
    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
        return acceptStatCode.contains(statusCode);
    }

    @Override
    public void setThread(int threadNum) {
        // Single shared client; thread count is not used by this downloader.
    }

    /**
     * Builds an HttpClient that talks through the configured authenticated proxy.
     *
     * @param cookieStore session cookies to attach, or null
     * @return a configured client (caller is responsible for closing it)
     */
    public static CloseableHttpClient getHttpClientWithProxy(CookieStore cookieStore) {
        HttpClientBuilder builder = HttpClientBuilder.create();
        if (cookieStore != null) {
            builder.setDefaultCookieStore(cookieStore);
        }
        builder.setProxy(new HttpHost(host, port, "http"));
        BasicCredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));
        builder.setDefaultCredentialsProvider(credsProvider);
        return builder.build();
    }
}
使用 Spider 时,通过 setDownloader 指定自定义下载器即可:
Spider.create(new JDParser()).addUrl(url).setDownloader(new ProxyDownloader()).run();
还有一点,0.6.x 版本在处理 相对路径的url时,会帮你把域名加上去,但是 0.7.x 没有加上去,不知道是没有提供这个方法还是我没有找到。