一.demo实例
package com.pz998.rpc.common.utils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class GithubRepoPageProcessor implements PageProcessor{
//设置爬取参数 如重试次数 间隔多长时间
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
@Override
public void process(Page page) {
//将page中包含的链接重新作为目标链接 进行数据爬取
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
}
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
// page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
System.out.println("网页内容");
System.out.println(page.toString());
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
}
}
二.主体源码研读
处理的主逻辑在Spider中的run方法中
@Override
public void run() {
checkRunningStat();
initComponent();
logger.info("Spider " + getUUID() + " started!");
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
//从队列中获取链接数据
Request request = scheduler.poll(this);
if (request == null) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break;
}
// wait until new url added
waitNewUrl();
} else {
final Request requestFinal = request;
//将获取到的链接扔到线程池中处理
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
//处理从队列中获取的链接
processRequest(requestFinal);
onSuccess(requestFinal);
} catch (Exception e) {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
} finally {
if (site.getHttpProxyPool()!=null && site.getHttpProxyPool().isEnable()) {
site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal
.getExtra(Request.STATUS_CODE));
}
pageCount.incrementAndGet();
signalNewUrl();
}
}
});
}
}
stat.set(STAT_STOPPED);
// release some resources
if (destroyWhenExit) {
close();
}
}
- .首先框架使用的模式是比较经典的recort模式,开启一个主线程,从任务集合中取任务或者需要处理的数据,而这里的是从队列中取得链接集合,然后将每个链接丢到线程池调用PageProcessor页面处理器处理。
- .创建一个Spider组件 (爬虫) 需要设置一个定制的页面处理器组件PageProcessor,这个有点类型callback模式,保持页面主流程不变,需要变化定制的部分以接口的形式开放给开发者。
processRequest 方法
protected void processRequest(Request request) {
Page page = downloader.download(request, this);
if (page == null) {
sleep(site.getRetrySleepTime());
onError(request);
return;
}
// for cycle retry
if (page.isNeedCycleRetry()) {
extractAndAddRequests(page, true);
sleep(site.getRetrySleepTime());
return;
}
//调用用户定制的页面处理器的处理方法
pageProcessor.process(page);
//将获取的页面中的符合表达是的目标url再次pull到队列中 这样就可以再次调用处理逻辑 进行爬取了
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime());
}
从extractAndAddRequests方法中可以看到将Page目标url放入队列
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
addRequest(request);
}
}
}
private void addRequest(Request request) {
if (site.getDomain() == null && request != null && request.getUrl() != null) {
site.setDomain(UrlUtils.getDomain(request.getUrl()));
}
scheduler.push(request, this);
}
----
spring mvc+tomcat源码分析视频(链接复制在浏览器打开)
https://study.163.com/course/courseMain.htm?share=2&shareId=480000001919582&courseId=1209399899&_trace_c_p_k2_=6d81bc445e9c462ab8d6345e40f6b0bf