WebMagic + Spring Boot: Multi-Site Vertical Data Collection
About WebMagic
WebMagic is a simple, flexible Java crawler framework. Building on WebMagic, you can quickly develop an efficient, easy-to-maintain crawler. (From the official introduction.)
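To ground the customizations below, here is a minimal, self-contained WebMagic sketch; the URL and XPath are placeholders, not part of this project:
/**
* Minimal WebMagic quick start; the url and xpath are illustrative only
*/
public class QuickStartProcessor implements PageProcessor {
private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
@Override
public void process(Page page) {
// extract one field and follow in-site links
page.putField("title", page.getHtml().xpath("//title/text()").toString());
page.addTargetRequests(page.getHtml().links().regex(".*example\\.com.*").all());
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new QuickStartProcessor()).addUrl("https://example.com").thread(2).run();
}
}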
WebMagic in Practice
Custom Downloader
/**
* Custom downloader that records failed URLs to Redis
*/
@Slf4j
@Component
public class SpiderDownloader extends HttpClientDownloader {
private static final String DOWNLOAD_START_MILLS = "download_start_mills";
private static final String DOWNLOAD_EXPAND_MILLS = "download_expand_mills";
@Override
public Page download(Request request, Task task) {
request.putExtra(DOWNLOAD_START_MILLS, System.currentTimeMillis());
return super.download(request, task);
}
@Override
protected void onSuccess(Request request) {
super.onSuccess(request);
calcExpandMills(request);
log.info("download expand: {} ms, url: {}", request.getExtra(DOWNLOAD_EXPAND_MILLS), request.getUrl());
}
@Override
protected void onError(Request request) {
super.onError(request);
calcExpandMills(request);
log.info("download error!!! expand: {} ms, url: {}", request.getExtra(DOWNLOAD_EXPAND_MILLS), request.getUrl());
// 将下载失败的url记录到redis
}
/**
* Calculate the download time in milliseconds
* @param request
*/
private void calcExpandMills(Request request) {
long downloadEndMills = System.currentTimeMillis();
Object downloadStartMills = request.getExtra(DOWNLOAD_START_MILLS);
if(downloadStartMills != null) {
long expandMills = downloadEndMills - Long.parseLong(downloadStartMills.toString());
request.putExtra(DOWNLOAD_EXPAND_MILLS, expandMills);
}
}
}
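The onError hook above leaves the Redis write as a comment. A minimal sketch of that step, assuming a RedissonClient can be injected into the downloader; the key name below is illustrative, while the project keeps its real keys in RedisKeyConst:
@Resource
private RedissonClient redissonClient;
// illustrative key, not from the project
private static final String FAIL_URL_SET = "spider_fail_urls";
private void recordFailUrl(Request request) {
// RSetCache entries expire, so failed urls do not pile up forever
RSetCache<String> failSet = redissonClient.getSetCache(FAIL_URL_SET);
failSet.add(request.getUrl(), 24, TimeUnit.HOURS);
}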
Custom Scheduler (based on Redis/Redisson)
/**
* Custom scheduler: stores requests handed over by the spider in Redis, de-duplicates URLs, and polls requests back out
*/
@Slf4j
@Component
@NoArgsConstructor
public class SpiderRedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
/**
* Queue that holds pending URLs
*/
private static final String QUEUE_PREFIX = "queue_";
/**
* Queue that holds jump-the-queue (priority) URLs
*/
private static final String QUEUE_JUMP = "jump_";
/**
* Set used for URL de-duplication
*/
private static final String SET_PREFIX = "set_";
@Resource
private CrawlerRuleService crawlerRuleService;
@Autowired
private RedissonClient redissonClient;
protected String getSetKey(Task task) {
return StringUtils.join(RedisKeyConst.spiderKeySpace, SET_PREFIX, task.getUUID());
}
protected String getQueueKey(Task task, boolean isJump) {
return StringUtils.join(RedisKeyConst.spiderKeySpace, (isJump ? QUEUE_JUMP : QUEUE_PREFIX), task.getUUID());
}
@Override
public void resetDuplicateCheck(Task task) {
RSetCache<Object> urlSet = redissonClient.getSetCache(getSetKey(task)); // same structure isDuplicate writes to
urlSet.delete();
}
@Override
public void push(Request request, Task task) {
if (this.shouldReserved(request) || this.noNeedToRemoveDuplicate(request) || !isDuplicate(request, task)) {
this.pushWhenNoDuplicate(request, task);
}
}
@Override
public boolean isDuplicate(Request request, Task task) {
RSetCache<Object> urlSet = redissonClient.getSetCache(getSetKey(task));
boolean has = urlSet.contains(request.getUrl());
if(!has) {
// add the url to the Redis set with a TTL, so the de-dup window eventually expires
urlSet.add(request.getUrl(), 60, TimeUnit.MINUTES);
return false;
} else {
return true;
}
}
@Override
protected void pushWhenNoDuplicate(Request request, Task task) {
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return;
}
boolean jump = requestInfo.isJump();
// push the request into the corresponding Redis deque
RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task, jump));
Integer type = requestInfo.getType();
if(CrawlerTypeEnum.CONTENT.getType().equals(type)){
// content pages go to the tail: list/detail pages are polled first
deque.addLast(request);
return;
}
deque.addFirst(request);
}
@Override
public Request poll(Task task) {
// poll the jump (priority) queue first, then the normal queue
RDeque<Object> jumpDeque = redissonClient.getDeque(getQueueKey(task, true));
Request request = pollWithStatus(jumpDeque);
if(null != request){
return request;
}
RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task,false));
return pollWithStatus(deque);
}
private Request pollWithStatus(RDeque<Object> deque){
while (true){
Request request = (Request)deque.pollFirst();
if(request == null) {
return null;
}
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
// malformed entry: drop it and try the next one
continue;
}
CrawlerRule crawlerInfo = crawlerRuleService.getByRuleId(requestInfo.getRuleId());
if(null == crawlerInfo){
// the rule no longer exists: drop the request
continue;
}
int openStatus = crawlerInfo.getOpenStatus();
if(openStatus == 0){
log.info("url: {}, rule: {} is disabled", request.getUrl(), crawlerInfo.getRuleName());
continue;
}
return request;
}
}
@Override
public int getLeftRequestsCount(Task task) {
RDeque<Object> jumpDeque = redissonClient.getDeque(getQueueKey(task,true));
RDeque<Object> deque = redissonClient.getDeque(getQueueKey(task,false));
return jumpDeque.size() + deque.size();
}
public int getLeftRequestsCount(String domain){
if(StrUtil.isBlank(domain)){
return 0;
}
RDeque<Object> jumpDeque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_JUMP + domain);
RDeque<Object> deque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_PREFIX + domain);
return jumpDeque.size() + deque.size();
}
@Override
public int getTotalRequestsCount(Task task) {
RSetCache<Object> urlSet = redissonClient.getSetCache(getSetKey(task));
return urlSet.size();
}
public int getTotalRequestsCount(String domain) {
if(StrUtil.isBlank(domain)){
return 0;
}
RSetCache<Object> urlSet = redissonClient.getSetCache(RedisKeyConst.spiderKeySpace + SET_PREFIX + domain);
return urlSet.size();
}
public void delAllRequest(String domain){
RDeque<Object> jumpDeque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_JUMP + domain);
jumpDeque.delete();
RDeque<Object> deque = redissonClient.getDeque(RedisKeyConst.spiderKeySpace + QUEUE_PREFIX + domain);
deque.delete();
RSetCache<Object> urlSet = redissonClient.getSetCache(RedisKeyConst.spiderKeySpace + SET_PREFIX + domain);
urlSet.delete();
}
}
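Note that the scheduler stores whole Request objects (including their extras) in Redisson structures, so everything in a request must be serializable by the configured codec. A minimal Redisson setup sketch, assuming a single-node Redis; the address is a placeholder:
@Configuration
public class RedissonConfig {
@Bean
public RedissonClient redissonClient() {
Config config = new Config();
// single-node Redis; adjust for cluster/sentinel setups as needed
config.useSingleServer().setAddress("redis://127.0.0.1:6379");
return Redisson.create(config);
}
}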
Custom Pipeline
/**
* @Description: implements Pipeline for data persistence
*/
@Slf4j
@Component
public class SpiderSavePipLine implements Pipeline {
@SneakyThrows
@Override
public void process(ResultItems resultItems, Task task) {
Request request = resultItems.getRequest();
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return;
}
Integer type = requestInfo.getType();
if(CrawlerTypeEnum.DETAIL.getType().equals(type)){
// persist detail-page data (a sketch follows this class)
}
if(CrawlerTypeEnum.CONTENT.getType().equals(type)){
// persist content-page data
}
}
}
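Both branches are left empty in the post. A sketch of what the detail branch might do; NovelInfo, novelService, and the "novel" result key are all assumptions for illustration, not names confirmed by the repository:
// hypothetical body for the DETAIL branch above
private void saveDetail(ResultItems resultItems) {
// "novel" is an assumed key put into ResultItems by the detail processor
NovelInfo novel = resultItems.get("novel");
if (novel != null) {
novelService.saveOrUpdate(novel); // hypothetical persistence service
}
}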
Custom SpiderListener
/**
* Custom listener: counts successful and failed requests, and collects the failed ones
*/
@Component
public class SpiderEventListener implements SpiderListener {
private final AtomicInteger successCount = new AtomicInteger(0);
private final AtomicInteger failCount = new AtomicInteger(0);
private final List<Request> failRequests = new CopyOnWriteArrayList<>();
@Resource
private RedissonClient redissonClient;
@Resource
private CrawlerRuleService crawlerRuleService;
@Override
public void onSuccess(Request request) {
successCount.incrementAndGet();
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return;
}
String countDownSpace = requestInfo.getCountDownSpace();
if(StrUtil.isNotBlank(countDownSpace)){
if(null == redissonClient){
return;
}
RCountDownLatch latch = redissonClient.getCountDownLatch(countDownSpace);
latch.countDown();
}
}
@Override
public void onError(Request request) {
failRequests.add(request);
failCount.incrementAndGet();
}
public AtomicInteger getSuccessCount() {
return successCount;
}
public AtomicInteger getFailCount() {
return failCount;
}
public List<Request> getFailRequests() {
return failRequests;
}
/**
* Look up the rule info by rule id
* @param ruleId
* @return
*/
private CrawlerRule getCrawlerInfo(String ruleId){
if(null == ruleId){
return null;
}
CrawlerRule crawlerRule = crawlerRuleService.getByRuleId(ruleId);
// guard against a missing rule before touching its fields
if(null == crawlerRule || StrUtil.isBlank(crawlerRule.getId())){
return null;
}
return crawlerRule;
}
}
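onSuccess counts down a distributed latch whenever a request carries a countDownSpace. The caller side would look roughly like this sketch; the method and its parameters are illustrative:
public class LatchWaiterSketch {
/**
* Block until `expected` requests tagged with countDownSpace have succeeded
*/
public static void awaitBatch(RedissonClient redissonClient, String countDownSpace, long expected)
throws InterruptedException {
RCountDownLatch latch = redissonClient.getCountDownLatch(countDownSpace);
latch.trySetCount(expected); // no-op if the latch already holds a count
// ... submit `expected` requests whose CrawlerRequestDto.countDownSpace is set ...
latch.await(); // each successful request counts down once (see onSuccess above)
}
}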
Custom PageProcessor
@Slf4j
@Component
public class SpiderProcessor implements PageProcessor {
@Resource
private NovelProcessorFactory novelProcessorFactory;
@Resource
private CrawlerRuleService crawlerRuleService;
@Value("${spider.retryTimes:3000}")
private int retryTimes = 3000;
@Value("${spider.sleepTime:3000}")
private int sleepTime = 3000;
@Value("${spider.timeOut:60000}")
private int timeOut = 60000;
@Override
public void process(Page page) {
CrawlerRequestDto requestInfo = page.getRequest().getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return;
}
int type = requestInfo.getType();
CrawlerRule crawlerInfo = getCrawlerInfo(requestInfo.getRuleId());
if(null == crawlerInfo){
return;
}
int openStatus = crawlerInfo.getOpenStatus();
if(openStatus == 0){
// the rule has been disabled
log.info("rule: {} is disabled", crawlerInfo.getRuleName());
return;
}
NovelProcessor novelProcessor = novelProcessorFactory.getProcessor(type);
if(null == novelProcessor){
return;
}
novelProcessor.process(page,requestInfo,crawlerInfo);
}
/**
* Look up the rule info by rule id
* @param ruleId
* @return
*/
private CrawlerRule getCrawlerInfo(String ruleId){
if(null == ruleId){
return null;
}
CrawlerRule crawlerRule = crawlerRuleService.getByRuleId(ruleId);
if(null == crawlerRule || StrUtil.isBlank(crawlerRule.getId())){
return null;
}
return crawlerRule;
}
@Override
public Site getSite() {
Site site = Site.me().setRetryTimes(retryTimes).setSleepTime(sleepTime).setTimeOut(timeOut);
site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
site.addHeader("Accept-Encoding", "gzip, deflate");
site.addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
site.addHeader("Cache-Control", "max-age=0");
site.addHeader("Connection", "keep-alive");
// site.addHeader("Cookie", "Hm_lvt_42e120beff2c918501a12c0d39a4e067=1566530194,1566819135,1566819342,1566963215; Hm_lpvt_42e120beff2c918501a12c0d39a4e067=1566963215");
// site.addHeader("Host", "www.yousuu.com");
site.addHeader("Upgrade-Insecure-Requests", "1");
site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");
return site;
}
}
Picking a different processor per crawl-rule type
@Component
public class NovelProcessorFactory {
private static final Map<Integer, NovelProcessor> NOVEL_PROCESSOR_MAP = new HashMap<>();
@Autowired
private void setProcessorStrategy(
NovelListProcessor novelListProcessor,
NovelInfoProcessor novelInfoProcessor,
NovelContentProcessor novelContentProcessor
){
NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
CrawlerTypeEnum.LIST.getType(),novelListProcessor);
NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
CrawlerTypeEnum.DETAIL.getType(),novelInfoProcessor);
NovelProcessorFactory.NOVEL_PROCESSOR_MAP.put(
CrawlerTypeEnum.CONTENT.getType(),novelContentProcessor);
}
public NovelProcessor getProcessor(Integer crawlerType){
return NOVEL_PROCESSOR_MAP.get(crawlerType);
}
}
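The NovelProcessor strategy interface itself is not shown in this post; inferred from the call novelProcessor.process(page, requestInfo, crawlerInfo) in SpiderProcessor, it would look roughly like:
// inferred sketch; the real interface in the repository may differ
public interface NovelProcessor {
void process(Page page, CrawlerRequestDto requestInfo, CrawlerRule crawlerInfo);
}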
Custom Spider
public class SpiderStartContainer extends Spider {
private SpiderEventListener listener;
private RedissonClient redissonClient;
private int status = 0; // 0 = init, 1 = running, 2 = stopped (WebMagic stat values); 3 = closed (custom)
public SpiderStartContainer(PageProcessor pageProcessor, SpiderEventListener listener, RedissonClient redissonClient) {
super(pageProcessor);
this.redissonClient = redissonClient;
List<SpiderListener> spiderListeners = this.getSpiderListeners();
if(CollectionUtils.isEmpty(spiderListeners)) {
spiderListeners = new ArrayList<>();
}
this.listener = listener;
spiderListeners.add(listener);
this.setSpiderListeners(spiderListeners);
}
public static SpiderStartContainer create(PageProcessor pageProcessor,SpiderEventListener listener,RedissonClient redissonClient) {
return new SpiderStartContainer(pageProcessor,listener, redissonClient);
}
@Override
public void run() {
super.run();
if(scheduler instanceof DuplicateRemover) {
((DuplicateRemover) scheduler).resetDuplicateCheck(this);
}
// when the crawl finishes, handle the requests that failed
List<Request> failRequests = listener.getFailRequests();
if(CollectionUtils.isNotEmpty(failRequests)) {
// retry or persist the failed requests here
}
}
@Override
public void close() {
super.close();
this.status = 3;
}
public void spiderClose(String countDownSpace) {
this.close();
spiderCountDown(countDownSpace);
}
public void spiderStop(String countDownSpace){
this.stop();
this.status = this.stat.intValue();
spiderCountDown(countDownSpace);
}
public void spiderStart(){
spiderStart(null);
}
public void spiderStart(String countDownSpace){
logger.info("===================== check spider:"+this.getUUID()+" status =====================");
if(this.getSpiderStatus() == 3){
logger.info("===================== spider:"+this.getUUID()+" is close! spider will be restart! =====================");
this.spiderStop(null);
}
while (this.getSpiderStatus() == 3){
logger.info("===================== spider:"+this.getUUID()+" is to stop =====================");
}
if(this.stat.intValue() == 2 || this.stat.intValue() == 0){
this.start();
this.status = this.stat.intValue();
}
logger.info("===================== spider:"+this.getUUID()+" is start =====================");
spiderCountDown(countDownSpace);
}
public void spiderJumpQueue(Request request){
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return;
}
requestInfo.setJump(true);
this.addRequest(request);
spiderStart();
}
public String getSpiderUUID() {
return this.getUUID();
}
public Integer getSpiderStatus(){
if(status == 3){
return status;
}
return stat.get();
}
public int getLeftRequestsCount(String domain){
SpiderRedisScheduler scheduler = (SpiderRedisScheduler)this.getScheduler();
if(null != scheduler){
return scheduler.getLeftRequestsCount(domain);
}
return 0;
}
public int getTotalRequestsCount(String domain){
SpiderRedisScheduler scheduler = (SpiderRedisScheduler)this.getScheduler();
if(null != scheduler){
return scheduler.getTotalRequestsCount(domain);
}
return 0;
}
private void spiderCountDown(String countDownSpace){
if(null == redissonClient){
return;
}
if(StrUtil.isNotBlank(countDownSpace)){
RCountDownLatch latch = redissonClient.getCountDownLatch(countDownSpace);
latch.countDown();
}
}
}
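A quick usage sketch of the jump queue: marking a request as jump routes it into the priority deque, which SpiderRedisScheduler.poll drains first. The helper below is illustrative; the builder mirrors the one used in SpiderStart further down:
public void crawlChapterFirst(SpiderStartContainer container, String url, String ruleId) {
Request request = new Request(url);
CrawlerRequestDto info = CrawlerRequestDto.builder()
.url(url)
.ruleId(ruleId)
.type(CrawlerTypeEnum.CONTENT.getType())
.build();
request.putExtra(RequestConst.REQUEST_INFO, info);
container.spiderJumpQueue(request); // marks jump=true, pushes, and (re)starts the spider
}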
Spider producer:
@Slf4j
@Configuration
public class SpiderStartContainerFactory {
@Value("${spider.threadNum:0}")
private int threadNum;
/**
* Thread pool shared by all spiders
*/
private static ExecutorService executorService;
@Resource
private SpiderProcessor spiderProcessor;
@Resource
private SpiderEventListener spiderEventListener;
@Resource
private SpiderDownloader spiderDownloader;
@Resource
private SpiderRedisScheduler spiderRedisScheduler;
@Resource
private SpiderSavePipLine spiderSavePipLine;
@Resource
private RedissonClient redissonClient;
private static final Map<String, SpiderStartContainer> startContainerMap = new ConcurrentHashMap<>();
public SpiderStartContainer getStartContainer(String domain) {
// computeIfAbsent closes the check-then-act race of a separate get/put pair
return startContainerMap.computeIfAbsent(domain, key -> {
SpiderStartContainer spiderStartContainer =
SpiderStartContainer.create(spiderProcessor, spiderEventListener, redissonClient);
spiderStartContainer.setDownloader(spiderDownloader);
spiderStartContainer.setScheduler(spiderRedisScheduler);
spiderStartContainer.setPipelines(Collections.singletonList(spiderSavePipLine));
spiderStartContainer.setExecutorService(executorService);
return spiderStartContainer;
});
}
public SpiderStartContainer getStartContainer(Request request) {
CrawlerRequestDto requestInfo = request.getExtra(RequestConst.REQUEST_INFO);
if(null == requestInfo){
return null;
}
UrlBuilder urlBuilder = UrlBuilder.ofHttp(requestInfo.getUrl(), CharsetUtil.CHARSET_UTF_8);
String domain = urlBuilder.getHost();
return getStartContainer(domain);
}
@PostConstruct
public void initExecutorService(){
// default thread count: cpu cores * 2
if (threadNum == 0) {
threadNum = Runtime.getRuntime().availableProcessors() * 2;
}
log.info("================ spider thread num :{} ================", threadNum);
SpiderStartContainerFactory.executorService = Executors.newFixedThreadPool(threadNum);
}
}
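The shared pool is never shut down in the post; a @PreDestroy hook (an addition for illustration, not in the repository) would release it cleanly on application exit:
@PreDestroy
public void shutdownExecutorService() {
if (executorService != null) {
executorService.shutdown(); // let in-flight downloads finish; use shutdownNow() to force
}
}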
Starting the Crawl
/**
* @Description: crawl bootstrap, the main entry point; implements the remote interfaces so it can be driven remotely
*/
@Slf4j
@Component
public class SpiderStart implements ApplicationRunner, SpiderContainerRemote, SpiderJobStartRemote {
@Autowired
private SpiderStartContainerFactory spiderStartContainerFactory;
@Resource
private CrawlerRuleService crawlerRuleService;
private int status;
@Override
public void run(ApplicationArguments args){
this.start();
}
@Override
public void start() {
List<CrawlerRule> allRules = crawlerRuleService.getAll();
List<CrawlerRule> collect = allRules.stream()
.filter(rule -> rule.getOpenStatus() == 1)
.collect(Collectors.toList());
for (CrawlerRule crawlerRule : collect) {
CrawlerListRule listRule = crawlerRule.getListRule();
String sourceUrl = listRule.getSourceUrl();
int pageStartRule = listRule.getPageStartRule();
String url = sourceUrl.replace(RequestConst.PAGE_REPLACE,pageStartRule+"");
Request request = new Request(url);
CrawlerRequestDto requestInfo = CrawlerRequestDto.builder()
.url(url)
.ruleId(crawlerRule.getRuleId())
.type(CrawlerTypeEnum.LIST.getType())
.build();
request.putExtra(RequestConst.REQUEST_INFO,requestInfo);
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(getDomain(url));
spiderStartContainer.addRequest(request);
spiderStartContainer.spiderStart();
status = 1;
}
}
@Override
public int getStarStatus() {
return this.status;
}
public int getStatus() {
return status;
}
public void setStatus(int status) {
this.status = status;
}
private String getDomain(String url){
UrlBuilder urlBuilder = UrlBuilder.ofHttp(url, CharsetUtil.CHARSET_UTF_8);
return urlBuilder.getHost();
}
@Override
public Integer getSpiderStatus(String domain) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
if(spiderStartContainer == null){
return null;
}
return spiderStartContainer.getSpiderStatus();
}
@Override
public void spiderClose(String domain, String countDownSpace) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
spiderStartContainer.spiderClose(countDownSpace);
}
@Override
public void spiderStop(String domain, String countDownSpace) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
spiderStartContainer.spiderStop(countDownSpace);
}
@Override
public void spiderStart(String domain) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
spiderStartContainer.spiderStart();
}
@Override
public void spiderStart(String domain, String countDownSpace) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
spiderStartContainer.spiderStart(countDownSpace);
}
@Override
public void spiderJumpQueue(Request request) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(getDomain(request.getUrl()));
spiderStartContainer.spiderJumpQueue(request);
}
@Override
public void spiderJumpQueue(Request request, String domain) {
SpiderStartContainer spiderStartContainer = spiderStartContainerFactory.getStartContainer(domain);
spiderStartContainer.spiderJumpQueue(request);
}
}
Open-source repositories
GitHub (飞鸟小说, birds-novel): https://github.com/caobinrg/birds-novel
Gitee (飞鸟小说, birds-novel): https://gitee.com/caobinrg/birds-novel