1 新建模块xsy-vi-server-webmagic
2 引入依赖
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
</dependency>
3 编写mapper、service
用于保存抓取到的新闻数据
4 编写解析器
package org.xsy.itgo.news;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.xsy.itgo.news.domain.CmsContent;
import org.xsy.itgo.news.service.CmsContentService;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
/**
 * WebMagic page processor for the news.cn politics list page.
 *
 * <p>For every list item whose link contains {@code "politics"} it extracts the publication
 * date (taken from the link), the title and the link, prints the result to the console and
 * stores each item through {@link CmsContentService#saveCmsContent}.
 */
@Component
public class NewsPageProcessor implements PageProcessor {

    /** Entry page handed to the spider by {@link #start()}. */
    private static final String START_URL = "http://www.news.cn/politics";

    /** Only links containing this marker are treated as politics articles. */
    private static final String LINK_MARKER = "politics";

    /** Length of the {@code yyyyMMdd} date segment embedded in article links. */
    private static final int DATE_SEGMENT_LENGTH = 8;

    private final CmsContentService contentService;

    /** Crawl configuration: retry count and delay between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Autowired
    public NewsPageProcessor(CmsContentService contentService) {
        this.contentService = contentService;
    }

    /**
     * Extraction callback invoked by WebMagic for each downloaded page.
     *
     * @param page the downloaded page; its HTML is the content of the URL added in {@link #start()}
     */
    @Override
    public void process(Page page) {
        List<NewsItem> items = extractNews(page);
        if (items.isEmpty()) {
            return;
        }
        printNews(items);
        saveNews(items);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Runs a crawl of {@link #START_URL} with two threads; blocks until the spider finishes. */
    public void start() {
        // Reuse this bean as the processor instead of constructing a second, identical instance.
        Spider.create(this).addUrl(START_URL).thread(2).run();
    }

    /** Collects every politics item found in the list container of the page. */
    private List<NewsItem> extractNews(Page page) {
        List<Selectable> nodes = page.getHtml()
                .$(".xpage-content-list")
                .select(new XpathSelector("div[@class='column-center-item']"))
                .nodes();

        List<NewsItem> items = new ArrayList<>();
        // Iterate over ALL nodes: the previous index loop started at 1 and dropped the first item.
        for (Selectable node : nodes) {
            String link = node.$(".tit a", "href").get();
            // Items without an anchor yield null; skip them instead of failing the whole page.
            if (link == null || !link.contains(LINK_MARKER)) {
                continue;
            }
            String title = node.$(".tit a", "text").get();
            items.add(new NewsItem(extractDate(link), title == null ? "" : title, link));
        }
        return items;
    }

    /**
     * Derives the publication date from the first path segment of the link that consists of
     * exactly eight ASCII digits ({@code yyyyMMdd}).
     *
     * @param link article link, never {@code null}
     * @return the date formatted as {@code yyyy-MM-dd}, or an empty string when the link holds
     *         no valid date segment
     */
    private static String extractDate(String link) {
        for (String segment : link.split("/")) {
            if (segment.length() != DATE_SEGMENT_LENGTH
                    || !segment.chars().allMatch(c -> c >= '0' && c <= '9')) {
                continue;
            }
            try {
                // LocalDate#toString renders the ISO form yyyy-MM-dd.
                return LocalDate.parse(segment, DateTimeFormatter.BASIC_ISO_DATE).toString();
            } catch (DateTimeParseException e) {
                // Eight digits that are not a calendar date: keep the item, leave the date empty.
                System.err.printf("Unparseable date '%s' in link %s: %s%n",
                        segment, link, e.getMessage());
                return "";
            }
        }
        return "";
    }

    /** Prints the collected items to standard output, numbered from 1. */
    private static void printNews(List<NewsItem> items) {
        System.out.println("市政关注:");
        for (int i = 0; i < items.size(); i++) {
            NewsItem item = items.get(i);
            System.out.printf("%d、%s->%s,访问地址:%s%n", i + 1, item.date, item.title, item.link);
        }
    }

    /** Persists every collected item through the content service. */
    private void saveNews(List<NewsItem> items) {
        for (NewsItem item : items) {
            CmsContent content = new CmsContent();
            content.setTimestr(item.date);
            content.setTitle(item.title);
            content.setUrl(item.link);
            contentService.saveCmsContent(content);
        }
    }

    /** Immutable holder for one extracted headline. */
    private static final class NewsItem {
        private final String date;
        private final String title;
        private final String link;

        private NewsItem(String date, String title, String link) {
            this.date = date;
            this.title = title;
            this.link = link;
        }
    }
}
5 集成定时任务
5.1 引入依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-quartz</artifactId>
</dependency>
5.2 编写定时任务类(注意:@Scheduled 需要在启动类或配置类上添加 @EnableScheduling 才会生效)
package org.xsy.itgo.task;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.xsy.itgo.news.NewsPageProcessor;
/**
 * Spring-managed scheduled job that periodically kicks off the news crawl.
 */
@Component
public class MyTak {

    /** Crawler bean injected by Spring. */
    @Autowired
    private NewsPageProcessor newsCrawler;

    /**
     * Triggers one crawl run by delegating to {@link NewsPageProcessor#start()}.
     * Scheduled with {@code fixedRate = 5000}.
     */
    // Alternative trigger kept for reference: @Scheduled(cron = "0 0/1 * * * ?")
    @Scheduled(fixedRate = 5000)
    public void execute() {
        newsCrawler.start();
    }
}