WebMagic 官方文档地址:http://webmagic.io/docs/zh/
下面直接给出代码。
单元测试中的爬虫启动方法:
/**
 * Builds an {@code OOSpider} for the {@code WebMagic} page model with
 * {@code pipeline} as its page-model pipeline, queues a fixed list of
 * www.119.cn article URLs, and runs the spider with 5 threads.
 */
@Test
public void webMagic() {
    // Seed pages, queued in exactly this order.
    String[] seedUrls = {
        "http://www.119.cn/content/201903/28/c102918.html",
        "http://www.119.cn/content/201902/21/c99922.html",
        "http://www.119.cn/content/201902/20/c99843.html",
        "http://www.119.cn/content/201901/30/c98820.html",
        "http://www.119.cn/content/201901/16/c97731.html",
        "http://www.119.cn/content/201901/15/c97590.html",
        "http://www.119.cn/content/201901/10/c97269.html",
        "http://www.119.cn/content/201901/03/c96718.html",
        "http://www.119.cn/content/201901/03/c96677.html",
        "http://www.119.cn/content/201901/03/c96663.html",
        "http://www.119.cn/content/201812/28/c96264.html",
        "http://www.119.cn/content/201812/27/c96167.html",
        "http://www.119.cn/content/201812/26/c96028.html",
        "http://www.119.cn/content/201812/26/c96027.html",
        "http://www.119.cn/content/201812/26/c96025.html"
    };

    // Site configuration: only the sleep time is customised.
    Site site = Site.me().setSleepTime(5000);

    OOSpider.create(site, pipeline, WebMagic.class)
            .addUrl(seedUrls)
            .thread(5)
            .run();
}
业务处理组件类
/**
 * Page-model pipeline that stores each crawled {@link WebMagic} page as a
 * {@code NoticeEntity}, unless a notice with the same title already exists.
 */
@Component
public class Pipeline implements PageModelPipeline<Object> {

    /** Classify id written to every notice created by this pipeline. */
    private static final String CLASSIFY_ID = "402882456ad2f2b9016ad3f03a6a000a";

    /** Visible-person id written to every notice created by this pipeline. */
    private static final String VISIBLE_PERSON_ID = "2";

    /** Mapper used to look up notices by title and to insert new ones. */
    @Resource
    private NoticeMapper noticeMapper;

    /**
     * Stores the extracted page as a new notice when no notice with the same
     * title is found.
     *
     * @param o    the extracted page model; must be a {@link WebMagic} instance
     * @param task the crawl task that produced the page (not used here)
     * @throws ClassCastException if {@code o} is not a {@link WebMagic}
     */
    @Override
    public void process(Object o, Task task) {
        WebMagic page = (WebMagic) o;
        String title = page.getTitle();

        // The title is the de-duplication key. Without one we can neither look
        // up an existing notice nor store a meaningful new one, so skip the page.
        if (title == null || title.trim().isEmpty()) {
            return;
        }

        // NOTE(review): the lookup and the insert are separate calls; if this
        // method is invoked concurrently, the same title could be inserted
        // twice. Consider a unique constraint on the title column - confirm.
        if (noticeMapper.selectTitle(title) != null) {
            return;
        }

        noticeMapper.insert(buildNotice(title, page.getContent()));
    }

    /** Creates a notice with a fresh dash-less UUID id and this pipeline's fixed field values. */
    private NoticeEntity buildNotice(String title, String content) {
        NoticeEntity notice = new NoticeEntity();
        // Literal replacement; no regex is needed to strip the dashes.
        notice.setNoticeId(UUID.randomUUID().toString().replace("-", ""));
        notice.setClassifyId(CLASSIFY_ID);
        notice.setContent(content);
        // Flag values are fixed to 1; their meaning is defined by NoticeEntity.
        notice.setIsHomeShow(1);
        notice.setIsTop(1);
        notice.setStatus(1);
        notice.setTitle(title);
        notice.setVisiblePersonId(VISIBLE_PERSON_ID);
        return notice;
    }
}
页面模型类(通过注解声明目标 URL 与抽取规则):
/**
 * Page model for article pages on www.119.cn. The class-level annotations
 * declare the target URL pattern and a help URL; the field-level annotations
 * declare the XPath rule used to fill each field.
 */
@TargetUrl("http://www.119.cn/content/*/*/*.html")
@HelpUrl("http://www.119.cn/picture.html")
public class WebMagic {

    /** Text of the {@code h1} element whose class is {@code title}. */
    @ExtractBy("//h1[@class='title']/text()")
    private String title;

    /** The {@code div} element whose class is {@code bodyBox}. */
    // Alternative selector kept for reference: //div[@class='contentBox']
    @ExtractBy("//div[@class='bodyBox']")
    private String content;

    /** @return the extracted title */
    public String getTitle() {
        return this.title;
    }

    /** @return the extracted content */
    public String getContent() {
        return this.content;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }
}
这样一个爬虫就成型了。