导入依赖
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
爬取信息
实现 PageProcessor 接口，并重写 process 和 getSite 两个方法。
/**
 * Job service that also acts as the WebMagic {@link PageProcessor} for the
 * job-search result pages: it extracts one {@code Job} per list item on each
 * downloaded page, persists the extracted batch, and, when it is handling the
 * seed page, enqueues the paginated result URLs.
 *
 * <p>The spider started by {@link #add()} runs with several worker threads, so
 * {@link #process(Page)} may be invoked concurrently; it therefore keeps all
 * working state in local variables.
 */
@Service
public class JobServiceImpl extends ServiceImpl<JobMapper, Job> implements JobService, PageProcessor {

    /** Seed URL of the crawl (search keyword "Java"; {@code cityKw} is the URL-encoded form of "北京"). */
    private static final String SEED_URL = "https://www.jobui.com/jobs?jobKw=Java&cityKw=%E5%8C%97%E4%BA%AC";

    /** Number of paginated URLs enqueued from the seed page. */
    private static final int PAGE_COUNT = 50;

    /**
     * CSS selector for one job entry in the result list.
     * Syntax reminder: {@code tag.class} matches by class, a space descends to a
     * nested element, {@code .a.b} requires several classes, {@code tag#id} matches by id.
     */
    private static final String JOB_ITEM_SELECTOR = "ul.j-jobList li.jobui-container.job-search-list";

    /** Selector, relative to a job entry, of the element holding the job title. */
    private static final String JOB_NAME_SELECTOR =
            "a div.m-job-right-wrap div.mb15 div.job-name-wrap.segmentation h3.job-name";

    /** Selector, relative to a job entry, of the element holding the salary text. */
    private static final String JOB_SALARY_SELECTOR =
            "div.m-job-right-wrap div.mb15 div.segmentation.flex-box div.job-list-condition-wrap span.f60.fs16.fwb";

    /**
     * Starts the crawl from {@link #SEED_URL} using five worker threads.
     * {@code run()} executes the spider on the calling thread, so this method
     * returns only after the spider has stopped.
     */
    public void add() {
        Spider.create(this)
                .addUrl(SEED_URL)
                .thread(5)
                .run();
    }

    /**
     * Extracts the jobs listed on the downloaded page and stores them; for the
     * seed page it additionally schedules the paginated result pages.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().get();

        List<Job> jobs = new ArrayList<>();
        for (Selectable node : page.getHtml().css(JOB_ITEM_SELECTOR).nodes()) {
            // The second css(...) argument picks what to read from the matched
            // element: "text" for its text, or an attribute name such as "href".
            jobs.add(new Job()
                    .setDurl(pageUrl)
                    .setName(node.css(JOB_NAME_SELECTOR, "text").get())
                    .setSalary(node.css(JOB_SALARY_SELECTOR, "text").get()));
        }

        // Previously the extracted list was built and then silently dropped.
        // saveBatch is expected to be inherited from ServiceImpl (MyBatis-Plus
        // IService) -- NOTE(review): confirm against the project's ServiceImpl.
        if (!jobs.isEmpty()) {
            saveBatch(jobs);
        }

        // Only the seed page fans out to the other result pages; constant-first
        // equals keeps the comparison safe if the page URL is ever null.
        if (SEED_URL.equals(pageUrl)) {
            // The page count is fixed here; it could instead be read from the
            // pager element of the seed page.
            // NOTE(review): n=0 / n=1 may return the same listing as the seed
            // page -- confirm the site's paging scheme before narrowing the range.
            List<String> pageUrls = new ArrayList<>(PAGE_COUNT);
            for (int i = 0; i < PAGE_COUNT; i++) {
                pageUrls.add(SEED_URL + "&n=" + i);
            }
            page.addTargetRequests(pageUrls);
        }
    }

    /**
     * Supplies the crawl settings for the target site.
     *
     * @return a site configuration with 3 retry times and a sleep time of 100
     */
    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(3).setSleepTime(100);
    }
}