点击资料或前往github查看源码WebMagic
如果使用 @Autowired 注入成员变量,那么当前类的实例本身也必须由 Spring 容器管理才能注入成功:获取该类的实例时同样应该使用 @Autowired 注入,而不要用 new 手动创建这个需要注入其他对象的类。
使用 Spring Boot 开启定时任务,并通过自定义 Pipeline 将数据存储到数据库。根据传入的 url 获取页面后,用与 jQuery 相似的选择器方法解析页面,存入自己想得到的信息。有些数据包含几个标签,获取文本内容时不要用 css() 方法,而要加上 Jsoup 解析:text() 方法可以获取到所有文本内容,而 css() 需要选中具体标签,当数据中用了两种标签时就会出现问题。
ps:爬取前查看得到的html,会与网页的不一样
package com.example.demo.task;
import com.example.demo.pojo.JobInfo;
import com.example.demo.utils.MathSalary;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.time.LocalDate;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * WebMagic {@link PageProcessor} that crawls 51job search results for "java" positions.
 *
 * <p>Two kinds of pages reach {@link #process(Page)}:
 * <ul>
 *   <li>search-list pages, whose results are embedded as JSON inside a {@code <script>}
 *       element containing {@code SEARCH_RESULT}; every {@code job_href} found there is
 *       queued, followed by the next list page;</li>
 *   <li>every other page, which is treated as a job-detail page and converted into a
 *       {@link JobInfo} stored in the page's result items under the key {@code "jobInfo"}.</li>
 * </ul>
 *
 * <p>The processor keeps no mutable crawl state: the next list page is derived from the URL of
 * the page currently being processed, so it is unaffected by the number of worker threads and
 * by repeated scheduled runs.
 */
@Component
public class JobProcessor implements PageProcessor {

    private static final Logger LOGGER = Logger.getLogger(JobProcessor.class.getName());

    /** Part of a search-list URL that precedes the page number. */
    private static final String LIST_URL_PREFIX =
            "https://search.51job.com/list/030200,000000,0000,00,9,99,java,2,";

    /** Part of a search-list URL that follows the page number. */
    private static final String LIST_URL_SUFFIX =
            ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99"
                    + "&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /** Seed URL: first page of the search list. */
    private final String url = LIST_URL_PREFIX + 1 + LIST_URL_SUFFIX;

    @Autowired
    private ObjectMapper objectMapper;

    @Autowired
    private JobInfoPipeline jobInfoPipeline;

    private final Site site = Site.me()
            // Page encoding used when decoding responses.
            .setCharset("gbk")
            // Download timeout: 10 s.
            .setTimeOut(10000)
            // Retry a failed download up to 3 times.
            .setRetryTimes(3)
            // Pause between page fetches: 3 s.
            .setSleepTime(3000);

    /**
     * Dispatches a downloaded page to list-page or detail-page handling.
     *
     * @param page the page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // The downloaded HTML differs from what a browser renders: on list pages the results
        // live as JSON inside a <script> element.
        String resultScript = page.getHtml().css("script").regex(".*SEARCH_RESULT.*").toString();
        if (StringUtils.hasLength(resultScript)) {
            processListPage(page, resultScript);
        } else {
            // Detail page: build the entity and hand it to the pipeline via the result items.
            saveJobInfo(page);
        }
    }

    /**
     * Queues every job-detail link of a search-list page and, if the page had results,
     * the following list page.
     *
     * @param page         the list page being processed
     * @param resultScript text of the script element that embeds the result JSON
     */
    private void processListPage(Page page, String resultScript) {
        int jsonStart = resultScript.indexOf('{');
        int jsonEnd = resultScript.lastIndexOf('}');
        if (jsonStart < 0 || jsonEnd < jsonStart) {
            LOGGER.warning("No JSON object found in search result script of " + page.getUrl());
            return;
        }

        JsonNode results;
        try {
            // path() yields a "missing" node instead of null when the field is absent.
            results = objectMapper.readTree(resultScript.substring(jsonStart, jsonEnd + 1))
                    .path("engine_search_result");
        } catch (JsonProcessingException e) {
            LOGGER.log(Level.WARNING, "Cannot parse search results of " + page.getUrl(), e);
            return;
        }

        // No result array, or an empty one: nothing to queue, and paging stops here.
        if (!results.isArray() || results.size() == 0) {
            return;
        }

        for (JsonNode job : results) {
            String detailUrl = job.path("job_href").asText();
            if (StringUtils.hasLength(detailUrl)) {
                page.addTargetRequest(detailUrl);
            }
        }

        String nextPageUrl = nextListUrl(page.getUrl().toString());
        if (nextPageUrl != null) {
            page.addTargetRequest(nextPageUrl);
        }
    }

    /**
     * Builds the URL of the list page that follows {@code currentUrl}.
     *
     * @param currentUrl URL of the list page being processed
     * @return the next list-page URL, or {@code null} when {@code currentUrl} does not have the
     *         expected {@code <prefix><page number>.html...} shape
     */
    private static String nextListUrl(String currentUrl) {
        if (currentUrl == null || !currentUrl.startsWith(LIST_URL_PREFIX)) {
            return null;
        }
        int numberStart = LIST_URL_PREFIX.length();
        int numberEnd = currentUrl.indexOf(".html", numberStart);
        if (numberEnd < 0) {
            return null;
        }
        try {
            int currentPage = Integer.parseInt(currentUrl.substring(numberStart, numberEnd));
            return LIST_URL_PREFIX + (currentPage + 1) + LIST_URL_SUFFIX;
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Parses a job-detail page into a {@link JobInfo} and stores it under {@code "jobInfo"}.
     * Pages that lack the expected detail container or header line are left without a result.
     *
     * @param page the detail page being processed
     */
    private void saveJobInfo(Page page) {
        Selectable detailInfo = page.getHtml().css("div.tCompany_center");
        if (!detailInfo.match()) {
            return;
        }
        // Markup of the header element carrying a title attribute; company address and
        // publish date are cut out of this string below.
        String header = detailInfo.css(".ltype[title]").toString();
        if (header == null) {
            return;
        }

        JobInfo jobInfo = new JobInfo();
        jobInfo.setCompanyName(detailInfo.css("p.cname a", "text").toString());
        jobInfo.setCompanyAddr(
                trimOrNull(slice(header, header.lastIndexOf("=") + 1, header.indexOf("&"))));
        jobInfo.setCompanyInfo(
                trimOrNull(detailInfo.css("div.tBorderTop_box div.tmsg", "text").toString()));
        jobInfo.setJobAddr(detailInfo.css("div.tBorderTop_box div.bmsg>p.fp", "text").toString());

        // The description mixes several kinds of tags; Jsoup's text() collects all of their text.
        String jobHtml = detailInfo.css("div.job_msg").toString();
        jobInfo.setJobInfo(jobHtml == null ? null : Jsoup.parse(jobHtml).text());

        jobInfo.setJobName(detailInfo.css("div.cn h1", "text").toString());
        jobInfo.setUrl(page.getUrl().toString());

        // Some postings carry no salary; default both bounds to zero in that case.
        String salaryText = detailInfo.css("div.cn strong", "text").toString();
        Integer[] salary = StringUtils.hasLength(salaryText) ? MathSalary.getSalary(salaryText) : null;
        if (salary != null && salary.length >= 2) {
            jobInfo.setSalaryMin(salary[0]);
            jobInfo.setSalaryMax(salary[1]);
        } else {
            jobInfo.setSalaryMin(0);
            jobInfo.setSalaryMax(0);
        }

        // The five characters before the last "发" are taken as the month-day part and
        // prefixed with the current year.
        // NOTE(review): a posting from the previous year would get the wrong year - confirm.
        int publishMark = header.lastIndexOf("发");
        String monthDay = slice(header, publishMark - 5, publishMark);
        if (monthDay != null) {
            jobInfo.setTime(LocalDate.now().getYear() + "-" + monthDay);
        }

        page.putField("jobInfo", jobInfo);
    }

    /** Returns {@code text.substring(begin, end)}, or {@code null} when the range is invalid. */
    private static String slice(String text, int begin, int end) {
        if (begin < 0 || end < begin || end > text.length()) {
            return null;
        }
        return text.substring(begin, end);
    }

    /** Trims {@code text}, passing {@code null} through unchanged. */
    private static String trimOrNull(String text) {
        return text == null ? null : text.trim();
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one complete crawl. Scheduled 1 s after start-up and then again 100 s after the
     * previous crawl has finished; {@code run()} blocks until the spider is done.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
    public void process() {
        Spider.create(this)
                // Custom pipeline that persists results to the database.
                .addPipeline(jobInfoPipeline)
                // Seed URL.
                .addUrl(url)
                // 10 worker threads.
                .thread(10)
                // In-memory queue with Bloom-filter de-duplication, constructed with 100000.
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .run();
    }
}