java爬虫之WebMagic实战抓取前程无忧招聘信息

webmagic教程

http://webmagic.io/docs/zh/

入门案例

package com.hikktn.webmagic;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;


/**
 * @ClassName JobProcessor
 * @Description TODO
 * @Author lisonglin
 * @Date 2021/5/4 12:43
 * @Version 1.0
 */
public class JobProcessor implements PageProcessor {

	public void process(Page page) {
		System.out.println(page.getHtml());
		// CSS选择器
		page.putField("author", page.getHtml().css("div.b-wrap>a").all());

		// XPath选择器
		page.putField("div", page.getHtml().xpath("//div[@id=app]/div/div/div[@class=b-wrap]/div/div[@id" +
				"=primaryPageTab]/ul/li/a/span/text()"));
		// 正则表达式
		page.putField("div3", page.getHtml().css("div.b-wrap>a").regex(".*年轻人.*").all());

		// 处理结果API 返回一条数据
		page.putField("div4", page.getHtml().css("div.b-wrap>a").regex(".*年轻人.*").get());
		page.putField("div5", page.getHtml().css("div.b-wrap>a").regex(".*年轻人.*").toString());

		// 获取链接 全部
		page.addTargetRequests(page.getHtml().css("div.b-wrap").links().all());
		// 获取上方查询出的url里面的元素
		page.putField("url",page.getHtml().css("div.nav-search from input").all());

		// 抓取链接
		page.addTargetRequest("https://jobs.51job.com/chongqing-jlpq/123700142.html?s=sou_sou_soulb&t=0");

	}

	private Site site = Site.me()
			.setCharset("utf8")    // 设置编码
			.setTimeOut(10000)   // 设置超时时间,单位是ms毫秒
			.setRetrySleepTime(3000)  // 设置重试的间隔时间
			.setSleepTime(3);      // 设置重试次数

	public Site getSite() {
		return site;
	}

	public static void main(String[] args) {
		// Spider.create(new JobProcessor())
		// 		//初始访问url地址
		// 		.addUrl("https://www.bilibili.com/").run();

		Spider spider = Spider.create(new JobProcessor())
				.addUrl("https://www.bilibili.com/")  //设置爬取数据的页面
				//.addPipeline(new FilePipeline("C:\\Users\\tree\\Desktop\\result"))  // 保存到文件中
				.thread(5)  // 开启5个线程
				.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));//设置布隆去重过滤器,指定最多对1000万数据进行去重操作

		// Scheduler scheduler = spider.getScheduler();

		//执行爬虫
		spider.run();
	}
}

准备

打开前程无忧网站,根据关键词搜索,选择你想要的招聘信息。

https://search.51job.com/list/060000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

我们需要的是这些招聘信息

我们选择抓取的关键信息

按照抽取的页面信息,创建数据库

/*
 Navicat Premium Data Transfer

 Source Server         : localhost
 Source Server Type    : MySQL
 Source Server Version : 50723
 Source Host           : localhost:3306
 Source Schema         : test

 Target Server Type    : MySQL
 Target Server Version : 50723
 File Encoding         : 65001

 Date: 06/05/2021 14:03:34
*/

-- Set the connection character set to utf8mb4 for the statements below.
SET NAMES utf8mb4;
-- Disable foreign-key checks while the table is dropped and recreated; they are re-enabled at the end.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for job_info
-- ----------------------------
-- One row per crawled job posting. Meaning of the column COMMENT strings, in order:
--   id            primary key id
--   company_name  company name
--   company_addr  company contact information
--   company_info  company description
--   job_name      job title
--   job_addr      work location
--   job_info      job description
--   salary_min    salary range, lower bound
--   salary_max    salary range, upper bound
--   technology    key technologies
--   url           URL of the posting's detail page
--   time          most recent publish time of the posting
-- WARNING: the DROP below discards any existing job_info table and its data.
DROP TABLE IF EXISTS `job_info`;
CREATE TABLE `job_info`  (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `company_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '公司名称',
  `company_addr` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '公司联系方式',
  `company_info` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '公司信息',
  `job_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '职位名称',
  `job_addr` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '工作地点',
  `job_info` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '职位信息',
  `salary_min` int(10) NULL DEFAULT NULL COMMENT '薪资范围,最小',
  `salary_max` int(10) NULL DEFAULT NULL COMMENT '薪资范围,最大',
  `technology` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '关键技术点',
  `url` varchar(150) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '招聘信息详情页',
  `time` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '职位最近发布时间',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 532 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '招聘信息' ROW_FORMAT = Dynamic;

-- Restore foreign-key checking.
SET FOREIGN_KEY_CHECKS = 1;

看起来一切都非常顺利,可惜的是,该网站的搜索结果页是直接用JavaScript渲染的(职位数据以JSON的形式内嵌在页面的<script>标签中),致使我们抓取数据的时候比较麻烦。

转换一下格式

上面框起来的就是我们需要解析的数据

你看下面的链接,就是下一页的请求链接。

本来打算先定义bean对象,再把JSON转换为bean对象,但是太麻烦了,所以还是直接转换为JSONObject对象;没想到它内置了按key直接获取value的方法,那就很好办了。

前面走的弯路,不需要再走了。

我的分析结束了,现在我们开始吧。

开始

pom

    <dependencies>
    
        <!-- Spring MVC (web starter). No version is given for the Spring Boot starters and the MySQL
             driver; presumably a parent POM / BOM outside this fragment manages them - confirm. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
    
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
    
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
    
        <!-- WebMagic core; its slf4j-log4j12 binding is excluded here -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.4</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- WebMagic extension module -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.4</version>
        </dependency>
        <!-- Guava, needed for WebMagic's Bloom-filter support -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
        <!-- NOTE(review): slf4j-log4j12 is excluded from webmagic-core above but declared directly
             here; confirm that two SLF4J bindings on the classpath are really intended. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.24</version>
        </dependency>
        <!-- NOTE(review): junit is declared without a test scope - confirm whether that is intended. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <!--<dependency>-->
            <!--<groupId>org.jsoup</groupId>-->
            <!--<artifactId>jsoup</artifactId>-->
            <!--<version>1.13.1</version>-->
        <!--</dependency>-->
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.75</version>
        </dependency>


    </dependencies>

数据库相关

package com.hikktn.pojo;

import javax.persistence.*;
import java.util.Objects;

/**
 * JPA entity mapped to the {@code job_info} table; one instance per crawled job posting.
 *
 * <p>All mapping annotations live on the getters (property access). {@code @Id} was already on
 * {@link #getId()}, so the {@code @GeneratedValue} that used to sit on the {@code id} field was not
 * in the place the provider reads mappings from; it is now declared next to {@code @Id}.
 *
 * @author lisonglin
 */
@Entity
@Table(name = "job_info", schema = "test")
public class JobInfoEntity {
	// Primary key id
	private long id;
	// Company name
	private String companyName;
	// Company contact information
	private String companyAddr;
	// Company description
	private String companyInfo;
	// Job title
	private String jobName;
	// Work location
	private String jobAddr;
	// Job description
	private String jobInfo;
	// Salary range, lower bound
	private Integer salaryMin;
	// Salary range, upper bound
	private Integer salaryMax;
	// Key technologies
	private String technology;
	// URL of the posting's detail page
	private String url;
	// Most recent publish time of the posting
	private String time;

	/**
	 * @return the database-generated primary key; {@code 0} while the entity is not yet persisted
	 */
	@Id
	@GeneratedValue(strategy = GenerationType.IDENTITY)
	@Column(name = "id", nullable = false)
	public long getId() {
		return id;
	}

	public void setId(long id) {
		this.id = id;
	}

	@Basic
	@Column(name = "company_name", nullable = true, length = 100)
	public String getCompanyName() {
		return companyName;
	}

	public void setCompanyName(String companyName) {
		this.companyName = companyName;
	}

	@Basic
	@Column(name = "company_addr", nullable = true, length = 200)
	public String getCompanyAddr() {
		return companyAddr;
	}

	public void setCompanyAddr(String companyAddr) {
		this.companyAddr = companyAddr;
	}

	@Basic
	@Column(name = "company_info", nullable = true, length = -1)
	public String getCompanyInfo() {
		return companyInfo;
	}

	public void setCompanyInfo(String companyInfo) {
		this.companyInfo = companyInfo;
	}

	@Basic
	@Column(name = "job_name", nullable = true, length = 100)
	public String getJobName() {
		return jobName;
	}

	public void setJobName(String jobName) {
		this.jobName = jobName;
	}

	@Basic
	@Column(name = "job_addr", nullable = true, length = 50)
	public String getJobAddr() {
		return jobAddr;
	}

	public void setJobAddr(String jobAddr) {
		this.jobAddr = jobAddr;
	}

	@Basic
	@Column(name = "job_info", nullable = true, length = -1)
	public String getJobInfo() {
		return jobInfo;
	}

	public void setJobInfo(String jobInfo) {
		this.jobInfo = jobInfo;
	}

	@Basic
	@Column(name = "salary_min", nullable = true)
	public Integer getSalaryMin() {
		return salaryMin;
	}

	public void setSalaryMin(Integer salaryMin) {
		this.salaryMin = salaryMin;
	}

	@Basic
	@Column(name = "salary_max", nullable = true)
	public Integer getSalaryMax() {
		return salaryMax;
	}

	public void setSalaryMax(Integer salaryMax) {
		this.salaryMax = salaryMax;
	}

	@Basic
	@Column(name = "technology", nullable = true, length = 200)
	public String getTechnology() {
		return technology;
	}

	public void setTechnology(String technology) {
		this.technology = technology;
	}

	@Basic
	@Column(name = "url", nullable = true, length = 150)
	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	@Basic
	@Column(name = "time", nullable = true, length = 10)
	public String getTime() {
		return time;
	}

	public void setTime(String time) {
		this.time = time;
	}

	/**
	 * Two entities are equal when they are of the same class and every field, including the id,
	 * is equal.
	 */
	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}
		if (o == null || getClass() != o.getClass()) {
			return false;
		}
		JobInfoEntity that = (JobInfoEntity) o;
		return id == that.id
				&& Objects.equals(companyName, that.companyName)
				&& Objects.equals(companyAddr, that.companyAddr)
				&& Objects.equals(companyInfo, that.companyInfo)
				&& Objects.equals(jobName, that.jobName)
				&& Objects.equals(jobAddr, that.jobAddr)
				&& Objects.equals(jobInfo, that.jobInfo)
				&& Objects.equals(salaryMin, that.salaryMin)
				&& Objects.equals(salaryMax, that.salaryMax)
				&& Objects.equals(technology, that.technology)
				&& Objects.equals(url, that.url)
				&& Objects.equals(time, that.time);
	}

	/**
	 * Hash over the same fields that {@link #equals(Object)} compares.
	 */
	@Override
	public int hashCode() {
		return Objects.hash(id, companyName, companyAddr, companyInfo, jobName, jobAddr, jobInfo, salaryMin, salaryMax
				, technology, url, time);
	}
}
package com.hikktn.dao;

import com.hikktn.pojo.JobInfoEntity;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;

import java.util.List;

/**
 * Spring Data JPA repository for {@link JobInfoEntity} rows of the {@code job_info} table.
 *
 * @author lisonglin
 */
public interface JobInfoDao extends JpaRepository<JobInfoEntity,Long> {

	/**
	 * Loads the stored job postings so that callers can read {@link JobInfoEntity#getTechnology()}.
	 *
	 * <p>Because the result is mapped to {@link JobInfoEntity}, the native query has to return
	 * every mapped column; selecting only {@code technology} cannot be mapped to the entity and
	 * fails when the query runs.
	 *
	 * @return all rows of {@code job_info}
	 */
	@Query(value = "SELECT * FROM job_info",nativeQuery = true)
	public List<JobInfoEntity> findJobTechnology();
}
package com.hikktn.service;

import com.hikktn.pojo.JobInfoEntity;

import java.util.List;

/**
 * Service-layer operations for storing and querying {@link JobInfoEntity} records.
 *
 * @author lisonglin
 */
public interface JobInfoService {

	/**
	 * Saves a job posting.
	 *
	 * @param jobInfoEntity the posting to save
	 */
	void save(JobInfoEntity jobInfoEntity);

	/**
	 * Queries job postings by example.
	 *
	 * @param jobInfoEntity entity whose populated values are used as the search conditions
	 * @return the matching postings
	 */
	List<JobInfoEntity> findJobInfo(JobInfoEntity jobInfoEntity);

	/**
	 * Returns the postings used to read the stored technology keywords.
	 *
	 * @return the list produced by the underlying repository query
	 */
	List<JobInfoEntity> findJobTechnology();
}
package com.hikktn.service.impl;

import com.hikktn.dao.JobInfoDao;
import com.hikktn.pojo.JobInfoEntity;
import com.hikktn.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
 * @ClassName JobInfoServiceImpl
 * @Description TODO
 * @Author lisonglin
 * @Date 2021/5/5 20:43
 * @Version 1.0
 */
@Service
public class JobInfoServiceImpl implements JobInfoService {

	@Autowired
	private JobInfoDao jobInfoDao;

	@Override
	@Transactional
	public void save(JobInfoEntity jobInfoEntity) {
		JobInfoEntity param =new JobInfoEntity();
		param.setUrl(jobInfoEntity.getUrl());
		param.setTime(jobInfoEntity.getTime());
		List<JobInfoEntity> jobInfoList = this.findJobInfo(param);
		if (jobInfoList.size() == 0){
			this.jobInfoDao.saveAndFlush(jobInfoEntity);
		}
	}

	@Override
	public List<JobInfoEntity> findJobInfo(JobInfoEntity jobInfoEntity) {
		Example<JobInfoEntity> jobInfoEntityExample = Example.of(jobInfoEntity);
		List<JobInfoEntity> jobInfoDaoAll = this.jobInfoDao.findAll(jobInfoEntityExample);
		return jobInfoDaoAll;
	}

	public List<JobInfoEntity> findJobTechnology(){
		return this.jobInfoDao.findJobTechnology();
	}
}

爬虫

package com.hikktn.task;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.hikktn.pojo.JobInfoEntity;
import com.hikktn.utils.MathSalary;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic page processor that crawls 51job search results.
 *
 * <p>Search-result ("list") pages embed their data as JSON inside a {@code <script>} element.
 * For such a page the processor queues every posting's detail URL plus the next list page.
 * Any other page is treated as a posting detail page: it is parsed into a {@link JobInfoEntity}
 * and published under the result key {@code "jobInfo"} for the pipeline to persist.
 *
 * @author lisonglin
 */
@Component
public class JobProcessorTask implements PageProcessor {

	/** Number of the first search-result page requested by each scheduled crawl. */
	private static final int PAGE = 1;
	/** Search URL up to (not including) the page number. */
	private static final String SHANG_URL = "https://search.51job.com/list/060000,000000,0000,32%252c01,9,99,java,2,";
	/** Search URL after the page number. */
	private static final String XIA_URL =
			".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03&jobterm" + "=99&companysize=99" +
					"&ord_field=0" + "&dibiaoid=0&line=&welfare=";
	/** Start of the JSON object that a list page embeds in one of its scripts. */
	private static final String SEARCH_RESULT_MARKER = "{\"top_ads\"";
	/** Captures the page number that precedes {@code .html} in a list-page URL. */
	private static final Pattern LIST_PAGE_NUMBER = Pattern.compile(",(\\d+)\\.html");
	/** Runs of Latin letters; used to pick technology keywords out of the job description. */
	private static final Pattern LATIN_WORD = Pattern.compile("[a-zA-Z]+");

	private final Site site = Site.me().setCharset("gbk") // response charset
			.setTimeOut(10 * 1000)    // download timeout in milliseconds
			.setRetrySleepTime(3000)  // pause between retries in milliseconds
			.setRetryTimes(3);        // number of retries

	@Autowired
	private SpringDataPipeline springDataPipeline;

	/**
	 * Processes one downloaded page, either a search-result page or a posting detail page.
	 *
	 * @param page the downloaded page
	 */
	@Override
	public void process(Page page) {
		String searchResultJson = findSearchResultJson(page);
		if (searchResultJson == null) {
			// No embedded search result: this is a posting detail page.
			saveJobInfo(page);
			return;
		}

		JSONObject searchResult = JSONObject.parseObject(searchResultJson);

		// Only the detail URLs are taken from the list page; all other values are read from the
		// detail pages themselves.
		JSONArray postings = searchResult.getJSONArray("engine_search_result");
		if (postings != null) {
			for (int i = 0; i < postings.size(); i++) {
				String jobHref = postings.getJSONObject(i).getString("job_href");
				if (jobHref != null && !jobHref.isEmpty()) {
					page.addTargetRequest(jobHref);
				}
			}
		}

		// Queue the following list page. The number is derived from the URL of the page being
		// processed; using the constant PAGE here would stop the crawl after page 2.
		int totalPages = searchResult.getIntValue("total_page");
		int currentPage = currentListPage(page.getUrl().toString());
		if (currentPage > 0 && currentPage < totalPages) {
			page.addTargetRequest(SHANG_URL + (currentPage + 1) + XIA_URL);
		}
	}

	/**
	 * Looks through all {@code <script>} elements for the embedded search result instead of
	 * relying on a fixed script index, which throws on pages that contain fewer scripts.
	 *
	 * @return the script text from the marker to its end, or {@code null} if no script has it
	 */
	private static String findSearchResultJson(Page page) {
		Document document = page.getHtml().getDocument();
		for (Element script : document.getElementsByTag("script")) {
			String data = script.data();
			int start = data.indexOf(SEARCH_RESULT_MARKER);
			if (start != -1) {
				return data.substring(start);
			}
		}
		return null;
	}

	/**
	 * @return the page number contained in a list-page URL, or {@code -1} if there is none
	 */
	private static int currentListPage(String url) {
		if (url == null) {
			return -1;
		}
		Matcher matcher = LIST_PAGE_NUMBER.matcher(url);
		try {
			return matcher.find() ? Integer.parseInt(matcher.group(1)) : -1;
		} catch (NumberFormatException ignored) {
			// The digits do not fit into an int; treat the URL as "not a list page".
			return -1;
		}
	}

	/**
	 * Parses a posting detail page into a {@link JobInfoEntity} and publishes it as the
	 * {@code "jobInfo"} result field.
	 */
	private void saveJobInfo(Page page) {
		JobInfoEntity jobInfoEntity = new JobInfoEntity();
		Html html = page.getHtml();

		// Company name
		jobInfoEntity.setCompanyName(html.css("div.cn p.cname a","text").toString());

		// Company contact information: text of the second div.bmsg block without its last two
		// characters (presumably a trailing link label - TODO confirm against the live page).
		List<String> messageBlocks = html.css("div.bmsg").all();
		if (messageBlocks.size() > 1) {
			String contact = plainText(messageBlocks.get(1));
			if (contact.length() >= 2) {
				jobInfoEntity.setCompanyAddr(contact.substring(0, contact.length() - 2));
			}
		}

		// Company description
		jobInfoEntity.setCompanyInfo(plainText(html.css("div.tmsg").toString()));
		// Job title
		jobInfoEntity.setJobName(html.css("div.cn h1","text").toString());
		// Work location
		jobInfoEntity.setJobAddr(html.css("div.cn p.ltype","text").regex(".*区").toString());

		// Job description and the technology keywords derived from it.
		// NOTE(review): the technology column is mapped with length 200; long keyword lists may
		// exceed it - confirm how that should be handled.
		String jobDescription = plainText(html.css("div.job_msg").toString());
		jobInfoEntity.setJobInfo(jobDescription);
		jobInfoEntity.setTechnology(extractTechnologies(jobDescription));

		// Salary: MathSalary returns {lower bound, upper bound}.
		String salaryText = html.css("div.cn strong", "text").toString();
		if (salaryText != null && !salaryText.trim().isEmpty()) {
			Integer[] salary = MathSalary.getSalary(salaryText.trim());
			jobInfoEntity.setSalaryMin(salary[0]);
			jobInfoEntity.setSalaryMax(salary[1]);
		}

		// Publish time: the text after the last '|' without the trailing two-character "发布".
		String published = plainText(html.css("div.cn p.ltype").regex(".*发布").toString());
		int from = published.lastIndexOf("|") + 1;
		int to = published.length() - 2;
		if (to >= from) {
			jobInfoEntity.setTime(published.substring(from, to));
		}

		// URL of this detail page
		jobInfoEntity.setUrl(page.getUrl().toString());

		page.putField("jobInfo", jobInfoEntity);
	}

	/**
	 * @return the text content of an HTML fragment, or an empty string when the fragment is
	 *         {@code null} (i.e. the selector matched nothing)
	 */
	private static String plainText(String htmlFragment) {
		return htmlFragment == null ? "" : Jsoup.parse(htmlFragment).text();
	}

	/**
	 * Collects the distinct runs of Latin letters in first-seen order.
	 *
	 * @return the keywords in list notation, e.g. {@code [Java, Spring]}; {@code []} if none
	 */
	private static String extractTechnologies(String jobDescription) {
		Set<String> keywords = new LinkedHashSet<>();
		Matcher matcher = LATIN_WORD.matcher(jobDescription);
		while (matcher.find()) {
			keywords.add(matcher.group());
		}
		return keywords.toString();
	}

	/**
	 * @return the crawl configuration used by this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Starts a crawl at the first search-result page and blocks until the spider finishes.
	 * Triggered 1 s after start-up and again 100 s after each run ends; this requires scheduling
	 * to be enabled in the Spring context, which is configured outside this class.
	 */
	@Scheduled(initialDelay = 1000, fixedDelay = 1000 * 100)
	public void process() {
		String startUrl = SHANG_URL + PAGE + XIA_URL;
		// Run on this Spring-managed bean rather than on a second, un-injected instance.
		Spider.create(this)
				.addUrl(startUrl)
				.setScheduler(new QueueScheduler()
						.setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))
				.thread(5)
				.addPipeline(this.springDataPipeline)
				.run();
	}
}
package com.hikktn.utils;

/**
 * Converts 51job salary texts into yearly amounts.
 */
public class MathSalary {

    /** Working days assumed per year when a daily rate is converted. */
    private static final int WORK_DAYS_PER_YEAR = 240;
    private static final int MONTHS_PER_YEAR = 12;

    /**
     * Parses a salary text into a yearly salary range.
     *
     * <p>Supported shapes are {@code <amount>[-<amount>][万|千|元][以上|以下]/<年|月|天|日>}, for example
     * {@code "500/天"}, {@code "0.8-1.2万/月"}, {@code "5-8千/月"}, {@code "5-6万/年"} and
     * {@code "1.5万/月"}. A monthly amount is multiplied by 12 and a daily amount by 240.
     * A text with a single amount yields the same value for both bounds.
     *
     * @param salaryStr the salary text; may be {@code null}
     * @return a two-element array {@code {lower bound, upper bound}}; both elements are {@code 0}
     *         when the text is {@code null}, empty or not in a supported shape
     */
    public static Integer[] getSalary(String salaryStr) {
        Integer[] salary = {0, 0};
        if (salaryStr == null) {
            return salary;
        }

        String text = salaryStr.trim();
        int slash = text.lastIndexOf('/');
        if (slash <= 0) {
            return salary;
        }

        int periodFactor = periodFactor(text.substring(slash + 1));
        if (periodFactor == 0) {
            return salary;
        }

        String amount = text.substring(0, slash);
        // Open-ended postings carry a qualifier after the unit; only the number itself is used.
        if (amount.endsWith("以上")) {
            amount = dropSuffix(amount, "以上");
        } else if (amount.endsWith("以下")) {
            amount = dropSuffix(amount, "以下");
        }

        int unitFactor = 1;
        if (amount.endsWith("万")) {
            unitFactor = 10000;
            amount = dropSuffix(amount, "万");
        } else if (amount.endsWith("千")) {
            unitFactor = 1000;
            amount = dropSuffix(amount, "千");
        } else if (amount.endsWith("元")) {
            amount = dropSuffix(amount, "元");
        }

        String[] bounds = amount.split("-");
        if (bounds.length == 0) {
            return salary;
        }
        // A single amount is used as both the lower and the upper bound.
        salary[0] = mathSalary(bounds[0], unitFactor, periodFactor);
        salary[1] = mathSalary(bounds[bounds.length - 1], unitFactor, periodFactor);
        return salary;
    }

    /**
     * @return how many of the given pay periods make up a year, or {@code 0} if the period is
     *         not supported
     */
    private static int periodFactor(String period) {
        if ("年".equals(period)) {
            return 1;
        }
        if ("月".equals(period)) {
            return MONTHS_PER_YEAR;
        }
        if ("天".equals(period) || "日".equals(period)) {
            return WORK_DAYS_PER_YEAR;
        }
        return 0;
    }

    /** Removes {@code suffix} from the end of {@code text}; the caller has checked it is there. */
    private static String dropSuffix(String text, String suffix) {
        return text.substring(0, text.length() - suffix.length());
    }

    /**
     * Scales one amount by its unit, truncates it to a whole number and converts it to a yearly
     * value.
     */
    private static Integer mathSalary(String amount, int unitFactor, int periodFactor) {
        return str2Num(amount, unitFactor) * periodFactor;
    }

    /**
     * Multiplies a decimal string by {@code num} using exact decimal arithmetic.
     *
     * @return the truncated product, or {@code 0} if the string is not a number
     */
    private static int str2Num(String salaryStr, int num) {
        try {
            return new java.math.BigDecimal(salaryStr.trim())
                    .multiply(java.math.BigDecimal.valueOf(num))
                    .intValue();
        } catch (NumberFormatException ignored) {
            // Best effort: an amount that cannot be parsed counts as 0.
            return 0;
        }
    }
}
package com.hikktn.task;

import com.hikktn.pojo.JobInfoEntity;
import com.hikktn.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * WebMagic pipeline that hands crawled job postings to {@link JobInfoService} for saving.
 *
 * @author lisonglin
 */
@Component
public class SpringDataPipeline implements Pipeline {
	@Autowired
	private JobInfoService jobInfoService;

	/**
	 * Saves the entity stored under the result key {@code "jobInfo"}, if there is one.
	 *
	 * @param resultItems the fields extracted for one page
	 * @param task the running crawl task (not used)
	 */
	@Override
	public void process(ResultItems resultItems, Task task) {
		JobInfoEntity posting = resultItems.get("jobInfo");
		if (posting == null) {
			// The page produced no posting, so there is nothing to store.
			return;
		}
		this.jobInfoService.save(posting);
	}
}

最后启动一下spring服务,定时器就会自动执行(注意:启动类上需要添加@EnableScheduling注解,@Scheduled定时任务才会生效)。

搞定!

  • 0
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

hikktn

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值