本程序将展示使用Jsoup爬取51job招聘信息的示例,只是用于对Jsoup的学习,不会做其他使用
1. 新建一个springboot项目
添加Jsoup的依赖,以及mysql和mybatis的依赖,其中后面的依赖用于将爬取到的数据存入mysql数据库中
<!-- NOTE(review): the XML tags were stripped when this post was published;
     reconstructed below from the visible groupId/artifactId/version triples. -->

<!-- Spring MVC + embedded web server -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>

<!-- Jsoup: fetches and parses the HTML pages (the crawler core) -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<!-- tk.mybatis generic mapper: CRUD (incl. insertList) without hand-written SQL -->
<dependency>
    <groupId>tk.mybatis</groupId>
    <artifactId>mapper-spring-boot-starter</artifactId>
    <version>2.0.4</version>
</dependency>

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
    <version>2.2.1.RELEASE</version>
</dependency>

<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.0.1</version>
</dependency>

<!-- MySQL JDBC driver (5.x matches com.mysql.jdbc.Driver used in application.yml) -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.48</version>
</dependency>

<!-- Druid connection pool (type configured in application.yml) -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.1</version>
</dependency>
2. 配置文件application.yml
主要配置数据库的连接字符串信息
server:
  port: 7999
spring:
  servlet:
    multipart:
      max-request-size: 100MB # 最大请求文件的大小
      max-file-size: 20MB     # 设置单个文件最大长度
  http:
    encoding:
      charset: utf-8
      force: true
      enabled: true
  datasource:
    platform: mysql
    type: com.alibaba.druid.pool.DruidDataSource
    # Druid pool sizing / eviction settings
    initialSize: 5
    minIdle: 3
    maxActive: 500
    maxWait: 60000
    timeBetweenEvictionRunsMillis: 60000
    minEvictableIdleTimeMillis: 30000
    validationQuery: select 1
    testOnBorrow: true
    poolPreparedStatements: true
    maxPoolPreparedStatementPerConnectionSize: 20
    driverClassName: com.mysql.jdbc.Driver
    url: jdbc:mysql://localhost:3306/job?serverTimezone=UTC&useSSL=false&useUnicode=true&characterEncoding=utf-8&useAffectedRows=true&rewriteBatchedStatements=true
    username: root
    password: root
3. springboot启动类
额外添加了对mybatis mapper的扫描
1 packagecom.devin.jobsearch;2
3 importorg.springframework.boot.SpringApplication;4 importorg.springframework.boot.autoconfigure.SpringBootApplication;5 importtk.mybatis.spring.annotation.MapperScan;6
7
8 @MapperScan("com.devin.jobsearch.mapper")9 @SpringBootApplication10 public classJobSearchApplication {11
12 public static voidmain(String[] args) {13 SpringApplication.run(JobSearchApplication.class, args);14 }15
16 }
4. 数据库表和对应的model
-- One row per crawled 51job posting; job_id is parsed out of the posting URL.
CREATE TABLE `job` (
  `job_id`              varchar(128)  NOT NULL,
  `job_name`            varchar(512)  DEFAULT NULL,
  `job_detail`          text,
  `job_company_name`    varchar(512)  DEFAULT NULL,
  `job_company_image`   varchar(512)  DEFAULT NULL,
  `job_company_desc`    text,
  `job_company_url`     varchar(512)  DEFAULT NULL,
  `job_url`             varchar(512)  DEFAULT NULL,
  `job_location`        varchar(128)  DEFAULT NULL,
  `job_location_detail` varchar(4000) DEFAULT NULL,
  `job_salary`          varchar(512)  DEFAULT NULL,
  `job_date`            varchar(128)  DEFAULT NULL,
  `job_restrict_str`    varchar(512)  DEFAULT NULL,
  PRIMARY KEY (`job_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
1 packagecom.devin.jobsearch.model;2
3 importjavax.persistence.Table;4
5 /**
6 *@authorDevin Zhang7 * @className JobModel8 * @description TODO9 * @date 2020/4/22 9:4210 */
11 @Table(name = "job")12 public classJobModel {13 privateString jobId;14 privateString jobName;15 privateString jobDetail;16 privateString jobCompanyName;17 privateString jobCompanyImage;18 privateString jobCompanyDesc;19 privateString jobCompanyUrl;20 privateString jobUrl;21 privateString jobLocation;22 privateString jobLocationDetail;23 privateString jobSalary;24 privateString jobDate;25 privateString jobRestrictStr;26
27 publicString getJobId() {28 returnjobId;29 }30
31 public voidsetJobId(String jobId) {32 this.jobId =jobId;33 }34
35 publicString getJobName() {36 returnjobName;37 }38
39 public voidsetJobName(String jobName) {40 this.jobName =jobName;41 }42
43 publicString getJobDetail() {44 returnjobDetail;45 }46
47 public voidsetJobDetail(String jobDetail) {48 this.jobDetail =jobDetail;49 }50
51 publicString getJobCompanyName() {52 returnjobCompanyName;53 }54
55 public voidsetJobCompanyName(String jobCompanyName) {56 this.jobCompanyName =jobCompanyName;57 }58
59 publicString getJobCompanyDesc() {60 returnjobCompanyDesc;61 }62
63 public voidsetJobCompanyDesc(String jobCompanyDesc) {64 this.jobCompanyDesc =jobCompanyDesc;65 }66
67 publicString getJobCompanyUrl() {68 returnjobCompanyUrl;69 }70
71 public voidsetJobCompanyUrl(String jobCompanyUrl) {72 this.jobCompanyUrl =jobCompanyUrl;73 }74
75 publicString getJobUrl() {76 returnjobUrl;77 }78
79 public voidsetJobUrl(String jobUrl) {80 this.jobUrl =jobUrl;81 }82
83 publicString getJobLocation() {84 returnjobLocation;85 }86
87 public voidsetJobLocation(String jobLocation) {88 this.jobLocation =jobLocation;89 }90
91 publicString getJobLocationDetail() {92 returnjobLocationDetail;93 }94
95 public voidsetJobLocationDetail(String jobLocationDetail) {96 this.jobLocationDetail =jobLocationDetail;97 }98
99 publicString getJobSalary() {100 returnjobSalary;101 }102
103 public voidsetJobSalary(String jobSalary) {104 this.jobSalary =jobSalary;105 }106
107 publicString getJobDate() {108 returnjobDate;109 }110
111 public voidsetJobDate(String jobDate) {112 this.jobDate =jobDate;113 }114
115 publicString getJobCompanyImage() {116 returnjobCompanyImage;117 }118
119 public voidsetJobCompanyImage(String jobCompanyImage) {120 this.jobCompanyImage =jobCompanyImage;121 }122
123 publicString getJobRestrictStr() {124 returnjobRestrictStr;125 }126
127 public voidsetJobRestrictStr(String jobRestrictStr) {128 this.jobRestrictStr =jobRestrictStr;129 }130
131 @Override132 publicString toString() {133 return "JobModel{" +
134 "jobId='" + jobId + '\'' +
135 ", jobName='" + jobName + '\'' +
136 ", jobDetail='" + jobDetail + '\'' +
137 ", jobCompanyName='" + jobCompanyName + '\'' +
138 ", jobCompanyImage='" + jobCompanyImage + '\'' +
139 ", jobCompanyDesc='" + jobCompanyDesc + '\'' +
140 ", jobCompanyUrl='" + jobCompanyUrl + '\'' +
141 ", jobUrl='" + jobUrl + '\'' +
142 ", jobLocation='" + jobLocation + '\'' +
143 ", jobLocatonDetail='" + jobLocationDetail + '\'' +
144 ", jobSalary='" + jobSalary + '\'' +
145 ", jobDate='" + jobDate + '\'' +
146 ", jobRestrictStr='" + jobRestrictStr + '\'' +
147 '}';148 }149 }
5. 数据库操作mapper类 和 mapper配置文件
因为使用了tkmybatis,所以mapper类 和 mapper配置文件中不需要额外添加任何代码和配置
JobMapper.java
1 packagecom.devin.jobsearch.mapper;2
3 importcom.devin.jobsearch.model.JobModel;4 importtk.mybatis.mapper.common.Mapper;5 importtk.mybatis.mapper.common.MySqlMapper;6
7 /**
8 *@authorDevin Zhang9 * @className JobMapper10 * @description TODO11 * @date 2020/4/22 16:2412 */
13
14 public interface JobMapper extends Mapper, MySqlMapper{15 }
JobMapper.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- NOTE(review): body was stripped from the published post; reconstructed as the
     conventional empty tk.mybatis mapper file — all SQL is generated generically. -->
<mapper namespace="com.devin.jobsearch.mapper.JobMapper">
</mapper>
6. 工具类
SearchUtil.java 主要用于Jsoup加载url返回一个Document对象
1 packagecom.devin.jobsearch.util;2
3
4 importorg.jsoup.Jsoup;5 importorg.jsoup.nodes.Document;6 importorg.springframework.stereotype.Component;7
8 /**
9 *@authorDevin Zhang10 * @className SearchUtil11 * @description TODO12 * @date 2020/4/22 10:0713 */
14
15 @Component16 public classSearchUtil {17
18 public Document getDocument(String url) throwsException {19 Document document =Jsoup20 .connect(url)21 .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")22 .get();23 returndocument;24 }25 }
FileUtil.java 主要用于记录错误信息
1 packagecom.devin.jobsearch.util;2
3 importorg.springframework.stereotype.Component;4
5 importjava.io.BufferedWriter;6 importjava.io.File;7 importjava.io.FileOutputStream;8 importjava.io.OutputStreamWriter;9
10 /**
11 *@authorDevin Zhang12 * @className FileUtil13 * @description TODO14 * @date 2020/4/22 17:5615 */
16 @Component17 public classFileUtil {18
19 private String filePath = "D:\\data\\job\\fail.log";20
21 public voidwriteLog(String log) {22 BufferedWriter bw = null;23 try{24 bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath), true)));25 bw.write(log);26 bw.flush();27 } catch(Exception e) {28 e.printStackTrace();29 } finally{30 try{31 bw.close();32 } catch(Exception e) {33 e.printStackTrace();34 }35 }36 }37 }
7. 爬虫的逻辑处理
我们打开51job的搜索页,分别查看首页,第2页,第3页,可以看到变化只是在访问页码的参数上有变化,所以我们可以循环去爬取整个的职位信息
首页
https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
第二页
https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
第三页
https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
代码实现:
首先创建一个接口,接口中定义了两个方法,一个用于获取总的页数,一个用于循环去爬取数据
爬虫的方法实现了该接口,后续如果我们要爬取其他网站,只需要实现该接口,编写逻辑即可
IJobHandle.java
1 packagecom.devin.jobsearch.Service;2
3 /**
4 *@authorDevin Zhang5 * @className IJobHandle6 * @description TODO7 * @date 2020/4/22 16:398 */
9
10 public interfaceIJobHandle {11
12 int getJobPage() throwsException;13
14 void handle() throwsException;15 }
Job51SearchHandle.java
1 packagecom.devin.jobsearch.Service;2
3 importcom.devin.jobsearch.mapper.JobMapper;4 importcom.devin.jobsearch.model.JobModel;5 importcom.devin.jobsearch.util.FileUtil;6 importcom.devin.jobsearch.util.SearchUtil;7 importorg.jsoup.nodes.Document;8 importorg.jsoup.nodes.Element;9 importorg.jsoup.select.Elements;10 importorg.springframework.stereotype.Service;11 importorg.springframework.util.CollectionUtils;12 importorg.springframework.util.StringUtils;13
14 importjavax.annotation.Resource;15 importjava.util.ArrayList;16 importjava.util.List;17 importjava.util.regex.Matcher;18 importjava.util.regex.Pattern;19
20 /**
21 *@authorDevin Zhang22 * @className Job51SearchHandle23 * @description TODO24 * @date 2020/4/21 16:4125 */
26
27 @Service28 public class Job51SearchHandle implementsIJobHandle {29
30 @Resource31 privateSearchUtil searchUtil;32 @Resource33 privateJobMapper jobMapper;34 @Resource35 privateFileUtil fileUtil;36
37 private static final String PAGEPATTERN = "pagePattern";38 private static final String JOB51URL = "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,pagePattern.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";39
40
41 /**
42 * 获取51job总共有多少页43 *44 *@return
45 *@throwsException46 */
47 @Override48 public int getJobPage() throwsException {49 String url =JOB51URL;50 url = url.replace(PAGEPATTERN, "1");51 Document document =searchUtil.getDocument(url);52 return Integer.parseInt(document.getElementById("hidTotalPage").val());53 }54
55
56 /**
57 * 分页爬取51job58 */
59 @Override60 public void handle() throwsException {61 //目标地址
62 int pageTotal = this.getJobPage();63 List jobModelList = null;64 for (int page = 1; page <= pageTotal; page++) {65 try{66 jobModelList = newArrayList();67 System.out.println("开始爬取第:" + page + "页的数据");68 String url =JOB51URL;69 url = url.replace(PAGEPATTERN, page + "");70 Document document =searchUtil.getDocument(url);71 //右侧导航栏
72 Elements nav_com = document.getElementsByClass("el");73 for(Element element : nav_com) {74 if (element.children().first().tagName("p").hasClass("t1") &&
75 element.children().first().tagName("p").children().hasClass("check")) {76
77 String jobName = element.children().first().tagName("p").children().tagName("span").text();78 String jobUrl = element.children().first().tagName("p").child(2).child(0).attr("href");79 String companyName = element.child(1).text();80 String companyUrl = element.child(1).child(0).attr("href");81 String jobLocation = element.child(2).text();82 String jobSalary = element.child(3).text();83 String jobDate = element.child(4).text();84
85 JobModel jobModel = newJobModel();86 jobModel.setJobName(jobName);87 jobModel.setJobUrl(jobUrl);88 jobModel.setJobCompanyName(companyName);89 jobModel.setJobCompanyUrl(companyUrl);90 jobModel.setJobLocation(jobLocation);91 jobModel.setJobSalary(jobSalary);92 jobModel.setJobDate(jobDate);93
94 //爬取明细
95 Document detailDocument =searchUtil.getDocument(jobUrl);96
97 String jobRestrict = detailDocument.getElementsByClass("msg ltype").text();98 String jobDesc = detailDocument.getElementsByClass("bmsg job_msg inbox").text();99 String jobLocationDetail = "";100 if (detailDocument.getElementsByClass("bmsg inbox").size() > 0) {101 jobLocationDetail = detailDocument.getElementsByClass("bmsg inbox").first().child(0).text();102 }103 String companyDesc = detailDocument.getElementsByClass("tmsg inbox").text();104 String companyImage = "";105 if (detailDocument.getElementsByClass("com_name himg").size() > 0) {106 companyImage = detailDocument.getElementsByClass("com_name himg").first().child(0).attr("src");107 }108
109
110 jobModel.setJobRestrictStr(jobRestrict);111 jobModel.setJobDetail(jobDesc);112 jobModel.setJobLocationDetail(jobLocationDetail);113 jobModel.setJobCompanyDesc(companyDesc);114 jobModel.setJobCompanyImage(companyImage);115
116 String jobId = "";117 String patternStr = "/[0-9]*.html";118 Pattern pattern =Pattern.compile(patternStr);119 Matcher matcher =pattern.matcher(jobUrl);120 if(matcher.find()) {121 jobId =matcher.group();122 jobId = jobId.replaceAll(".html", "").replaceAll("/","");123 }124 if(StringUtils.isEmpty(jobId)) {125 patternStr = "jobid=[0-9]*";126 pattern =Pattern.compile(patternStr);127 matcher =pattern.matcher(jobUrl);128 if(matcher.find()) {129 jobId =matcher.group();130 jobId = jobId.replaceAll("jobid=", "");131 }132 }133 if(StringUtils.isEmpty(jobId)) {134 patternStr = "#[0-9]*";135 pattern =Pattern.compile(patternStr);136 matcher =pattern.matcher(jobUrl);137 if(matcher.find()) {138 jobId =matcher.group();139 jobId = jobId.replaceAll("#", "");140 }141 }142 jobModel.setJobId(jobId);143 System.out.println(jobModel);144 jobModelList.add(jobModel);145 }146 }147 if (!CollectionUtils.isEmpty(jobModelList)) {148 System.out.println("第" + page + "页数据,开始插入数据");149 jobMapper.insertList(jobModelList);150 }151 Thread.sleep(3000); //sleep 3 秒,防止访问太频繁,被禁掉152 } catch(Exception e) {153 e.printStackTrace();154 if (null !=jobModelList) {155 fileUtil.writeLog(jobModelList.toString());156 }157 }158 }159 }160
161 }
8. 调用
我们新建一个controller,访问时直接调用即可
1 packagecom.devin.jobsearch.controller;2
3 importcom.devin.jobsearch.Service.Job51SearchHandle;4 importorg.springframework.web.bind.annotation.GetMapping;5 importorg.springframework.web.bind.annotation.RequestMapping;6 importorg.springframework.web.bind.annotation.RestController;7
8 importjavax.annotation.Resource;9
10 /**
11 *@authorDevin Zhang12 * @className JobController13 * @description TODO14 * @date 2020/4/22 16:3615 */
16 @RestController17 @RequestMapping("/job")18 public classJobController {19
20 @Resource21 privateJob51SearchHandle job51SearchHandle;22
23 @GetMapping("/51jobHandle")24 public String handle51JobController() throwsException {25 job51SearchHandle.handle();26 return "success";27 }28 }
访问: localhost:7999/job/51jobHandle 即可触发爬取,可以看到爬取到的数据已经存到数据库了
最后,再次说明,本文章的目的只是为了学习Jsoup,爬取到的数据也不会用作其他使用,毕竟爬虫爬的好,牢饭吃的饱。