Crawling Web Content with Jsoup in a Java Program

This tutorial shows how to use the Jsoup library in a Spring Boot project to crawl job postings from 51job and store them in a MySQL database. It covers adding the dependencies, configuring the database connection, creating the table, defining the data model, the database mapper, the Jsoup utility class, logging failures to a file, the paginated crawling logic, and a sample controller that triggers the crawl.

This program demonstrates crawling 51job postings with Jsoup. It is intended purely for learning Jsoup; the data is not used for anything else.

1. Create a new Spring Boot project

Add the Jsoup dependency, plus the MySQL and MyBatis dependencies; the latter are used to store the crawled data in a MySQL database.

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<dependency>
    <groupId>tk.mybatis</groupId>
    <artifactId>mapper-spring-boot-starter</artifactId>
    <version>2.0.4</version>
</dependency>

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
    <version>2.2.1.RELEASE</version>
</dependency>

<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.0.1</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.48</version>
</dependency>

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.1</version>
</dependency>

2. Configuration file: application.yml

It mainly configures the database connection settings.

server:
  port: 7999
spring:
  servlet:
    multipart:
      max-request-size: 100MB # maximum request size
      max-file-size: 20MB     # maximum size of a single file
  http:
    encoding:
      charset: utf-8
      force: true
      enabled: true
  datasource:
    platform: mysql
    type: com.alibaba.druid.pool.DruidDataSource
    initialSize: 5
    minIdle: 3
    maxActive: 500
    maxWait: 60000
    timeBetweenEvictionRunsMillis: 60000
    minEvictableIdleTimeMillis: 30000
    validationQuery: select 1
    testOnBorrow: true
    poolPreparedStatements: true
    maxPoolPreparedStatementPerConnectionSize: 20
    driverClassName: com.mysql.jdbc.Driver
    url: jdbc:mysql://localhost:3306/job?serverTimezone=UTC&useSSL=false&useUnicode=true&characterEncoding=utf-8&useAffectedRows=true&rewriteBatchedStatements=true
    username: root
    password: root

3. Spring Boot application class

It additionally enables scanning of the MyBatis mapper package.

package com.devin.jobsearch;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import tk.mybatis.spring.annotation.MapperScan;

@MapperScan("com.devin.jobsearch.mapper")
@SpringBootApplication
public class JobSearchApplication {

    public static void main(String[] args) {
        SpringApplication.run(JobSearchApplication.class, args);
    }

}

4. Database table and corresponding model

CREATE TABLE `job` (
  `job_id` varchar(128) NOT NULL,
  `job_name` varchar(512) DEFAULT NULL,
  `job_detail` text,
  `job_company_name` varchar(512) DEFAULT NULL,
  `job_company_image` varchar(512) DEFAULT NULL,
  `job_company_desc` text,
  `job_company_url` varchar(512) DEFAULT NULL,
  `job_url` varchar(512) DEFAULT NULL,
  `job_location` varchar(128) DEFAULT NULL,
  `job_location_detail` varchar(4000) DEFAULT NULL,
  `job_salary` varchar(512) DEFAULT NULL,
  `job_date` varchar(128) DEFAULT NULL,
  `job_restrict_str` varchar(512) DEFAULT NULL,
  PRIMARY KEY (`job_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

package com.devin.jobsearch.model;

import javax.persistence.Table;

/**
 * @author Devin Zhang
 * @className JobModel
 * @description TODO
 * @date 2020/4/22 9:42
 */
@Table(name = "job")
public class JobModel {
    private String jobId;
    private String jobName;
    private String jobDetail;
    private String jobCompanyName;
    private String jobCompanyImage;
    private String jobCompanyDesc;
    private String jobCompanyUrl;
    private String jobUrl;
    private String jobLocation;
    private String jobLocationDetail;
    private String jobSalary;
    private String jobDate;
    private String jobRestrictStr;

    public String getJobId() {
        return jobId;
    }

    public void setJobId(String jobId) {
        this.jobId = jobId;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getJobDetail() {
        return jobDetail;
    }

    public void setJobDetail(String jobDetail) {
        this.jobDetail = jobDetail;
    }

    public String getJobCompanyName() {
        return jobCompanyName;
    }

    public void setJobCompanyName(String jobCompanyName) {
        this.jobCompanyName = jobCompanyName;
    }

    public String getJobCompanyDesc() {
        return jobCompanyDesc;
    }

    public void setJobCompanyDesc(String jobCompanyDesc) {
        this.jobCompanyDesc = jobCompanyDesc;
    }

    public String getJobCompanyUrl() {
        return jobCompanyUrl;
    }

    public void setJobCompanyUrl(String jobCompanyUrl) {
        this.jobCompanyUrl = jobCompanyUrl;
    }

    public String getJobUrl() {
        return jobUrl;
    }

    public void setJobUrl(String jobUrl) {
        this.jobUrl = jobUrl;
    }

    public String getJobLocation() {
        return jobLocation;
    }

    public void setJobLocation(String jobLocation) {
        this.jobLocation = jobLocation;
    }

    public String getJobLocationDetail() {
        return jobLocationDetail;
    }

    public void setJobLocationDetail(String jobLocationDetail) {
        this.jobLocationDetail = jobLocationDetail;
    }

    public String getJobSalary() {
        return jobSalary;
    }

    public void setJobSalary(String jobSalary) {
        this.jobSalary = jobSalary;
    }

    public String getJobDate() {
        return jobDate;
    }

    public void setJobDate(String jobDate) {
        this.jobDate = jobDate;
    }

    public String getJobCompanyImage() {
        return jobCompanyImage;
    }

    public void setJobCompanyImage(String jobCompanyImage) {
        this.jobCompanyImage = jobCompanyImage;
    }

    public String getJobRestrictStr() {
        return jobRestrictStr;
    }

    public void setJobRestrictStr(String jobRestrictStr) {
        this.jobRestrictStr = jobRestrictStr;
    }

    @Override
    public String toString() {
        return "JobModel{" +
                "jobId='" + jobId + '\'' +
                ", jobName='" + jobName + '\'' +
                ", jobDetail='" + jobDetail + '\'' +
                ", jobCompanyName='" + jobCompanyName + '\'' +
                ", jobCompanyImage='" + jobCompanyImage + '\'' +
                ", jobCompanyDesc='" + jobCompanyDesc + '\'' +
                ", jobCompanyUrl='" + jobCompanyUrl + '\'' +
                ", jobUrl='" + jobUrl + '\'' +
                ", jobLocation='" + jobLocation + '\'' +
                ", jobLocationDetail='" + jobLocationDetail + '\'' +
                ", jobSalary='" + jobSalary + '\'' +
                ", jobDate='" + jobDate + '\'' +
                ", jobRestrictStr='" + jobRestrictStr + '\'' +
                '}';
    }
}

5. Database mapper class and mapper XML file

Because tk.mybatis is used, no extra code or configuration needs to be added to the mapper class or the mapper XML file.

JobMapper.java

package com.devin.jobsearch.mapper;

import com.devin.jobsearch.model.JobModel;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;

/**
 * @author Devin Zhang
 * @className JobMapper
 * @description TODO
 * @date 2020/4/22 16:24
 */
public interface JobMapper extends Mapper<JobModel>, MySqlMapper<JobModel> {
}

JobMapper.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.devin.jobsearch.mapper.JobMapper">
</mapper>
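Because JobMapper inherits the generic CRUD methods from tk.mybatis, it can be used without writing any SQL. Below is a minimal sketch of what the inherited API looks like in practice; the JobMapperUsageExample class is hypothetical and only illustrates the inherited calls (insertList is the one the crawler uses later).

package com.devin.jobsearch.Service;

import com.devin.jobsearch.mapper.JobMapper;
import com.devin.jobsearch.model.JobModel;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;
import java.util.List;

// Hypothetical example class, only to show the methods inherited from tk.mybatis
@Service
public class JobMapperUsageExample {

    @Resource
    private JobMapper jobMapper;

    public void demo(JobModel job, List<JobModel> batch) {
        jobMapper.insert(job);                       // single-row insert from Mapper<JobModel>
        jobMapper.insertList(batch);                 // batch insert from MySqlMapper<JobModel>, used by the crawler below
        List<JobModel> all = jobMapper.selectAll();  // query all rows, also inherited
        System.out.println(all.size());
    }
}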

6. Utility classes

SearchUtil.java is mainly used to load a URL with Jsoup and return a Document object.

package com.devin.jobsearch.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;

/**
 * @author Devin Zhang
 * @className SearchUtil
 * @description TODO
 * @date 2020/4/22 10:07
 */
@Component
public class SearchUtil {

    public Document getDocument(String url) throws Exception {
        Document document = Jsoup
                .connect(url)
                .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
                .get();
        return document;
    }
}
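As a side note, Jsoup's Connection also supports an explicit timeout, which can help when some detail pages respond slowly. Here is a minimal sketch of a timeout-enabled variant; the class name and the 10-second value are illustrative assumptions, not part of the original project.

package com.devin.jobsearch.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;

// Hypothetical variant of SearchUtil with an explicit timeout, shown only as an option
@Component
public class TimeoutSearchUtil {

    public Document getDocument(String url) throws Exception {
        return Jsoup.connect(url)
                .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
                .timeout(10000) // connect/read timeout in milliseconds; 10 seconds is an arbitrary choice
                .get();
    }
}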

FileUtil.java is mainly used to log error information to a file.

package com.devin.jobsearch.util;

import org.springframework.stereotype.Component;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;

/**
 * @author Devin Zhang
 * @className FileUtil
 * @description TODO
 * @date 2020/4/22 17:56
 */
@Component
public class FileUtil {

    private String filePath = "D:\\data\\job\\fail.log";

    public void writeLog(String log) {
        BufferedWriter bw = null;
        try {
            // open the log file in append mode
            bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath), true)));
            bw.write(log);
            bw.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (bw != null) {
                    bw.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
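The same append-only logging can also be written with try-with-resources and java.nio, which closes the writer automatically. The sketch below is an equivalent alternative under the same hard-coded log path; it is not the article's implementation.

package com.devin.jobsearch.util;

import java.io.BufferedWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

// Hypothetical alternative to FileUtil using java.nio and try-with-resources
public class NioFileUtil {

    private String filePath = "D:\\data\\job\\fail.log";

    public void writeLog(String log) {
        // CREATE makes the file if it is missing, APPEND writes to the end
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(filePath),
                StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {
            bw.write(log);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}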

7. Crawler logic

Open the 51job search page and compare page 1, page 2, and page 3: the only thing that changes in the URL is the page-number parameter, so we can loop over the page numbers to crawl all of the job postings (see the small sketch after the example URLs).

Page 1

https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

Page 2

https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

Page 3

https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
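To make the observation concrete, here is a minimal, self-contained sketch that builds the first three page URLs from a single template; the placeholder name mirrors the PAGEPATTERN constant used in the handler further below.

// Minimal illustration of the pagination pattern: only the page number changes
public class PageUrlDemo {

    private static final String PAGE_PATTERN = "pagePattern";
    private static final String JOB51_URL =
            "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,pagePattern.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    public static void main(String[] args) {
        for (int page = 1; page <= 3; page++) {
            // replacing the placeholder reproduces exactly the page 1/2/3 URLs listed above
            System.out.println(JOB51_URL.replace(PAGE_PATTERN, String.valueOf(page)));
        }
    }
}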

Code implementation:

First, create an interface that defines two methods: one to get the total number of pages, and one to loop over the pages and crawl the data.

The crawler implements this interface; if we later want to crawl another site, we only need to implement the same interface and write the site-specific logic (a hypothetical skeleton is sketched after the interface below).

IJobHandle.java

package com.devin.jobsearch.Service;

/**
 * @author Devin Zhang
 * @className IJobHandle
 * @description TODO
 * @date 2020/4/22 16:39
 */
public interface IJobHandle {

    int getJobPage() throws Exception;

    void handle() throws Exception;
}
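As mentioned above, supporting another site only requires a second implementation of IJobHandle. A hypothetical skeleton is shown here; the class name and method bodies are placeholders, not part of the original project.

package com.devin.jobsearch.Service;

import org.springframework.stereotype.Service;

// Hypothetical skeleton for crawling a different job site
@Service
public class OtherSiteSearchHandle implements IJobHandle {

    @Override
    public int getJobPage() throws Exception {
        // parse the other site's search page and return its total page count
        return 0;
    }

    @Override
    public void handle() throws Exception {
        // loop from page 1 to getJobPage(), parse each result page with Jsoup,
        // map the fields into JobModel and save them through JobMapper
    }
}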

Job51SearchHandle.java

package com.devin.jobsearch.Service;

import com.devin.jobsearch.mapper.JobMapper;
import com.devin.jobsearch.model.JobModel;
import com.devin.jobsearch.util.FileUtil;
import com.devin.jobsearch.util.SearchUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author Devin Zhang
 * @className Job51SearchHandle
 * @description TODO
 * @date 2020/4/21 16:41
 */
@Service
public class Job51SearchHandle implements IJobHandle {

    @Resource
    private SearchUtil searchUtil;
    @Resource
    private JobMapper jobMapper;
    @Resource
    private FileUtil fileUtil;

    private static final String PAGEPATTERN = "pagePattern";
    private static final String JOB51URL = "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,pagePattern.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /**
     * Get the total number of result pages on 51job
     *
     * @return total page count
     * @throws Exception
     */
    @Override
    public int getJobPage() throws Exception {
        String url = JOB51URL;
        url = url.replace(PAGEPATTERN, "1");
        Document document = searchUtil.getDocument(url);
        return Integer.parseInt(document.getElementById("hidTotalPage").val());
    }

    /**
     * Crawl 51job page by page
     */
    @Override
    public void handle() throws Exception {
        // total number of target pages
        int pageTotal = this.getJobPage();
        List<JobModel> jobModelList = null;
        for (int page = 1; page <= pageTotal; page++) {
            try {
                jobModelList = new ArrayList<>();
                System.out.println("Start crawling page " + page);
                String url = JOB51URL;
                url = url.replace(PAGEPATTERN, page + "");
                Document document = searchUtil.getDocument(url);
                // each job row in the result list has class "el"
                Elements nav_com = document.getElementsByClass("el");
                for (Element element : nav_com) {
                    if (element.children().first().tagName("p").hasClass("t1") &&
                            element.children().first().tagName("p").children().hasClass("check")) {

                        String jobName = element.children().first().tagName("p").children().tagName("span").text();
                        String jobUrl = element.children().first().tagName("p").child(2).child(0).attr("href");
                        String companyName = element.child(1).text();
                        String companyUrl = element.child(1).child(0).attr("href");
                        String jobLocation = element.child(2).text();
                        String jobSalary = element.child(3).text();
                        String jobDate = element.child(4).text();

                        JobModel jobModel = new JobModel();
                        jobModel.setJobName(jobName);
                        jobModel.setJobUrl(jobUrl);
                        jobModel.setJobCompanyName(companyName);
                        jobModel.setJobCompanyUrl(companyUrl);
                        jobModel.setJobLocation(jobLocation);
                        jobModel.setJobSalary(jobSalary);
                        jobModel.setJobDate(jobDate);

                        // crawl the job detail page
                        Document detailDocument = searchUtil.getDocument(jobUrl);

                        String jobRestrict = detailDocument.getElementsByClass("msg ltype").text();
                        String jobDesc = detailDocument.getElementsByClass("bmsg job_msg inbox").text();
                        String jobLocationDetail = "";
                        if (detailDocument.getElementsByClass("bmsg inbox").size() > 0) {
                            jobLocationDetail = detailDocument.getElementsByClass("bmsg inbox").first().child(0).text();
                        }
                        String companyDesc = detailDocument.getElementsByClass("tmsg inbox").text();
                        String companyImage = "";
                        if (detailDocument.getElementsByClass("com_name himg").size() > 0) {
                            companyImage = detailDocument.getElementsByClass("com_name himg").first().child(0).attr("src");
                        }

                        jobModel.setJobRestrictStr(jobRestrict);
                        jobModel.setJobDetail(jobDesc);
                        jobModel.setJobLocationDetail(jobLocationDetail);
                        jobModel.setJobCompanyDesc(companyDesc);
                        jobModel.setJobCompanyImage(companyImage);

                        // extract the job id from the job URL, trying three URL shapes in turn
                        String jobId = "";
                        String patternStr = "/[0-9]*.html";
                        Pattern pattern = Pattern.compile(patternStr);
                        Matcher matcher = pattern.matcher(jobUrl);
                        if (matcher.find()) {
                            jobId = matcher.group();
                            jobId = jobId.replaceAll(".html", "").replaceAll("/", "");
                        }
                        if (StringUtils.isEmpty(jobId)) {
                            patternStr = "jobid=[0-9]*";
                            pattern = Pattern.compile(patternStr);
                            matcher = pattern.matcher(jobUrl);
                            if (matcher.find()) {
                                jobId = matcher.group();
                                jobId = jobId.replaceAll("jobid=", "");
                            }
                        }
                        if (StringUtils.isEmpty(jobId)) {
                            patternStr = "#[0-9]*";
                            pattern = Pattern.compile(patternStr);
                            matcher = pattern.matcher(jobUrl);
                            if (matcher.find()) {
                                jobId = matcher.group();
                                jobId = jobId.replaceAll("#", "");
                            }
                        }
                        jobModel.setJobId(jobId);
                        System.out.println(jobModel);
                        jobModelList.add(jobModel);
                    }
                }
                if (!CollectionUtils.isEmpty(jobModelList)) {
                    System.out.println("Page " + page + ": inserting data");
                    jobMapper.insertList(jobModelList);
                }
                Thread.sleep(3000); // sleep 3 seconds so we do not hit the site too frequently and get blocked
            } catch (Exception e) {
                e.printStackTrace();
                if (null != jobModelList) {
                    fileUtil.writeLog(jobModelList.toString());
                }
            }
        }
    }

}
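The jobId extraction inside handle() tries three URL shapes in turn: /123456.html, ?jobid=123456, and #123456. Below is a small standalone demo of the same regex fallbacks; the sample URLs are made up purely for illustration.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone demo of the jobId extraction logic used in Job51SearchHandle
public class JobIdDemo {

    static String extractJobId(String jobUrl) {
        String jobId = "";
        // shape 1: .../123456789.html
        Matcher matcher = Pattern.compile("/[0-9]*.html").matcher(jobUrl);
        if (matcher.find()) {
            jobId = matcher.group().replaceAll(".html", "").replaceAll("/", "");
        }
        // shape 2: ...?jobid=123456
        if (jobId.isEmpty()) {
            matcher = Pattern.compile("jobid=[0-9]*").matcher(jobUrl);
            if (matcher.find()) {
                jobId = matcher.group().replaceAll("jobid=", "");
            }
        }
        // shape 3: ...#123456
        if (jobId.isEmpty()) {
            matcher = Pattern.compile("#[0-9]*").matcher(jobUrl);
            if (matcher.find()) {
                jobId = matcher.group().replaceAll("#", "");
            }
        }
        return jobId;
    }

    public static void main(String[] args) {
        // hypothetical sample URLs, one per URL shape
        System.out.println(extractJobId("https://jobs.example.com/shanghai/123456789.html")); // 123456789
        System.out.println(extractJobId("https://jobs.example.com/detail?jobid=987654"));     // 987654
        System.out.println(extractJobId("https://jobs.example.com/detail#555555"));           // 555555
    }
}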

8. Invocation

We create a controller that triggers the crawl directly when accessed.

package com.devin.jobsearch.controller;

import com.devin.jobsearch.Service.Job51SearchHandle;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import javax.annotation.Resource;

/**
 * @author Devin Zhang
 * @className JobController
 * @description TODO
 * @date 2020/4/22 16:36
 */
@RestController
@RequestMapping("/job")
public class JobController {

    @Resource
    private Job51SearchHandle job51SearchHandle;

    @GetMapping("/51jobHandle")
    public String handle51JobController() throws Exception {
        job51SearchHandle.handle();
        return "success";
    }
}

Visiting localhost:7999/job/51jobHandle triggers the crawl, and you can then see the crawled data stored in the database.


Finally, to reiterate: the sole purpose of this article is to learn Jsoup, and the crawled data will not be used for anything else. After all, as the saying goes, a crawler written too well can earn you a prison meal.
