使用Spring Boot整合WebMagic的简单实例:爬取51Job招聘信息

 1.创建maven工程并添加依赖

<!-- Dependencies for the crawler demo: Spring Boot web + JPA, MySQL driver, WebMagic, Guava, commons-lang3, Lombok. -->
<!-- Artifacts without a <version> rely on version management defined outside this fragment
     (presumably the Spring Boot parent POM; verify against the full pom.xml). -->
<dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- WebMagic core library -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <!-- The transitive slf4j-log4j12 binding is excluded; presumably to avoid clashing
                 with the logging binding supplied by the Spring Boot starters (confirm). -->
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- WebMagic extension module; kept on the same version as webmagic-core -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava: required for WebMagic's Bloom filter support -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>
        <!-- Lombok: generates the accessors for the @Data-annotated entity -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
        </dependency>
    </dependencies>

2.创建application.yml文件

# Spring Boot configuration: JDBC data source and JPA settings.
spring:
  datasource:
    # MySQL on localhost:3306, schema "test01".
    url: jdbc:mysql://localhost:3306/test01
    # NOTE(review): hard-coded demo credentials; externalize them outside of a local demo.
    username: root
    password: root
    # NOTE(review): MySQL Connector/J 8.x names this class com.mysql.cj.jdbc.Driver;
    # confirm against the connector version actually resolved by the build.
    driver-class-name: com.mysql.jdbc.Driver
  jpa:
    database: MySQL
    # Print the SQL statements issued through JPA.
    show-sql: true

3.创建数据库表

-- One row per crawled job posting. Column names are the snake_case form of the
-- fields of the JobInfo entity (company_name <-> companyName, and so on).
CREATE TABLE job_info (
    id int PRIMARY KEY AUTO_INCREMENT,
    company_name varchar(256),
    company_addr varchar(256),
    -- Free-form text scraped from a whole page section; it can easily exceed a few
    -- hundred characters, so it is stored as text instead of varchar(500).
    company_info text,
    job_name varchar(256),
    -- Full job description text; same reasoning as company_info.
    job_info text,
    job_addr varchar(256),
    url varchar(256),
    -- Publish time kept as the raw string produced by the crawler.
    time varchar(128),
    salary_min int,
    salary_max int
);

4.创建启动类

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point of the crawler demo.
 *
 * <p>{@code @EnableScheduling} switches on processing of {@code @Scheduled} methods,
 * which is how the crawl task in this project gets triggered.
 */
@SpringBootApplication
@EnableScheduling // enable scheduled-task support
public class Application {

    /**
     * Starts the Spring application context.
     *
     * @param args command-line arguments; forwarded to Spring Boot so that options such as
     *             {@code --spring.datasource.url=...} are honoured instead of being dropped
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}

5.创建数据库表对应映射

    5.1创建pojo

import lombok.Data;

import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;

/**
 * JPA entity holding one crawled job posting.
 *
 * <p>Lombok's {@code @Data} generates the getters, setters, {@code equals} and
 * {@code hashCode}; {@code toString} is written by hand below.
 */
@Entity
@Data
public class JobInfo {

    /** Primary key, assigned by the database (identity / auto-increment column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    private String companyName;
    private String companyAddr;
    private String companyInfo;
    private String jobName;
    private String jobInfo;
    private String jobAddr;
    /** URL of the posting's detail page. */
    private String url;
    /** Publish time, kept as the raw string supplied by the crawler. */
    private String time;
    private Integer salaryMin;
    private Integer salaryMax;

    /**
     * Returns a readable dump of every field, including {@code jobAddr}.
     *
     * @return string of the form {@code JobInfo{id=..., companyName='...', ...}}
     */
    @Override
    public String toString() {
        return "JobInfo{" +
                "id=" + id +
                ", companyName='" + companyName + '\'' +
                ", companyAddr='" + companyAddr + '\'' +
                ", companyInfo='" + companyInfo + '\'' +
                ", jobName='" + jobName + '\'' +
                ", jobInfo='" + jobInfo + '\'' +
                ", jobAddr='" + jobAddr + '\'' +
                ", url='" + url + '\'' +
                ", time='" + time + '\'' +
                ", salaryMin=" + salaryMin +
                ", salaryMax=" + salaryMax +
                '}';
    }
}

  5.2创建dao 

import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link JobInfo} entities, keyed by their {@code Long} id.
 *
 * <p>Declares no methods of its own; everything used by callers is inherited from
 * {@link JpaRepository}.
 */
public interface JobInfoDao
        extends JpaRepository<JobInfo, Long> {
}

 6.创建操作实体类的Service

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;

import java.util.List;

/**
 * Persists crawled {@link JobInfo} records, skipping postings that are already stored.
 */
@Service
public class JobInfoService {

    @Autowired
    private JobInfoDao infoDao;

    /**
     * Stores the posting unless a row with the same URL and publish time already exists.
     *
     * <p>Only {@code url} and {@code time} take part in the duplicate check. A posting whose
     * URL is known but whose publish time differs is written as an additional row, because
     * the incoming entity carries no id.
     *
     * @param jobInfo the posting to persist
     */
    public void save(JobInfo jobInfo) {
        // Probe carrying only the two fields that identify a posting; all other fields
        // stay null and are therefore left out of the query-by-example match.
        JobInfo probe = new JobInfo();
        probe.setUrl(jobInfo.getUrl());
        probe.setTime(jobInfo.getTime());

        // An existence check is enough here; there is no need to load the matching rows.
        boolean alreadyStored = this.infoDao.exists(Example.of(probe));
        if (!alreadyStored) {
            this.infoDao.saveAndFlush(jobInfo);
        }
    }
}

7.创建webMagic任务类

import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} plus the scheduled job that runs the 51job crawl.
 *
 * <p>A page containing result rows ({@code div#resultList div.el}) is treated as a list
 * page: its detail links and the next-page link are queued. Any other page is treated as
 * a posting detail page and parsed into a {@link JobInfo}, which is handed to the
 * pipeline under the result key {@code "jobInfo"}.
 */
@Component
public class JobTask implements PageProcessor {

    /** Seed URL of the crawl: the first page of a 51job search result list. */
    private static final String START_URL = "https://search.51job.com/list/020000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /**
     * Fixed publish time stored with every posting.
     * TODO: parse the real publish time from the detail page instead of this placeholder.
     */
    private static final String PLACEHOLDER_PUBLISH_TIME = "10-17";

    private final Site site = Site.me()
            .setCharset("gbk")          // page encoding
            .setTimeOut(10 * 1000)      // download timeout
            .setRetrySleepTime(3000)    // pause between retries
            .setRetryTimes(3);          // number of retries

    @Autowired
    private SpringDataPipeline springDataPipeline;

    /**
     * Handles one downloaded page: queues follow-up URLs for a list page, or extracts the
     * posting from a detail page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> resultRows = page.getHtml().css("div#resultList div.el").nodes();

        // No result rows: this is not a list page, so try to read it as a detail page.
        if (resultRows.isEmpty()) {
            this.saveJobInfo(page);
            return;
        }

        // List page: queue the detail-page link of every row that has one.
        for (Selectable row : resultRows) {
            String jobInfoUrl = row.links().toString();
            if (jobInfoUrl != null) {
                page.addTargetRequest(jobInfoUrl);
            }
        }

        // The second "li.bk" pager item carries the next-page link. Guard the index so a
        // page with a shorter pager does not abort processing with an exception.
        List<Selectable> pagerItems = page.getHtml().css("div.p_in li.bk").nodes();
        if (pagerItems.size() > 1) {
            String bkUrl = pagerItems.get(1).links().toString();
            if (bkUrl != null) {
                System.out.println(bkUrl);
                page.addTargetRequest(bkUrl);
            }
        }
    }

    /**
     * Extracts a {@link JobInfo} from a detail page and stores it in the page's result
     * fields under {@code "jobInfo"}. Pages without a job title are not recognised as
     * detail pages and produce no result.
     *
     * <p>Salary fields are not populated yet (TODO).
     */
    private void saveJobInfo(Page page) {
        Html html = page.getHtml();

        String jobName = html.css("div.cn h1", "text").toString();
        if (jobName == null) {
            // Neither a list page nor a recognisable detail page: emit nothing rather
            // than a row full of nulls.
            return;
        }

        JobInfo jobInfo = new JobInfo();
        jobInfo.setCompanyName(html.css("div.cn p.cname a", "text").toString());

        // The company address is read from the second "div.bmsg" block, when present.
        List<Selectable> bmsgBlocks = html.css("div.bmsg").nodes();
        jobInfo.setCompanyAddr(bmsgBlocks.size() > 1 ? plainText(bmsgBlocks.get(1)) : null);

        jobInfo.setCompanyInfo(plainText(html.css("div.tmsg")));
        jobInfo.setJobName(jobName);
        jobInfo.setJobInfo(plainText(html.css("div.job_msg")));
        jobInfo.setJobAddr(html.css("div.cn span.lname", "text").toString());
        jobInfo.setUrl(page.getUrl().toString());
        jobInfo.setTime(PLACEHOLDER_PUBLISH_TIME);

        page.putField("jobInfo", jobInfo);
    }

    /** Strips the markup from a selected fragment; returns null when nothing was selected. */
    private static String plainText(Selectable fragment) {
        String markup = fragment.toString();
        return markup == null ? null : Jsoup.parse(markup).text();
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one complete crawl, blocking until the spider finishes.
     *
     * <p>Scheduled to start 1 s after startup and again 100 s after each run ends. Every
     * run builds a fresh queue scheduler and Bloom-filter duplicate remover, so URL
     * de-duplication applies within a single run only. No proxy is configured; a custom
     * downloader can be installed on the spider with {@code setDownloader(...)} if needed.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
    public void process() {
        // Reuse this Spring-managed bean as the processor instead of a second instance.
        Spider.create(this)
                .addUrl(START_URL)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(10)
                .addPipeline(springDataPipeline)
                .run();
    }
}

8.创建Pipeline:获取解析得到的实体对象,并保存到数据库

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * WebMagic pipeline that forwards each parsed {@link JobInfo} to {@link JobInfoService}
 * for storage.
 */
@Component
public class SpringDataPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Reads the {@code "jobInfo"} entry from the result items and saves it when present.
     *
     * @param resultItems extraction results of one processed page
     * @param task        the task that produced the results (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo parsedPosting = resultItems.get("jobInfo");

        // Nothing was extracted for this page, so there is nothing to store.
        if (parsedPosting == null) {
            return;
        }

        jobInfoService.save(parsedPosting);
    }
}

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值