网络爬虫知识 day08

一、案例实现

1.1 开发准备

1.1.1 创建工程

        创建Maven工程,并加入依赖。pom.xml为:

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0"

         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <!-- Spring Boot parent: supplies managed versions for the dependencies below that declare no <version> -->

    <parent>

        <groupId>org.springframework.boot</groupId>

        <artifactId>spring-boot-starter-parent</artifactId>

        <version>2.0.2.RELEASE</version>

    </parent>

    <groupId>cn.itcast.crawler</groupId>

    <artifactId>itcast-crawler-job</artifactId>

    <version>1.0-SNAPSHOT</version>

    <properties>

        <java.version>1.8</java.version>

    </properties>

    <dependencies>

        <!-- Spring MVC -->

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-web</artifactId>

        </dependency>

        <!-- Spring Data JPA -->

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-data-jpa</artifactId>

        </dependency>

        <!-- MySQL JDBC driver -->

        <dependency>

            <groupId>mysql</groupId>

            <artifactId>mysql-connector-java</artifactId>

        </dependency>

        <!-- WebMagic core; its slf4j-log4j12 binding is excluded below -->

        <dependency>

            <groupId>us.codecraft</groupId>

            <artifactId>webmagic-core</artifactId>

            <version>0.7.3</version>

            <exclusions>

                <exclusion>

                    <groupId>org.slf4j</groupId>

                    <artifactId>slf4j-log4j12</artifactId>

                </exclusion>

            </exclusions>

        </dependency>

        <!-- WebMagic extension module -->

        <dependency>

            <groupId>us.codecraft</groupId>

            <artifactId>webmagic-extension</artifactId>

            <version>0.7.3</version>

        </dependency>

        <!-- Guava: Bloom filter support for WebMagic -->

        <dependency>

            <groupId>com.google.guava</groupId>

            <artifactId>guava</artifactId>

            <version>16.0</version>

        </dependency>

        <!-- Utility library (Apache Commons Lang) -->

        <dependency>

            <groupId>org.apache.commons</groupId>

            <artifactId>commons-lang3</artifactId>

        </dependency>

    </dependencies>

    

</project>

1.1.2 加入配置文件

        添加application.properties配置文件

        # DB configuration: JDBC driver class, connection URL and credentials for the datasource.
        # NOTE(review): root/root looks like a local development credential - confirm before using elsewhere.

        spring.datasource.driverClassName=com.mysql.jdbc.Driver

        spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler

        spring.datasource.username=root

        spring.datasource.password=root

        # JPA configuration: target database type and logging of SQL statements.

        spring.jpa.database=MySQL

        spring.jpa.show-sql=true

1.1.3 编写Pojo

        /**
         * JPA entity describing one job posting record.
         *
         * <p>This listing declares fields only; no getters or setters are shown.
         * The field notes below are inferred from the field names - confirm them
         * against the code that populates this entity.
         */
        @Entity

        public class JobInfo {

            // Primary key; the value is generated by the database (IDENTITY strategy).
            @Id

            @GeneratedValue(strategy = GenerationType.IDENTITY)

            private Long id;

            // Presumably the hiring company's name.
            private String companyName;

            // Presumably the hiring company's address.
            private String companyAddr;

            // Presumably a free-text description of the company.
            private String companyInfo;

            // Presumably the job title.
            private String jobName;

            // Presumably the job location.
            private String jobAddr;

            // Presumably a free-text description of the job.
            private String jobInfo;

            // Presumably the lower salary bound; unit is not shown here - TODO confirm.
            private Integer salaryMin;

            // Presumably the upper salary bound; unit is not shown here - TODO confirm.
            private Integer salaryMax;

            // Presumably the URL of the posting's detail page.
            private String url;

            // Stored as a String; presumably the posting's publish time - format not shown here.
            private String time;

        }

1.1.4 编写Dao

/**
 * Spring Data JPA repository for {@link JobInfo} entities with a {@code Long} id.
 *
 * <p>Declares no methods of its own; all operations come from {@code JpaRepository}.
 */
public interface JobInfoDao extends JpaRepository<JobInfo, Long> {

}

1.1.5 编写Service

        编写Service接口

        public interface JobInfoService {

            public void save(JobInfo jobInfo);

            public List<JobInfo> findJobInfo(JobInfo jobInfo);

}

        编写Service实现类(本文此处未给出实现类代码,需自行编写实现JobInfoService接口的类)

        

1.1.6 编写引导类(本文此处未给出引导类代码)

1.2 功能实现

1.2.1 编写url解析功能

@Component

public class JobProcessor implements PageProcessor {

    @Autowired

    private SpringDataPipeline springDataPipeline;

    @Scheduled(initialDelay = 1000, fixedDelay = 1000 * 100)

    public void process() {

        //访问入口url地址

        String url = "https://search.51job.com/list

        Spider.create(new JobProcessor())

                .addUrl(url)

                .setScheduler(new QueueScheduler()

                        .setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))

                .thread(5)

                .run();

    }

    @Override

    public void process(Page page) {

        //获取页面数据

        List<Selectable> nodes = page.getHtml().$("div#resultList div.el").nodes();

        //判断nodes是否为空

        if (nodes.isEmpty()) {

            try {

                //如果为空,表示这是招聘信息详情页保存信息详情

                this.saveJobInfo(page);

            } catch (Exception e) {

                e.printStackTrace();

            }

        } else {

            //如果有值,表示这是招聘信息列表页

            for (Selectable node : nodes) {

                //获取招聘信息详情页url

                String jobUrl = node.links().toString();

                //添加到url任务列表中,等待下载

                page.addTargetRequest(jobUrl);

                //获取翻页按钮的超链接

                List<String> listUrl = page.getHtml().$("div.p_in li.bk").links().all();

                //添加到任务列表中

                page.addTargetRequests(listUrl);

            }

        }

    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小陈工

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值