webmagic

webmagic学习笔记

官网地址: http://webmagic.io/

核心思路:

1. 自定义 BookPageProcess 类,实现 PageProcessor 接口,并重写 process() 方法。方法内主要解析 HTML 页面标签,提取需要的数据,再通过 page.putField() 把数据交给 Pipeline。
2. 自定义 BookPageContentPipeline 类,实现 Pipeline 接口,并重写 process() 方法。方法内主要进行数据持久化,本文使用 MyBatis 将数据入库。
3. 通过定时任务,定时拉取数据
1.准备工作
1.1引入依赖
<!-- WebMagic core artifact, pinned to 0.7.3. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<!-- WebMagic extension artifact; keep its version identical to webmagic-core. -->
<!-- NOTE(review): confirm the code in this article actually needs this artifact. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
1.2SQL

-- One row per crawled book detail page; the columns mirror the fields of BookVo.
-- The numeric-looking values (readPeopleNum, chapter, wordNum) are kept as the
-- raw text scraped from the page, hence varchar.
DROP TABLE IF EXISTS `bookdata`;
CREATE TABLE `bookdata` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `picUrl` varchar(128) DEFAULT NULL,
  `name` varchar(64) DEFAULT NULL,
  `author` varchar(64) DEFAULT NULL,
  `readPeopleNum` varchar(24) DEFAULT NULL,
  `chapter` varchar(24) DEFAULT NULL,
  `wordNum` varchar(24) DEFAULT NULL,
  `status` varchar(24) DEFAULT NULL,
  `lastUpdateTime` datetime DEFAULT NULL,
  -- Site prefix + query string is already ~68 characters before the book name
  -- (up to 64) is appended, so 128 was too short.
  `downloadUrl` varchar(255) DEFAULT NULL,
  -- Holds the full text of the book introduction, which can exceed 255 characters.
  `note` text,
  -- JSON object mapping related book titles to their URLs.
  `likeBook` text,
  PRIMARY KEY (`id`)
-- utf8mb4 instead of MySQL's 3-byte utf8 so 4-byte characters in scraped text
-- do not fail the insert; no AUTO_INCREMENT seed so a fresh table starts at 1.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
1.3yml
# JDBC data source for the crawler; connects to the `pachong` schema on localhost.
spring:
  datasource:
    # Commons DBCP connection pool implementation.
    type: org.apache.commons.dbcp.BasicDataSource
    # URL options: UTF-8 text, zero dates mapped to NULL, SSL on, server time zone GMT+8.
    # NOTE(review): useSSL=true against a local MySQL may need certificates configured — confirm.
    url: jdbc:mysql://localhost:3306/pachong?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
    # NOTE(review): demo credentials in plain text; externalize them before any real deployment.
    username: root
    password: 123456
    # NOTE(review): legacy driver class name; Connector/J 8.x uses com.mysql.cj.jdbc.Driver —
    # confirm against the connector version declared in the build.
    driverClassName: com.mysql.jdbc.Driver
2.java代码

BookVo类

package com.easyexcel.demo.vo;

import lombok.Data;

import java.util.Date;

/**
 * Value object for one crawled book; its fields correspond one-to-one to the
 * columns of the {@code bookdata} table.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString}
 * are generated by Lombok's {@code @Data}.
 */
@Data
public class BookVo {
    /** Primary key; written back by the mapper's selectKey after an insert. */
    private Integer id;
    /** Cover image URL. */
    private String picUrl;
    /** Book title. */
    private String name;
    /** Author name. */
    private String author;
    /** Reader count, kept as the raw text scraped from the page. */
    private String readPeopleNum;
    /** Chapter info, kept as the raw text scraped from the page. */
    private String chapter;
    /** Word count, kept as the raw text scraped from the page. */
    private String wordNum;
    /** Publication status text as shown on the page. */
    private String status;
    /** Last update time parsed from the page; null when it could not be parsed. */
    private Date lastUpdateTime;
    /** Download link assembled by the page processor. */
    private String downloadUrl;
    /** Plain-text book introduction. */
    private String note;
    /** JSON string mapping related book titles to their URLs. */
    private String likeBook;
}

BookDataMapper类

package com.easyexcel.demo.mapper;

import com.easyexcel.demo.vo.BookVo;
import org.springframework.stereotype.Component;

/**
 * MyBatis mapper for the {@code bookdata} table. The SQL for each method is
 * declared under the same id in BookDataMapper.xml.
 */
@Component
public interface BookDataMapper {

    /**
     * Inserts a single book record.
     *
     * @param bookVo the book to persist
     * @return the value MyBatis reports for the insert statement
     *         (normally the number of affected rows)
     */
    int insertBookVo(BookVo bookVo);
}

BookDataMapper.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements backing com.easyexcel.demo.mapper.BookDataMapper. -->
<mapper namespace="com.easyexcel.demo.mapper.BookDataMapper">
    <!-- Column-to-property mapping for bookdata rows; every entry matches both a
         table column and a BookVo field. -->
    <resultMap id="BaseResultMap" type="com.easyexcel.demo.vo.BookVo">
        <id column="id" property="id"/>
        <result column="picUrl" property="picUrl"/>
        <result column="name" property="name"/>
        <result column="author" property="author"/>
        <result column="readPeopleNum" property="readPeopleNum"/>
        <result column="chapter" property="chapter"/>
        <result column="wordNum" property="wordNum"/>
        <result column="status" property="status"/>
        <result column="lastUpdateTime" property="lastUpdateTime"/>
        <result column="downloadUrl" property="downloadUrl"/>
        <result column="note" property="note"/>
        <result column="likeBook" property="likeBook"/>
    </resultMap>
    <!-- Selective insert: only non-null BookVo properties are written, so the
         column list and the value list are built from the same set of tests.
         The trailing comma left by the last emitted entry is stripped by
         suffixOverrides. After the insert, selectKey stores the generated
         primary key into BookVo.id. -->
    <insert id="insertBookVo" parameterType="com.easyexcel.demo.vo.BookVo">
        <selectKey keyProperty="id" order="AFTER" resultType="java.lang.Integer">
            SELECT LAST_INSERT_ID()
        </selectKey>
        insert into bookdata
        <trim prefix="(" suffix=")" suffixOverrides=",">
            <if test="picUrl != null">
                picUrl,
            </if>
            <if test="name != null">
                name,
            </if>
            <if test="author != null">
                author,
            </if>
            <if test="readPeopleNum != null">
                readPeopleNum,
            </if>
            <if test="chapter != null">
                chapter,
            </if>
            <if test="wordNum != null">
                wordNum,
            </if>
            <if test="status != null">
                status,
            </if>
            <if test="lastUpdateTime != null">
                lastUpdateTime,
            </if>
            <if test="downloadUrl != null">
                downloadUrl,
            </if>
            <if test="note != null">
                note,
            </if>
            <if test="likeBook != null">
                likeBook,
            </if>
        </trim>
        <trim prefix="values (" suffix=")" suffixOverrides=",">
            <if test="picUrl != null">
                #{picUrl},
            </if>
            <if test="name != null">
                #{name},
            </if>
            <if test="author != null">
                #{author},
            </if>
            <if test="readPeopleNum != null">
                #{readPeopleNum},
            </if>
            <if test="chapter != null">
                #{chapter},
            </if>
            <if test="wordNum != null">
                #{wordNum},
            </if>
            <if test="status != null">
                #{status},
            </if>
            <if test="lastUpdateTime != null">
                #{lastUpdateTime},
            </if>
            <if test="downloadUrl != null">
                #{downloadUrl},
            </if>
            <if test="note != null">
                #{note},
            </if>
            <if test="likeBook != null">
                #{likeBook},
            </if>
        </trim>
    </insert>
</mapper>

BookDataService类

package com.easyexcel.demo.Service;

import com.easyexcel.demo.mapper.BookDataMapper;
import com.easyexcel.demo.vo.BookVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * Service facade over {@link BookDataMapper}; keeps persistence calls out of
 * the crawler pipeline.
 */
@Service
public class BookDataService {

    @Autowired
    private BookDataMapper bookDataMapper;

    /**
     * Persists one crawled book by delegating to the mapper.
     *
     * @param bookVo the book to store
     * @return whatever the mapper's insert returns
     */
    public int insertBookVo(BookVo bookVo) {
        final int insertResult = bookDataMapper.insertBookVo(bookVo);
        return insertResult;
    }
}

BookPageProcess类

package com.easyexcel.demo.pageprocessor;

import com.alibaba.fastjson.JSONObject;
import com.easyexcel.demo.vo.BookVo;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * WebMagic {@link PageProcessor} for the book site at {@link #DOMAIN}.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>listing pages (recognised by the {@code div.l ul} block): every book
 *       link and the pagination link are queued for crawling;</li>
 *   <li>any other page is treated as a book detail page and turned into a
 *       {@link BookVo} stored in the result field {@code "bookVo"}.</li>
 * </ul>
 *
 * <p>Extraction is best-effort: a detail field that is missing from the page
 * is left {@code null} instead of aborting the whole page.
 */
public class BookPageProcess implements PageProcessor {

    public static final String DOMAIN = "https://www.iqisuu.com";

    private static final Logger LOGGER = Logger.getLogger(BookPageProcess.class.getName());

    /** Format of the "last updated" timestamp on a detail page. */
    private static final String UPDATE_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /** Path prefix of the download link, e.g. {@code /down/70888/}. */
    private static final String DOWNLOAD_PATH_PREFIX = "/down/";

    private final Site site = Site.me()
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    /**
     * Dispatches a downloaded page to listing or detail handling.
     *
     * @param page the page handed over by the spider
     */
    @Override
    public void process(Page page) {
        List<Selectable> listNodes = page.getHtml().css("div.l ul ").nodes();
        if (listNodes == null || listNodes.isEmpty()) {
            this.saveData(page);
            return;
        }

        List<String> detailUrls = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
        detailUrls.forEach(page::addTargetRequest);

        // The link to follow is the second-to-last pagination anchor; without at
        // least two anchors there is nothing to follow.
        List<String> pageUrls = page.getHtml().css("div.pages div#pagelink a", "href").all();
        if (pageUrls.size() < 2) {
            return;
        }
        String nextLink = pageUrls.get(pageUrls.size() - 2);
        if (nextLink == null || nextLink.isEmpty()) {
            return;
        }
        // The last character of the href is dropped before the domain is prepended.
        page.addTargetRequest(DOMAIN + nextLink.substring(0, nextLink.length() - 1));
    }

    /**
     * Extracts a {@link BookVo} from a detail page and publishes it under the
     * result field {@code "bookVo"}. A page without a book title is skipped.
     */
    private void saveData(Page page) {
        String name = page.getHtml().css("div#info h1", "text").toString();
        if (name == null) {
            // Not a book detail page (or its layout changed): keep it away from the pipelines.
            LOGGER.warning("No book title found, skipping page: " + page.getUrl());
            page.setSkip(true);
            return;
        }

        BookVo bookVo = new BookVo();
        bookVo.setPicUrl(page.getHtml().css("div#sidebar div#fmimg img", "data-original").toString());
        bookVo.setName(name);
        bookVo.setAuthor(elementAt(page.getHtml().css("div#info a", "text").all(), 0));

        // First paragraph carries the "|"-separated statistics, third one the update time.
        List<String> infoLines = page.getHtml().css("div#info p.hidden-xs", "text").all();
        fillStatistics(bookVo, elementAt(infoLines, 0));
        bookVo.setLastUpdateTime(parseUpdateTime(elementAt(infoLines, 2)));

        String downloadHref = page.getHtml().css("div#maininfo div.readbtn a.addbookcase", "href").toString();
        bookVo.setDownloadUrl(buildDownloadUrl(downloadHref, name));

        String introHtml = page.getHtml().css("div#intro").toString();
        if (introHtml != null) {
            bookVo.setNote(Jsoup.parse(introHtml).text());
        }

        // Related books: title -> absolute URL, serialised as a JSON object.
        HashMap<String, String> relatedBooks = new HashMap<>();
        for (String anchorHtml : page.getHtml().css("div.hidden-xs p a").all()) {
            Elements anchor = Jsoup.parse(anchorHtml).select("a");
            relatedBooks.put(anchor.text(), DOMAIN + anchor.attr("href"));
        }
        bookVo.setLikeBook(JSONObject.toJSONString(relatedBooks));

        page.putField("bookVo", bookVo);
    }

    /** Returns {@code values.get(index)}, or {@code null} when the list is too short. */
    private static String elementAt(List<String> values, int index) {
        return values != null && index < values.size() ? values.get(index) : null;
    }

    /**
     * Splits the statistics line (spaces removed, fields separated by "|") into
     * reader count, chapter, word count and status; absent fields stay null.
     */
    private static void fillStatistics(BookVo bookVo, String statsLine) {
        if (statsLine == null) {
            return;
        }
        String[] parts = statsLine.replace(" ", "").split("\\|");
        if (parts.length > 0) {
            bookVo.setReadPeopleNum(parts[0]);
        }
        if (parts.length > 1) {
            bookVo.setChapter(parts[1]);
        }
        if (parts.length > 2) {
            bookVo.setWordNum(parts[2]);
        }
        if (parts.length > 3) {
            bookVo.setStatus(parts[3]);
        }
    }

    /**
     * Parses the text after the first ":" of the update line as a timestamp.
     *
     * @return the parsed date, or {@code null} when the line is missing or malformed
     */
    private static Date parseUpdateTime(String timeLine) {
        if (timeLine == null) {
            return null;
        }
        String timestamp = timeLine.substring(timeLine.indexOf(":") + 1).trim();
        try {
            // SimpleDateFormat is not thread-safe and the spider runs several
            // threads, so a fresh instance is created per call.
            return new SimpleDateFormat(UPDATE_TIME_PATTERN).parse(timestamp);
        } catch (ParseException e) {
            LOGGER.log(Level.WARNING, "Unparseable update time: " + timestamp, e);
            return null;
        }
    }

    /**
     * Builds the text download URL from a link of the form {@code /down/70888/}.
     * Result format:
     * {@code https://www.iqisuu.com/api/txt_down.php?articleid=53694&articlename=<book name>}.
     *
     * @return the download URL, or {@code null} when the link is missing or has another shape
     */
    private static String buildDownloadUrl(String downloadHref, String bookName) {
        if (downloadHref == null
                || !downloadHref.startsWith(DOWNLOAD_PATH_PREFIX)
                || !downloadHref.endsWith("/")
                || downloadHref.length() <= DOWNLOAD_PATH_PREFIX.length() + 1) {
            return null;
        }
        String bookId = downloadHref.substring(DOWNLOAD_PATH_PREFIX.length(), downloadHref.length() - 1);
        return DOMAIN + "/api/txt_down.php?articleid=" + bookId + "&articlename=" + bookName;
    }

    /** Returns the crawl settings: 10 s timeout, 3 retries, 3 s pause between retries. */
    @Override
    public Site getSite() {
        return site;
    }
}

BookPageContentPipeline类

package com.easyexcel.demo.listerner;

import com.easyexcel.demo.Service.BookDataService;
import com.easyexcel.demo.vo.BookVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * WebMagic {@link Pipeline} that stores the {@code "bookVo"} result field of a
 * processed page through {@link BookDataService}.
 */
@Component
public class BookPageContentPipeline implements Pipeline {
    @Autowired
    private BookDataService bookDataService;

    /**
     * Persists the book extracted from a page, if there is one.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        BookVo scrapedBook = (BookVo) resultItems.get("bookVo");
        // Pages that published no "bookVo" field have nothing to store.
        if (scrapedBook == null) {
            return;
        }
        bookDataService.insertBookVo(scrapedBook);
    }
}

BookTask类

package com.easyexcel.demo.task;

import com.easyexcel.demo.listerner.BookPageContentPipeline;
import com.easyexcel.demo.pageprocessor.BookPageProcess;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

/**
 * Scheduled entry point that crawls the book site starting from {@link #URL}
 * and hands every extracted book to {@link BookPageContentPipeline}.
 */
@Component
@EnableScheduling
public class BookTask {

    private static final String URL = "https://www.iqisuu.com/fenlei/xuanhuan/1/";

    /**
     * Spring six-field cron (second minute hour day month weekday): once a day
     * at 02:00. The previous value fired every second, and because
     * {@code Spider.run()} blocks until the crawl ends, a new full crawl began
     * as soon as the previous one finished, re-inserting every book.
     */
    private static final String CRAWL_CRON = "0 0 2 * * *";

    @Autowired
    BookPageContentPipeline bookPageContentPipeline;

    /**
     * Runs one complete crawl with four worker threads; returns when the
     * spider has no more pages to process.
     */
    @Scheduled(cron = CRAWL_CRON)
    public void simpleRead() {
        Spider.create(new BookPageProcess())
                .addUrl(URL)
                .addPipeline(bookPageContentPipeline)
                .thread(4)
                .run();
    }
}

扫描公众号:回复9003亦可查看

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值