webmagic学习笔记
官网地址: http://webmagic.io/
核心思路:
1. 自定义 BookPageProcess 类,实现 PageProcessor 接口,并重写 process() 方法。方法内主要解析 HTML 页面标签,获取需要的数据,并通过 page.putField() 将数据交给 Pipeline。
2. 自定义 BookPageContentPipeline 类,实现 Pipeline 接口,并重写 process() 方法。方法内主要进行数据持久化,本文使用 MyBatis 将数据写入数据库。
3. 通过定时任务,定时拉取数据
1.准备工作
1.1引入依赖
<!-- WebMagic core artifact, pinned to 0.7.3. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- WebMagic extension artifact; keep its version in step with webmagic-core. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
1.2SQL
-- Table that stores one row per crawled book detail page.
-- WARNING: the DROP below discards any rows that were crawled previously.
DROP TABLE IF EXISTS `bookdata`;
CREATE TABLE `bookdata` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- URL columns are 255 wide: the download URL embeds the book name, so a
  -- long name does not fit in 128 characters.
  `picUrl` varchar(255) DEFAULT NULL,
  `name` varchar(64) DEFAULT NULL,
  `author` varchar(64) DEFAULT NULL,
  `readPeopleNum` varchar(24) DEFAULT NULL,
  `chapter` varchar(24) DEFAULT NULL,
  `wordNum` varchar(24) DEFAULT NULL,
  `status` varchar(24) DEFAULT NULL,
  `lastUpdateTime` datetime DEFAULT NULL,
  `downloadUrl` varchar(255) DEFAULT NULL,
  -- Holds the full text of the book introduction, which routinely exceeds 255 characters.
  `note` text,
  -- JSON object of related books (title -> absolute URL).
  `likeBook` text,
  PRIMARY KEY (`id`)
-- utf8mb4 instead of utf8: MySQL's "utf8" stores at most 3 bytes per character
-- and rejects 4-byte code points that can appear in scraped text.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
1.3yml
# Spring Boot datasource settings for the crawler's MySQL database.
# The keys must be nested under spring.datasource; at a single indentation
# level YAML parses them as unrelated top-level keys and Spring ignores them.
spring:
  datasource:
    type: org.apache.commons.dbcp.BasicDataSource
    url: jdbc:mysql://localhost:3306/pachong?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
    # Demo credentials only - replace them (or externalize them) outside local development.
    username: root
    password: 123456
    driverClassName: com.mysql.jdbc.Driver
2.java代码
BookVo类
package com.easyexcel.demo.vo;
import lombok.Data;
import java.util.Date;
/**
 * Value object for one crawled book; it is the parameter type of the
 * {@code insertBookVo} mapper statement and its fields mirror the columns of
 * the {@code bookdata} table.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}; the crawler fills the
 * object through those generated setters.
 */
@Data
public class BookVo {
// Primary key; written back by the mapper's selectKey after the insert.
private Integer id;
// Cover image URL, read from the "data-original" attribute of the cover img tag.
private String picUrl;
// Book title, read from the h1 element of the info block.
private String name;
// Text of the first link inside the info block.
private String author;
// The next four fields are the '|'-separated segments (in order) of the first
// info paragraph, kept as raw text.
private String readPeopleNum;
private String chapter;
private String wordNum;
private String status;
// Parsed with the pattern "yyyy-MM-dd HH:mm:ss"; stays null when parsing fails.
private Date lastUpdateTime;
// DOMAIN + "/api/txt_down.php?articleid=<id>&articlename=<name>".
private String downloadUrl;
// Plain text of the introduction block.
private String note;
// JSON object mapping related-book titles to their absolute URLs.
private String likeBook;
}
BookDataMapper类
package com.easyexcel.demo.mapper;
import com.easyexcel.demo.vo.BookVo;
import org.springframework.stereotype.Component;
/**
 * MyBatis mapper interface bound to the {@code com.easyexcel.demo.mapper.BookDataMapper}
 * namespace of {@code BookDataMapper.xml}.
 */
@Component
public interface BookDataMapper {

    /**
     * Executes the {@code insertBookVo} statement for the given book.
     *
     * @param bookVo the book to persist; only its non-null properties are written
     * @return the value MyBatis reports for the insert (the affected-row count)
     */
    int insertBookVo(BookVo bookVo);
}
BookDataMapper.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.easyexcel.demo.mapper.BookDataMapper">

    <!--
        Column-to-property mapping for the bookdata table.
        The former detail_url entry was dropped: neither the table nor BookVo
        has such a column/property.
    -->
    <resultMap id="BaseResultMap" type="com.easyexcel.demo.vo.BookVo">
        <id column="id" property="id"/>
        <result column="picUrl" property="picUrl"/>
        <result column="name" property="name"/>
        <result column="author" property="author"/>
        <result column="readPeopleNum" property="readPeopleNum"/>
        <result column="chapter" property="chapter"/>
        <result column="wordNum" property="wordNum"/>
        <result column="status" property="status"/>
        <result column="lastUpdateTime" property="lastUpdateTime"/>
        <result column="downloadUrl" property="downloadUrl"/>
        <result column="note" property="note"/>
        <result column="likeBook" property="likeBook"/>
    </resultMap>

    <!--
        Selective insert: only non-null properties are written, so omitted
        columns fall back to their table defaults. The generated primary key is
        copied back into BookVo.id by the selectKey that runs after the insert.
        Every entry ends with a comma; suffixOverrides="," strips the last one.
    -->
    <insert id="insertBookVo" parameterType="com.easyexcel.demo.vo.BookVo">
        <selectKey keyProperty="id" order="AFTER" resultType="java.lang.Integer">
            SELECT LAST_INSERT_ID()
        </selectKey>
        insert into bookdata
        <trim prefix="(" suffix=")" suffixOverrides=",">
            <if test="picUrl != null">
                picUrl,
            </if>
            <if test="name != null">
                name,
            </if>
            <if test="author != null">
                author,
            </if>
            <if test="readPeopleNum != null">
                readPeopleNum,
            </if>
            <if test="chapter != null">
                chapter,
            </if>
            <if test="wordNum != null">
                wordNum,
            </if>
            <if test="status != null">
                status,
            </if>
            <if test="lastUpdateTime != null">
                lastUpdateTime,
            </if>
            <if test="downloadUrl != null">
                downloadUrl,
            </if>
            <if test="note != null">
                note,
            </if>
            <if test="likeBook != null">
                likeBook,
            </if>
        </trim>
        <trim prefix="values (" suffix=")" suffixOverrides=",">
            <if test="picUrl != null">
                #{picUrl},
            </if>
            <if test="name != null">
                #{name},
            </if>
            <if test="author != null">
                #{author},
            </if>
            <if test="readPeopleNum != null">
                #{readPeopleNum},
            </if>
            <if test="chapter != null">
                #{chapter},
            </if>
            <if test="wordNum != null">
                #{wordNum},
            </if>
            <if test="status != null">
                #{status},
            </if>
            <if test="lastUpdateTime != null">
                #{lastUpdateTime},
            </if>
            <if test="downloadUrl != null">
                #{downloadUrl},
            </if>
            <if test="note != null">
                #{note},
            </if>
            <if test="likeBook != null">
                #{likeBook},
            </if>
        </trim>
    </insert>
</mapper>
BookDataService类
package com.easyexcel.demo.Service;
import com.easyexcel.demo.mapper.BookDataMapper;
import com.easyexcel.demo.vo.BookVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Service-layer entry point for persisting crawled books; it forwards to
 * {@link BookDataMapper} without adding logic of its own.
 */
@Service
public class BookDataService {

    @Autowired
    private BookDataMapper bookDataMapper;

    /**
     * Stores one book through the mapper.
     *
     * @param bookVo the book to insert
     * @return whatever {@link BookDataMapper#insertBookVo(BookVo)} returns
     */
    public int insertBookVo(BookVo bookVo) {
        final int affectedRows = this.bookDataMapper.insertBookVo(bookVo);
        return affectedRows;
    }
}
BookPageProcess类
package com.easyexcel.demo.pageprocessor;
import com.alibaba.fastjson.JSONObject;
import com.easyexcel.demo.vo.BookVo;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
/**
 * WebMagic {@link PageProcessor} for the book site at {@link #DOMAIN}.
 *
 * <p>A page that contains the listing container ({@code div.l ul}) is treated
 * as a list page: its detail links and one pager link are queued for crawling.
 * Every other page is treated as a book detail page and is converted into a
 * {@link BookVo} stored in the page's result items under the key {@code "bookVo"}.
 *
 * <p>Extraction is best-effort: a missing optional element leaves the
 * corresponding field {@code null} instead of aborting the whole page. A page
 * without a book title is skipped entirely so that no empty row reaches the
 * pipelines.
 */
public class BookPageProcess implements PageProcessor {

    public static final String DOMAIN = "https://www.iqisuu.com";

    /** Length of the {@code "/down/"} prefix in front of the book id in the bookcase link. */
    private static final int DOWN_PREFIX_LENGTH = 6;

    // Fully qualified so that the block needs no additional file-level import.
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(BookPageProcess.class.getName());

    private final Site site = Site.me()
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    /**
     * Dispatches between list-page handling (queueing further requests) and
     * detail-page handling (extracting a {@link BookVo}).
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> listNodes = page.getHtml().css("div.l ul ").nodes();
        if (listNodes == null || listNodes.isEmpty()) {
            saveData(page);
            return;
        }

        List<String> detailUrls = page.getHtml().css("div#newscontent div.l ul li span.s2 a", "href").all();
        detailUrls.forEach(page::addTargetRequest);

        // The second-to-last pager link is followed (presumably the "next page"
        // link - confirm against the site markup). A pager with fewer than two
        // links has no such entry, so there is nothing more to queue.
        List<String> pageUrls = page.getHtml().css("div.pages div#pagelink a", "href").all();
        if (pageUrls.size() < 2) {
            return;
        }
        String nextHref = pageUrls.get(pageUrls.size() - 2);
        if (nextHref == null || nextHref.isEmpty()) {
            return;
        }
        // Drop the last character of the href before prefixing the domain.
        page.addTargetRequest(DOMAIN + nextHref.substring(0, nextHref.length() - 1));
    }

    /**
     * Extracts a {@link BookVo} from a detail page and publishes it as the
     * {@code "bookVo"} result field.
     *
     * @param page the detail page
     */
    private void saveData(Page page) {
        String name = page.getHtml().css("div#info h1", "text").toString();
        if (name == null || name.trim().isEmpty()) {
            // Not a usable detail page; keep it away from the pipelines.
            LOG.warning("No book title found, skipping page: " + page.getRequest().getUrl());
            page.setSkip(true);
            return;
        }

        BookVo bookVo = new BookVo();
        bookVo.setPicUrl(page.getHtml().css("div#sidebar div#fmimg img", "data-original").toString());
        bookVo.setName(name);
        bookVo.setAuthor(elementAt(page.getHtml().css("div#info a", "text").all(), 0));

        // The same selector yields both the statistics line (index 0) and the
        // last-update line (index 2), so it is evaluated only once.
        List<String> infoLines = page.getHtml().css("div#info p.hidden-xs", "text").all();
        fillStatistics(bookVo, elementAt(infoLines, 0));
        bookVo.setLastUpdateTime(parseUpdateTime(elementAt(infoLines, 2)));

        // Link format: /down/70888/
        String bookcaseHref = page.getHtml().css("div#maininfo div.readbtn a.addbookcase", "href").toString();
        bookVo.setDownloadUrl(buildDownloadUrl(bookcaseHref, name));

        String introHtml = page.getHtml().css("div#intro").toString();
        if (introHtml != null) {
            bookVo.setNote(Jsoup.parse(introHtml).text());
        }

        bookVo.setLikeBook(collectRelatedBooks(page));
        page.putField("bookVo", bookVo);
    }

    /** Splits the {@code '|'}-separated statistics line into the four statistics fields. */
    private void fillStatistics(BookVo bookVo, String statsLine) {
        if (statsLine == null) {
            return;
        }
        String[] parts = statsLine.replace(" ", "").split("\\|");
        if (parts.length > 0) {
            bookVo.setReadPeopleNum(parts[0]);
        }
        if (parts.length > 1) {
            bookVo.setChapter(parts[1]);
        }
        if (parts.length > 2) {
            bookVo.setWordNum(parts[2]);
        }
        if (parts.length > 3) {
            bookVo.setStatus(parts[3]);
        }
    }

    /**
     * Parses the timestamp that follows the first {@code ':'} of the given line.
     *
     * @return the parsed date, or {@code null} when the line is missing or unparseable
     */
    private Date parseUpdateTime(String line) {
        if (line == null) {
            return null;
        }
        String timestamp = line.substring(line.indexOf(":") + 1).trim();
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            return format.parse(timestamp);
        } catch (ParseException e) {
            LOG.log(java.util.logging.Level.WARNING, "Unparseable last-update time: " + line, e);
            return null;
        }
    }

    /**
     * Builds the download URL, e.g.
     * {@code https://www.iqisuu.com/api/txt_down.php?articleid=53694&articlename=<name>}.
     *
     * @return the URL, or {@code null} when the bookcase link carries no book id
     */
    private String buildDownloadUrl(String bookcaseHref, String bookName) {
        if (bookcaseHref == null || bookcaseHref.length() <= DOWN_PREFIX_LENGTH + 1) {
            return null;
        }
        String bookId = bookcaseHref.substring(DOWN_PREFIX_LENGTH, bookcaseHref.length() - 1);
        return DOMAIN + "/api/txt_down.php?articleid=" + bookId + "&articlename=" + bookName;
    }

    /** Serializes the related-book links (title -> absolute URL) as a JSON object. */
    private String collectRelatedBooks(Page page) {
        HashMap<String, String> relatedBooks = new HashMap<>();
        for (String anchorHtml : page.getHtml().css("div.hidden-xs p a").all()) {
            Elements anchor = Jsoup.parse(anchorHtml).select("a");
            relatedBooks.put(anchor.text(), DOMAIN + anchor.attr("href"));
        }
        return JSONObject.toJSONString(relatedBooks);
    }

    /** Returns the element at {@code index}, or {@code null} when the list is too short. */
    private static String elementAt(List<String> values, int index) {
        if (values == null || index >= values.size()) {
            return null;
        }
        return values.get(index);
    }

    /**
     * @return the crawl settings: 10 s timeout, 3 retries, 3 s pause between retries
     */
    @Override
    public Site getSite() {
        return site;
    }
}
BookPageContentPipeline类
package com.easyexcel.demo.listerner;
import com.easyexcel.demo.Service.BookDataService;
import com.easyexcel.demo.vo.BookVo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic {@link Pipeline} that persists the {@code "bookVo"} result field
 * through {@link BookDataService}.
 */
@Component
public class BookPageContentPipeline implements Pipeline {

    @Autowired
    private BookDataService bookDataService;

    /**
     * Inserts the extracted book, if the page produced one.
     *
     * @param resultItems the fields extracted for one page
     * @param task        the crawl task the page belongs to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        final Object extracted = resultItems.get("bookVo");
        if (extracted == null) {
            // Pages that did not publish a book have nothing to store.
            return;
        }
        this.bookDataService.insertBookVo((BookVo) extracted);
    }
}
BookTask类
package com.easyexcel.demo.task;
import com.easyexcel.demo.listerner.BookPageContentPipeline;
import com.easyexcel.demo.pageprocessor.BookPageProcess;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
/**
 * Scheduled job that crawls the book site, starting from the first page of
 * the category listing in {@link #URL}.
 *
 * <p>The schedule used to be the cron expression {@code "* * * * * *"}, which
 * fires every second. Because {@code Spider.run()} executes the crawl in the
 * calling thread, a new full crawl started within a second of the previous one
 * finishing and re-inserted every book. The job now starts shortly after
 * application start-up and is repeated one day after the previous crawl has
 * completed.
 */
@Component
@EnableScheduling
public class BookTask {

    private static final String URL = "https://www.iqisuu.com/fenlei/xuanhuan/1/";

    /** Delay before the first crawl after start-up, in milliseconds. */
    private static final long INITIAL_DELAY_MS = 1000L;

    /** Pause between the end of one crawl and the start of the next, in milliseconds. */
    private static final long CRAWL_INTERVAL_MS = 24 * 60 * 60 * 1000L;

    @Autowired
    BookPageContentPipeline bookPageContentPipeline;

    /**
     * Runs one complete crawl with four worker threads and hands every
     * extracted book to {@link BookPageContentPipeline}.
     */
    @Scheduled(initialDelay = INITIAL_DELAY_MS, fixedDelay = CRAWL_INTERVAL_MS)
    public void simpleRead() {
        Spider.create(new BookPageProcess())
                .addUrl(URL)
                .addPipeline(bookPageContentPipeline)
                .thread(4)
                .run();
    }
}
扫描公众号:回复9003亦可查看