利用 Spring Boot + MyBatis,结合 Jsoup 进行页面解析,并使用 Redis 存取待爬取的页面 URL,实现对当当网所有图书的爬取
- application.yaml基础配置
# Spring Boot configuration for the Dangdang book crawler.
# Nesting restored: datasource and redis settings belong under "spring".
# NOTE(review): the credentials below are sample values; do not commit real secrets.
spring:
  datasource:
    username: root
    password: abc123_
    driver-class-name: com.mysql.jdbc.Driver
    url: jdbc:mysql://localhost:3306/learn?serverTimezone=CTT&useUnicode=true&characterEncoding=utf-8
  redis:
    host: 127.0.0.1
    port: 6379
    password: 123456
    timeout: 5000
mybatis:
  mapper-locations: classpath:mapping/BookSpiterMapper.xml
  type-aliases-package: com.augmentum.book.bean
# Read by the service through ${dangdang.indexUrl}.
dangdang:
  indexUrl: "http://category.dangdang.com/"
# NOTE(review): placed at top level; not referenced by the code shown here, so
# confirm whether it was meant to be nested under "dangdang".
resultNum:
  num: 6000
- controller 代码
package com.augmentum.book.controller;
import com.augmentum.book.service.BookSpiterService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST endpoints that trigger the book crawler.
 *
 * <p>Each endpoint delegates to {@link BookSpiterService} and answers with the
 * literal text {@code SUCCESS} once the service call has returned.
 */
@RestController
public class BookSpiterController {

    /** Response body shared by all endpoints. */
    private static final String SUCCESS = "SUCCESS";

    @Autowired
    private BookSpiterService bookSpiterService;

    /**
     * Collects the first-level category URLs from the index page.
     * Intended to be called before the crawl is started.
     */
    @GetMapping(value = "setIndexUrl")
    public String setIndexUrl() {
        bookSpiterService.setIndexUrl();
        return SUCCESS;
    }

    /**
     * Collects the second-level category URLs from the index page.
     * Intended to be called after {@link #setIndexUrl()}.
     */
    @GetMapping(value = "setIndexNextUrl")
    public String setIndexNextUrl() {
        bookSpiterService.setIndexNextUrl();
        return SUCCESS;
    }

    /**
     * Starts the crawl. The service call is made on the request thread, so the
     * response is sent only after {@code process()} has returned.
     */
    @GetMapping(value = "bookSpiter")
    public String getData() {
        bookSpiterService.process();
        return SUCCESS;
    }
}
- service业务代码
package com.augmentum.book.service.impl;
import com.augmentum.book.bean.BookSpiter;
import com.augmentum.book.bean.Urls;
import com.augmentum.book.dao.BookSpiterMapper;
import com.augmentum.book.service.BookSpiterService;
import com.augmentum.book.util.HttpClientDownPage;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.eclipse.jetty.util.StringUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
@Slf4j
@Service
public class BookSpiterServiceImpl implements BookSpiterService {
@Value("${dangdang.indexUrl}")
private String indexUrl;
@Autowired
BookSpiterMapper bookSpiterMapper;
@Autowired
private StringRedisTemplate stringRedisTemplate;
private List<BookSpiter> list = new ArrayList<>();
private List<Urls> urlList = new ArrayList<>();
private int successNum = 0;
private int failedNum = 0;
private Boolean flag = false;
private int result = 1;
//从id=index的url开始放入redis|202
private int index = 628;
@Override
public void process() {
//将首地址存入队列
//将首地址存入Redis中
while(true) {
try {
//从redis获取地址
String nexturl = stringRedisTemplate.opsForList().rightPop("url");
if(StringUtils.isNotBlank(nexturl)) {
//通过HttpClient请求页面,获取网页源码进行解析
String content = HttpClientDownPage.sendGet(nexturl);
//通过Jsoup进行页面解析
Document document = Jsoup.parse(content);
paraseList(document);
//批量插入数据
int i = bookSpiterMapper.insertBatch(list);
if (i > 0) {
System.out.println("批量插入数据成功:" + i);
System.out.println("当前为id=" + index + "的页面:" + nexturl);
}
list = new ArrayList<>();
}
} catch (Exception e) {
log.error("网页源码解析异常!");
log.error(e.getMessage());
}
if(flag) {
log.info("爬取结束--flag为true");
break;
}
}
log.info("-----所有页面爬取结束----");
}
@Override
public void process1() {
log.info("线程2");
}
@Override
public void setIndexUrl() {
try {
//通过HttpClient请求页面,获取网页源码进行解析
String content = HttpClientDownPage.sendGet(indexUrl);
//通过Jsoup进行页面解析
Document document = Jsoup.parse(content);
Elements elements = document.select("#floor_1 .classify_kind .classify_kind_name");
for(Element element : elements){
Elements hrefs = element.select("a[href]");
String indexUrl = hrefs.attr("href");
String type = hrefs.text();
log.info("首页地址:" + indexUrl);
Urls urls = new Urls();
urls.setUrl(indexUrl);
urls.setType(type);
urlList.add(urls);
}
bookSpiterMapper.insertUrlBatch(urlList);
} catch (Exception e) {
log.error(e.getMessage());
}
}
@Override
public void setIndexNextUrl() {
try {
//通过HttpClient请求页面,获取网页源码进行解析
String content = HttpClientDownPage.sendGet(indexUrl);
//通过Jsoup进行页面解析
Document document = Jsoup.parse(content);
Elements elements = document.select("#floor_1 .classify_kind ul");
for(Element element : elements){
Elements hrefs = element.select("li[name=cat_3]");
for(Element element1 : hrefs){
Elements href = element1.select("a[href]");
String indexUrl = href.attr("href");
String type = href.text();
Urls urls = new Urls();
urls.setUrl(indexUrl);
urls.setType(type);
urlList.add(urls);
}
}
bookSpiterMapper.insertUrlBatch(urlList);
} catch (Exception e) {
log.error(e.getMessage());
}
}
private void paraseList(Document document) throws InterruptedException {
String baseurl = "http://category.dangdang.com";
//根据网页标签解析源码
Elements elements = document.select("#search_nature_rg ul li");
for(Element element : elements){
Elements href = element.select("a[href]");
String detailUrl = href.attr("href");
dealBookSpiter(detailUrl);
}
/**
* 这里解析下一页地址的标签,获取下一页的Url,然后放在redis中
*/
Elements nextUrl = document.select(".paging .next").select("a[href]");
String url = nextUrl.attr("href");
if(StringUtil.isNotBlank(url)){
stringRedisTemplate.opsForList().leftPush("url", baseurl + url);
} else {
if (index + 1 <= bookSpiterMapper.selectCount()) {
Urls urls = bookSpiterMapper.selectById(index + 1);
String indexUrl = urls.getUrl();
stringRedisTemplate.opsForList().leftPush("url", indexUrl);
log.info("第" + index + "个页面解析结束5分钟后开启下一个页面爬取...");
log.info("成功爬取数据条数 :" + successNum);
log.info("爬取数据失败条数 :" + failedNum);
Thread.sleep(5000*60);
successNum = 0;
failedNum = 0;
index++;
} else {
flag = true;
}
}
}
public void dealBookSpiter(String detailUrl){
BookSpiter bookSpiter = new BookSpiter();
if(StringUtils.isNotBlank(detailUrl)) {
try {
//通过HttpClient请求页面,获取网页源码进行解析
String content = HttpClientDownPage.sendGet(detailUrl);
//通过Jsoup进行页面解析
Document document = Jsoup.parse(content);
//获取图书图片
Elements element1 = document.select("#largePicDiv img[src$=.jpg]");
String src = element1.attr("abs:src");
bookSpiter.setImage(src);
//获取图书名称
Elements element2 = document.select("#product_info .name_info h1");
bookSpiter.setName(element2.text());
//获取图书描述
Elements element3 = document.select("#product_info .name_info h2");
bookSpiter.setDescription(element3.text());
//获取作者名字
Elements element4 = document.select("#author");
if (element4.text().split(":").length > 1) {
bookSpiter.setAuthor(element4.text().split(":")[1]);
} else {
bookSpiter.setAuthor(element4.text());
}
//获取isbn
Elements element6 = document.select("#detail_describe ul li");
if (element6.get(4).text().split(":").length > 1) {
bookSpiter.setIsbn(element6.get(4).text().split(":")[1].trim());
} else {
bookSpiter.setAuthor(element6.get(4).text());
}
//将解析后的实体放入集合中
list.add(bookSpiter);
System.out.println(result + " : " + bookSpiter);
result += 1;
successNum ++;
} catch (Exception e) {
failedNum ++;
log.error(e.getMessage());
}
}
}
@PostConstruct
public void adUrl(){
Set<String> keys = stringRedisTemplate.keys("*");
stringRedisTemplate.delete("url");
log.info("************清空redis中缓存地址**************");
//id = index 的url放入作为首次url
Urls urls = bookSpiterMapper.selectById(index);
String indexUrl = urls.getUrl();
stringRedisTemplate.opsForList().leftPush("url", indexUrl);
log.info("************添加一次当当网首页地址地址**************");
}
}
此项目整合代码见链接:
https://gitee.com/JasonLee1286791087/book
如出现卡顿,可在 hosts 文件中加入:
221.122.85.248 category.dangdang.com
221.122.86.77 product.dangdang.com