微借阅 Mini Program Development in Practice (3): Implementing a Book Information Crawler with Spring Boot

Using Spring Boot + MyBatis, with Jsoup for HTML parsing and Redis as a queue for page URLs, we crawl book data across all categories of dangdang.com.

  • Basic application.yaml configuration
spring:
    datasource:
        username: root
        password: abc123_
        driver-class-name: com.mysql.jdbc.Driver
        url: jdbc:mysql://localhost:3306/learn?serverTimezone=CTT&useUnicode=true&characterEncoding=utf-8
    redis:
        host: 127.0.0.1
        port: 6379
        password: 123456
        timeout: 5000
mybatis:
    mapper-locations: classpath:mapping/BookSpiterMapper.xml
    type-aliases-package: com.augmentum.book.bean
dangdang:
    indexUrl: "http://category.dangdang.com/"
resultNum:
    num: 6000
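
The MyBatis settings above reference the bean package com.augmentum.book.bean and the mapper file mapping/BookSpiterMapper.xml, neither of which is listed in this post. Below is a minimal sketch of the entity beans and mapper interface, inferred from how the service code further down uses them; the field names, annotations, and method signatures are assumptions, so check the linked repository for the real definitions (in the project each type sits in its own file).

import lombok.Data;
import org.apache.ibatis.annotations.Mapper;
import java.util.List;

// com.augmentum.book.bean.BookSpiter -- one row per crawled book
@Data
public class BookSpiter {
    private String image;        // cover image URL
    private String name;         // title
    private String description;  // subtitle / short description
    private String author;
    private String isbn;
}

// com.augmentum.book.bean.Urls -- one row per category listing URL
@Data
public class Urls {
    private Integer id;   // primary key, also used as the crawl index
    private String url;   // category listing URL
    private String type;  // category name
}

// com.augmentum.book.dao.BookSpiterMapper -- SQL lives in mapping/BookSpiterMapper.xml
@Mapper
public interface BookSpiterMapper {
    int insertBatch(List<BookSpiter> list);   // batch-insert parsed books
    int insertUrlBatch(List<Urls> urlList);   // batch-insert category URLs
    int selectCount();                        // number of stored category URLs
    Urls selectById(int id);                  // fetch one URL record by its id
}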

  • Controller code
package com.augmentum.book.controller;

import com.augmentum.book.service.BookSpiterService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
@RestController
public class BookSpiterController {
    @Autowired
    private BookSpiterService bookSpiterService;

    // Endpoint that starts the crawl
    @GetMapping("bookSpiter")
    public String getData(){
        bookSpiterService.process();
        return "SUCCESS";
    }

    // Before crawling, first collect the first-level category URLs from the books homepage
    @GetMapping("setIndexUrl")
    public String setIndexUrl(){
        bookSpiterService.setIndexUrl();
        return "SUCCESS";
    }
    // Then collect the second-level category URLs
    @GetMapping("setIndexNextUrl")
    public String setIndexNextUrl(){
        bookSpiterService.setIndexNextUrl();
        return "SUCCESS";
    }
}
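
For reference, the BookSpiterService interface wired into this controller can be sketched from the methods used here and implemented below (the actual interface is in the linked repository). The intended call order is: /setIndexUrl once, then /setIndexNextUrl once to fill the url table, and finally /bookSpiter to start the crawl loop.

package com.augmentum.book.service;

public interface BookSpiterService {
    void process();          // main crawl loop: pop URLs from Redis, parse, persist
    void process1();         // placeholder for a second crawler thread
    void setIndexUrl();      // collect first-level category URLs from the homepage
    void setIndexNextUrl();  // collect second-level category URLs
}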

  • Service layer code
package com.augmentum.book.service.impl;

import com.augmentum.book.bean.BookSpiter;
import com.augmentum.book.bean.Urls;
import com.augmentum.book.dao.BookSpiterMapper;
import com.augmentum.book.service.BookSpiterService;
import com.augmentum.book.util.HttpClientDownPage;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Service;

import javax.annotation.PostConstruct;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

@Slf4j
@Service
public class BookSpiterServiceImpl implements BookSpiterService {

    @Value("${dangdang.indexUrl}")
    private String indexUrl;

    @Autowired
    BookSpiterMapper bookSpiterMapper;

    @Autowired
    private StringRedisTemplate stringRedisTemplate;

    private List<BookSpiter> list = new ArrayList<>();

    private List<Urls> urlList = new ArrayList<>();

    private int successNum = 0;

    private int failedNum = 0;

    private Boolean flag = false;

    private int result = 1;

    // Start seeding Redis from the URL whose id equals index
    private int index = 628;

    @Override
    public void process() {
        // The seed URL is pushed into Redis by adUrl(); keep popping until the crawl ends
        while(true) {
            try {
                // Pop the next URL from Redis
                String nexturl = stringRedisTemplate.opsForList().rightPop("url");

                if(StringUtils.isNotBlank(nexturl)) {
                    // Download the page source with HttpClient
                    String content = HttpClientDownPage.sendGet(nexturl);
                    // Parse it with Jsoup
                    Document document = Jsoup.parse(content);
                    paraseList(document);
                    // Batch-insert the parsed books
                    int i = bookSpiterMapper.insertBatch(list);

                    if (i > 0) {
                        System.out.println("批量插入数据成功:" + i);
                        System.out.println("当前为id=" + index + "的页面:" + nexturl);
                    }

                    list = new ArrayList<>();
                }
            } catch (Exception e) {
                log.error("网页源码解析异常!");
                log.error(e.getMessage());
            }

            if(flag) {
                log.info("爬取结束--flag为true");
                break;
            }
        }

        log.info("-----所有页面爬取结束----");
    }

    @Override
    public void process1() {
        log.info("线程2");
    }

    @Override
    public void setIndexUrl() {
        try {
            // Download the homepage source with HttpClient
            String content = HttpClientDownPage.sendGet(indexUrl);
            // Parse it with Jsoup
            Document document = Jsoup.parse(content);
            Elements elements = document.select("#floor_1 .classify_kind .classify_kind_name");

            for(Element element : elements){
                Elements hrefs = element.select("a[href]");
                String indexUrl = hrefs.attr("href");
                String type = hrefs.text();
                log.info("首页地址:" + indexUrl);
                Urls urls = new Urls();
                urls.setUrl(indexUrl);
                urls.setType(type);
                urlList.add(urls);
            }

            bookSpiterMapper.insertUrlBatch(urlList);
        } catch (Exception e) {
            log.error(e.getMessage());
        }
    }

    @Override
    public void setIndexNextUrl() {
        try {
            // Download the homepage source with HttpClient
            String content = HttpClientDownPage.sendGet(indexUrl);
            // Parse it with Jsoup
            Document document = Jsoup.parse(content);
            Elements elements = document.select("#floor_1 .classify_kind ul");

            for(Element element : elements){
                Elements hrefs = element.select("li[name=cat_3]");
                for(Element element1 : hrefs){
                    Elements href = element1.select("a[href]");
                    String indexUrl = href.attr("href");
                    String type = href.text();
                    Urls urls = new Urls();
                    urls.setUrl(indexUrl);
                    urls.setType(type);
                    urlList.add(urls);
                }
            }

            bookSpiterMapper.insertUrlBatch(urlList);
        } catch (Exception e) {
            log.error(e.getMessage());
        }
    }

    private void paraseList(Document document) throws InterruptedException {
        String baseurl = "http://category.dangdang.com";
        // Parse each listing item and follow its detail link
        Elements elements = document.select("#search_nature_rg ul li");

        for(Element element : elements){
            Elements href = element.select("a[href]");
            String detailUrl = href.attr("href");
            dealBookSpiter(detailUrl);
        }

        /**
         * Parse the next-page link; if present, push its URL back into Redis,
         * otherwise move on to the next category URL from the database.
         */
        Elements nextUrl = document.select(".paging .next").select("a[href]");
        String url = nextUrl.attr("href");

        if(StringUtils.isNotBlank(url)){
            stringRedisTemplate.opsForList().leftPush("url", baseurl + url);
        } else {
            if (index + 1 <= bookSpiterMapper.selectCount()) {
                Urls urls = bookSpiterMapper.selectById(index + 1);
                String indexUrl = urls.getUrl();
                stringRedisTemplate.opsForList().leftPush("url", indexUrl);
                log.info("第" + index + "个页面解析结束5分钟后开启下一个页面爬取...");
                log.info("成功爬取数据条数 :" + successNum);
                log.info("爬取数据失败条数 :" + failedNum);
                Thread.sleep(5000*60);
                successNum = 0;
                failedNum = 0;
                index++;
            } else {
                flag = true;
            }
        }
    }

    public void dealBookSpiter(String detailUrl){

        BookSpiter bookSpiter = new BookSpiter();

        if(StringUtils.isNotBlank(detailUrl)) {
            try {
                // Download the detail page with HttpClient
                String content = HttpClientDownPage.sendGet(detailUrl);
                // Parse it with Jsoup
                Document document = Jsoup.parse(content);
                // Cover image
                Elements element1 = document.select("#largePicDiv img[src$=.jpg]");
                String src = element1.attr("abs:src");
                bookSpiter.setImage(src);
                // Title
                Elements element2 = document.select("#product_info .name_info h1");
                bookSpiter.setName(element2.text());
                // Description
                Elements element3 = document.select("#product_info .name_info h2");
                bookSpiter.setDescription(element3.text());
                // Author
                Elements element4 = document.select("#author");

                if (element4.text().split(":").length > 1) {
                    bookSpiter.setAuthor(element4.text().split(":")[1]);
                } else {
                    bookSpiter.setAuthor(element4.text());
                }

                // ISBN
                Elements element6 = document.select("#detail_describe ul li");

                if (element6.get(4).text().split(":").length > 1) {
                    bookSpiter.setIsbn(element6.get(4).text().split(":")[1].trim());
                } else {
                    bookSpiter.setIsbn(element6.get(4).text());
                }

                // Add the parsed entity to the batch list
                list.add(bookSpiter);
                System.out.println(result + " : " + bookSpiter);
                result += 1;
                successNum ++;
            } catch (Exception e) {
                failedNum ++;
                log.error(e.getMessage());
            }
        }
    }

    @PostConstruct
    public void adUrl(){
        Set<String> keys = stringRedisTemplate.keys("*");
        stringRedisTemplate.delete("url");
        log.info("************清空redis中缓存地址**************");
        //id = index 的url放入作为首次url
        Urls urls = bookSpiterMapper.selectById(index);
        String indexUrl = urls.getUrl();
        stringRedisTemplate.opsForList().leftPush("url", indexUrl);
        log.info("************添加一次当当网首页地址地址**************");
    }

}
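
The HttpClientDownPage utility used throughout the service is not shown in this post. A minimal sketch with Apache HttpClient 4.x might look like the following; the timeout values, User-Agent string, and GBK charset are assumptions here, so refer to the linked repository for the actual implementation.

package com.augmentum.book.util;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpClientDownPage {

    private static final RequestConfig CONFIG = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setSocketTimeout(10000)
            .build();

    // Download the HTML source of the given URL, or return null on failure
    public static String sendGet(String url) {
        HttpGet get = new HttpGet(url);
        get.setConfig(CONFIG);
        // A browser-like User-Agent reduces the chance of being blocked
        get.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Dangdang pages are typically GBK-encoded; decode accordingly
                return EntityUtils.toString(response.getEntity(), "GBK");
            }
        } catch (Exception e) {
            // The caller treats a null/blank result as a failed download
        }
        return null;
    }
}

Creating a fresh client per request keeps the sketch simple; for a long-running crawl, a shared, pooled CloseableHttpClient would be the better choice.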

The full project source code is available at:

https://gitee.com/JasonLee1286791087/book
If requests stall or time out, add the following entries to your hosts file:
221.122.85.248 category.dangdang.com
221.122.86.77 product.dangdang.com
