JAVA爬虫案例——JSOUP爬取图片并使用v-viewer实现图片预览

前言

网络爬虫是大数据时代收集数据的一种有效手段,合理合法地运用技术手段获取网络数据、实现数据的再利用,也是程序员的一项重要技能。本节我们使用Java环境下的jsoup实现网络数据的爬取,主要是图片数据的异步爬取,并实现网络图片的下载及预览功能,其中预览功能使用功能丰富的v-viewer实现。

正文

  • 引入爬虫pom工具包
<!-- Apache HttpClient. Not referenced by the code shown in this article; presumably used by the image download helper (confirm before removing). -->
<dependency>
	<groupId>org.apache.httpcomponents</groupId>
	<artifactId>httpclient</artifactId>
	<version>4.5.6</version>
</dependency>
<!-- jsoup: used by the crawler service to fetch pages and parse their HTML. -->
<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.14.2</version>
</dependency>

  •  爬虫案例后端控制层controller
package com.yundi.atp.platform.module.test.controller;


import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.yundi.atp.platform.common.Result;
import com.yundi.atp.platform.module.test.entity.SpiderData;
import com.yundi.atp.platform.module.test.service.SpiderDataService;
import io.swagger.annotations.Api;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import sun.misc.BASE64Encoder;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ThreadPoolExecutor;

/**
 * REST endpoints for the image-crawler demo: start a crawl, page through the
 * crawled records, download images and return a downloaded image for preview.
 *
 * @author yanp
 * @since 2021-09-10
 */
@Api(tags = {"爬虫抓取案例API"})
@RestController
@RequestMapping("/test/spiderData")
public class SpiderDataController {
    @Autowired
    private SpiderDataService spiderDataService;
    @Autowired
    private ThreadPoolExecutor threadPoolExecutor;

    /**
     * Submits the crawl to the injected thread pool and returns immediately.
     * A successful response therefore only means the crawl was scheduled,
     * not that it has finished.
     *
     * @return a success result
     */
    @GetMapping(value = "startSpiderData")
    public Result startSpiderData() {
        CompletableFuture.runAsync(spiderDataService::startSpiderData, threadPoolExecutor);
        return Result.success();
    }

    /**
     * Returns one page of crawled records, optionally filtered by a LIKE match
     * on the source image name.
     *
     * @param spiderData request body carrying the page settings and the optional name filter
     * @return a success result wrapping the page
     */
    @PostMapping(value = "listPage")
    public Result listPage(@RequestBody SpiderData spiderData) {
        String imageName = spiderData.getSrcImageName();
        // The LIKE condition is only added when a non-blank name was supplied.
        QueryWrapper<SpiderData> wrapper = new QueryWrapper<SpiderData>()
                .like(StringUtils.isNotBlank(imageName), "src_image_name", imageName);
        Page<SpiderData> page = spiderDataService.page(spiderData.getPage(), wrapper);
        return Result.success(page);
    }

    /**
     * Downloads the image of a single record.
     *
     * @param id record id
     * @return a success result
     */
    @GetMapping(value = "download/{id}")
    public Result download(@PathVariable(value = "id") String id) {
        spiderDataService.download(id);
        return Result.success();
    }

    /**
     * Downloads the images of the records selected by the service's batch query.
     *
     * @return a success result
     */
    @GetMapping(value = "batchDownload")
    public Result batchDownload() {
        spiderDataService.batchDownload();
        return Result.success();
    }

    /**
     * Returns the stored image of a record as a Base64 string.
     * <p>
     * The whole file is read with {@link Files#readAllBytes} (no reliance on
     * {@code available()} or a single {@code read()} call, and no stream left
     * open on failure) and encoded with {@link Base64}, which produces a single
     * line without the line breaks the JDK-internal encoder inserted.
     *
     * @param id record id
     * @return a success result wrapping the Base64-encoded file content
     * @throws FileNotFoundException if the record does not exist or has no stored file path
     * @throws IOException           if the stored file cannot be read
     */
    @GetMapping(value = "preview/{id}")
    public Result preview(@PathVariable(value = "id") String id) throws IOException {
        SpiderData spiderData = spiderDataService.getById(id);
        if (spiderData == null || StringUtils.isBlank(spiderData.getStoreAddress())) {
            throw new FileNotFoundException("No downloaded image is stored for record id=" + id);
        }
        byte[] bytes = Files.readAllBytes(new File(spiderData.getStoreAddress()).toPath());
        return Result.success(Base64.getEncoder().encodeToString(bytes));
    }
}

  •  爬虫案例后端业务层
package com.yundi.atp.platform.module.test.service.impl;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.yundi.atp.platform.module.test.entity.SpiderData;
import com.yundi.atp.platform.module.test.mapper.SpiderDataMapper;
import com.yundi.atp.platform.module.test.service.SpiderDataService;
import com.yundi.atp.platform.spider.MzituImageSpider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.List;

/**
 * <p>
 * 服务实现类
 * </p>
 *
 * @author yanp
 * @since 2021-09-10
 */
@Slf4j
@Service
public class SpiderDataServiceImpl extends ServiceImpl<SpiderDataMapper, SpiderData> implements SpiderDataService {

    @Override
    public void startSpiderData() {
        try {
            //1.创建连接
            String url = "https://www.mzitu.com";
            Connection conn = Jsoup.connect(url).timeout(50000);
            conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
            conn.header("referer", url);
            //2.获取栏目图片连接
            Document document = conn.get();
            Element element = document.getElementById("menu-nav");
            Elements elements = element.getElementsByTag("a");
            List<String> linkList = elements.eachAttr("abs:href");
            linkList.remove(0);
            linkList.forEach(it -> {
                Connection connect = Jsoup.connect(it).timeout(50000);
                connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
                connect.header("referer", it);
                try {
                    Document doc = connect.get();
                    //3.获取栏目图片最大分页页数
                    Elements docElements = doc.getElementsByClass("nav-links");
                    Element docElement = docElements.get(0);
                    Elements tag = docElement.getElementsByTag("a");
                    List<String> tagValues = tag.eachText();
                    tagValues.remove(tagValues.size() - 1);
                    Integer maxPage = Integer.parseInt(tagValues.get(tagValues.size() - 1));
                    log.info("--------------------开始解析下载图片---------------------------");
                    String imageUrl = it.substring(0, it.length() - 1);
                    String imageCategory = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
                    //4.开始爬取图片
                    for (int i = 1; i <= maxPage; i++) {
                        String imgUrl = it + "page/" + i + "/";
                        Connection imgConn = Jsoup.connect(imgUrl).timeout(50000);
                        imgConn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
                        imgConn.header("referer", imgUrl);
                        try {
                            Document imgDocument = imgConn.get();
                            Element pins = imgDocument.getElementById("pins");
                            Elements imgTag = pins.getElementsByTag("img");
                            for (Element ele : imgTag) {
                                SpiderData spiderData = new SpiderData();
                                spiderData.setSrcLink(imgUrl);
                                spiderData.setSrcImageName(ele.attr("alt"));
                                spiderData.setSrcImageAddress(ele.attr("data-original"));
                                spiderData.setSrcImageCategory(imageCategory);
                                this.save(spiderData);
                            }
                        } catch (IOException ioException) {
                            log.error("获取资源失败:" + ioException);
                        }
                        Thread.sleep(5000);
                    }
                } catch (IOException | InterruptedException ioException) {
                    log.error("获取资源失败:" + ioException);
                }
            });
        } catch (IOException ioException) {
            log.error("获取资源失败:" + ioException);
        }
    }

    @Override
    public void download(String id) {
        SpiderData spiderData = this.getById(id);
        String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress());
        if (path != null) {
            spiderData.setStoreAddress(path);
            spiderData.setStatus(true);
            this.updateById(spiderData);
        }
    }

    @Override
    public void batchDownload() {
        List<SpiderData> list = this.list(new QueryWrapper<SpiderData>().eq("status", 0));
        for (SpiderData spiderData : list) {
            String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress());
            if (path != null) {
                spiderData.setStoreAddress(path);
                spiderData.setStatus(true);
                this.updateById(spiderData);
            }
        }
    }
}

  • 爬虫案例前端安装v-viewer预览组件

命令:npm i v-viewer -S

  •  爬虫案例前端main.js中引入v-viewer
// Application entry point: imports the UI, editor, player and viewer plugins,
// registers them on Vue, and mounts the root component on '#app'.
import Vue from 'vue';
import ElementUI from 'element-ui';
import 'element-ui/lib/theme-chalk/index.css';
import router from '@/router';
import {http} from '@/axios/index';
import qs from 'qs';
// Imported for its side effects only (no bindings are used here).
import '@/util/derective'
import App from '@/App.vue';
import Print from 'vue-print-nb'
import VideoPlayer from 'vue-video-player'
// Stylesheets for the video player
require('video.js/dist/video-js.css')
require('vue-video-player/src/custom-theme.css')
import VueQuillEditor from 'vue-quill-editor'

// Stylesheets for the Quill editor
import 'quill/dist/quill.core.css'
import 'quill/dist/quill.snow.css'
import 'quill/dist/quill.bubble.css'

import mavonEditor from 'mavon-editor'
import 'mavon-editor/dist/css/index.css'
import TinymceVueH from 'tinymce-vue-h'
// v-viewer image preview plugin and the viewerjs stylesheet it needs
import Viewer from 'v-viewer'
import 'viewerjs/dist/viewer.css'


// Register v-viewer; this makes the <viewer> component used in Spider.vue available.
Vue.use(Viewer);
// NOTE(review): the options below are nested under an `Options` key. v-viewer's
// documented usage passes the viewer options object itself to setDefaults (or as
// `defaultOptions` to Vue.use), so these settings may not take effect — confirm.
Viewer.setDefaults({
    Options: { "inline": true, "button": true, "navbar": true, "title": true, "toolbar": true, "tooltip": true, "movable": true, "zoomable": true, "rotatable": true, "scalable": true, "transition": true, "fullscreen": true, "keyboard": true, "url": "data-source" }
});
Vue.use(TinymceVueH);
// Register the remaining plugins
Vue.use(mavonEditor);
Vue.use(VueQuillEditor, /* { default global options } */);
Vue.use(VideoPlayer);
Vue.use(Print);
Vue.use(ElementUI);
// Expose the HTTP client and qs on every component as this.$http / this.$qs
Vue.prototype.$http = http;
Vue.prototype.$qs = qs;
Vue.config.productionTip = false;

// Create the root instance with the router and mount it
new Vue({
    router,
    render: h => h(App),
}).$mount('#app')

  •  爬虫案例前端Spider.vue
<template>
  <div class="container">
    <!-- Page header with an exit link -->
    <div class="title">
      <span>爬虫案例(以https://www.mzitu.com网站为例)</span>
      <el-divider direction="vertical"></el-divider>
      <router-link to="home">
        <span style="font-size: 18px;">退出</span>
      </router-link>
    </div>
    <el-divider>Test Staring</el-divider>
    <!-- Toolbar: name filter, start crawl, search, batch download -->
    <el-form :inline="true" :model="query">
      <el-form-item>
        <el-input v-model="query.srcImageName" placeholder="源图片名称" clearable></el-input>
      </el-form-item>
      <el-form-item>
        <el-button type="primary" @click="startSpider">启动爬虫</el-button>
      </el-form-item>
      <el-form-item>
        <el-button type="success" @click="search">查询</el-button>
      </el-form-item>
      <el-form-item>
        <el-button type="success" @click="batchDownload">批量下载</el-button>
      </el-form-item>
    </el-form>
    <!-- Crawled records -->
    <el-table
        :data="data"
        border
        stripe
        v-loading="loading"
        element-loading-text="数据加载中...">
      <el-table-column
          prop="id"
          label="ID">
      </el-table-column>
      <el-table-column
          prop="srcLink"
          label="源网址">
      </el-table-column>
      <el-table-column
          prop="srcImageName"
          label="源图片名称">
      </el-table-column>
      <el-table-column
          prop="srcImageAddress"
          label="源图片地址">
      </el-table-column>
      <el-table-column
          prop="srcImageCategory"
          label="源图片分类">
      </el-table-column>
      <el-table-column
          label="状态">
        <template slot-scope="scope">
          <el-tag size="medium" v-show="scope.row.status" type="success" effect="dark">已下载</el-tag>
          <el-tag size="medium" v-show="!scope.row.status" type="warning" effect="dark">未下载</el-tag>
        </template>
      </el-table-column>
      <el-table-column
          prop="storeAddress"
          label="存储地址">
      </el-table-column>
      <!-- Rows without a stored file offer "download"; rows with one offer "preview" -->
      <el-table-column label="操作" align="left">
        <template slot-scope="scope">
          <el-button type="text" @click="download(scope.row)" v-if="!scope.row.storeAddress">下载</el-button>
          <el-button type="text" @click="preview(scope.row)" v-if="scope.row.storeAddress">预览</el-button>
        </template>
      </el-table-column>
    </el-table>
    <!-- page-sizes is bound to query.page.pageSizes, the key defined in data() -->
    <el-pagination layout="total,sizes,prev,pager,next,jumper"
                   @size-change="handlerSizeChange"
                   @current-change="handlerCurrentChange"
                   :current-page="query.page.current"
                   :page-sizes="query.page.pageSizes"
                   :page-size="query.page.size"
                   :total="query.page.total"
                   class="page"
                   background>
    </el-pagination>
    <!-- Preview dialog: the image is wrapped in v-viewer's <viewer> component -->
    <el-dialog
        :visible.sync="show">
      <div style="text-align: center;">
        <viewer>
          <img :src="imgSrc">
        </viewer>
      </div>
    </el-dialog>
  </div>
</template>

<script>
/**
 * Crawler demo page: lists crawled image records, starts the crawl,
 * triggers single/batch downloads and previews a downloaded image.
 */
export default {
  name: "Spider",
  data() {
    return {
      // Rows of the current result page
      data: [],
      // Drives the table's v-loading overlay while a page is being fetched
      loading: false,
      // Sent as the request body of listPage (filter fields plus paging state)
      query: {
        page: {
          total: 0,
          current: 1,
          size: 10,
          pageSizes: [10, 50, 100, 500],
        }
      },
      // Visibility of the preview dialog
      show: false,
      // Data URI of the image shown in the preview dialog
      imgSrc: '',
    }
  },
  created() {
    this.listPage();
  },
  methods: {
    // Shows a request failure as text; $message expects a string, not an Error object.
    showError(error) {
      this.$message.error((error && error.message) || String(error));
    },
    // Asks the backend to start crawling. The backend only schedules the crawl
    // and answers immediately, so the message reports "started", not "finished".
    startSpider() {
      this.$http.get('/test/spiderData/startSpiderData').then(res => {
        if (res.data.code === 1) {
          this.$message.success("爬虫已启动!");
          this.search();
        } else {
          this.$message.warning(res.data.msg);
        }
      }).catch(this.showError);
    },
    // Runs the query from the first page.
    search() {
      this.query.page.current = 1;
      this.listPage();
    },
    // Loads the current page and keeps the loading overlay in sync with the request.
    listPage() {
      this.loading = true;
      this.$http.post('/test/spiderData/listPage', this.query).then(res => {
        this.loading = false;
        if (res.data.code === 1) {
          this.data = res.data.data.records;
          this.query.page.total = res.data.data.total;
        } else {
          this.$message.warning(res.data.msg);
        }
      }).catch(error => {
        this.loading = false;
        this.showError(error);
      });
    },
    // Downloads the image of one row, then refreshes the list.
    download(data) {
      this.$http.get('/test/spiderData/download/' + data.id).then(res => {
        if (res.data.code === 1) {
          this.$message.success("下载完成!");
          this.search();
        } else {
          this.$message.warning(res.data.msg);
        }
      }).catch(this.showError);
    },
    // Downloads all pending images, then refreshes the list.
    batchDownload() {
      this.$http.get('/test/spiderData/batchDownload').then(res => {
        if (res.data.code === 1) {
          this.$message.success("下载完成!");
          this.search();
        } else {
          this.$message.warning(res.data.msg);
        }
      }).catch(this.showError);
    },
    // Fetches the Base64 content of a downloaded image and opens the preview dialog.
    preview(data) {
      this.$http.get('/test/spiderData/preview/' + data.id).then(res => {
        if (res.data.code === 1) {
          this.show = true;
          this.imgSrc = "data:image/png;base64," + res.data.data;
        } else {
          this.$message.warning(res.data.msg);
        }
      }).catch(this.showError);
    },
    // Pagination: page size changed
    handlerSizeChange(data) {
      this.query.page.size = data;
      this.listPage();
    },
    // Pagination: current page changed
    handlerCurrentChange(data) {
      this.query.page.current = data;
      this.listPage();
    }
  }
}
</script>

<style scoped lang="scss">
/* Page wrapper */
.container {
  padding: 10px;
}

/* Links inside the page are shown without underline */
.container a {
  text-decoration: none;
}

/* Header line */
.container .title {
  font-size: 20px;
  font-weight: bold;
}

/* Pagination bar, aligned to the right below the table */
.container .page {
  float: right;
  margin-top: 20px;
}
</style>

  • 验证效果

结语

关于Java环境下使用jsoup实现网络图片的爬取、下载及预览功能的介绍到这里就结束了,下期见……

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

厉害哥哥吖

您的支持是我创作下去的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值