java爬取图片

满秩

已于 2022-04-26 12:57:42 修改

阅读量1.6k

点赞数 5

文章标签： java 爬虫

于 2022-04-26 12:36:06 首次发布

本文链接：https://blog.csdn.net/asskjj/article/details/124424490

版权

开发环境

IntelliJ IDEA Community Edition 2021.1.3
Jsoup

分析

打开王者荣耀官网

按F12使用开发者工具检查元素

首先找到class="herolist clearfix"的ul标签

再找到对应英雄的li标签

其中a标签的文本内容就是该英雄的名字，a标签的href就是该英雄详细信息的相对位置

我们可以先获取a标签中href的相对地址，再和"https://pvp.qq.com/web201605/"拼接，得到该英雄详细信息页面的完整地址

在新获取的地址中继续按下F12进入开发者模式检查元素

其中class="zk-con1 zk-con"的div标签的style属性里，background中url(...)内的地址即为背景图片地址。

以桑启英雄为例，该英雄图片地址为//game.gtimg.cn/images/yxzj/img201606/skin/hero-info/534/534-bigskin-1.jpg

在网页中尝试访问一下

效果如下：

代码实现

导入相关依赖

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

使用jsoup获取网页代码

这里只获得了10张图片

/**
     * Crawls the hero list page and saves the background image of the first
     * ten heroes into the folder named by {@code path}.
     *
     * <p>For every {@code li} under the element whose class attribute is
     * {@code herolist clearfix}, the hero name and the relative detail-page
     * link are read from the {@code a} tag. The detail page is then fetched and
     * the image address is taken from the {@code style} attribute of the
     * element whose class attribute is {@code zk-con1 zk-con}.
     *
     * @throws IOException if the list page or a detail page cannot be fetched
     */
    public static void parser() throws IOException {
        String url = "https://pvp.qq.com/web201605/herolist.shtml";
        String path = "D://desk/爬取图片/";
        String userAgent = "Mozilla/4.0 (compatible;MSIE 9.0 ;Windows NT 6.1;Trident/5.0)";
        // Upper bound on the number of images fetched in one run.
        int maxImages = 10;
        int downloaded = 0;

        // Fetch the complete HTML of the hero list page.
        Document document = Jsoup.connect(url).userAgent(userAgent).get();
        Elements selectUL = document.select("[class = herolist clearfix]"); // the ul tag
        Elements selectLI = selectUL.select("li");
        for (Element e : selectLI) {
            // Check the limit before doing any work so exactly maxImages are fetched.
            if (downloaded >= maxImages) {
                break;
            }
            Elements link = e.select("a");
            String heroName = link.text();      // hero name, used as the file name
            String heroHref = link.attr("href"); // detail page location, relative to the site folder
            if (heroName.isEmpty() || heroHref.isEmpty()) {
                // Nothing usable in this list entry; skip it instead of requesting a bogus page.
                continue;
            }

            // Fetch the hero's detail page.
            String heroURL = "https://pvp.qq.com/web201605/" + heroHref;
            Document doc = Jsoup.connect(heroURL).userAgent(userAgent).get();
            String style = doc.select("[class=zk-con1 zk-con]").attr("style");

            String imageUrl = extractBackgroundUrl(style);
            if (imageUrl == null) {
                System.err.println("No background image found for " + heroName + " at " + heroURL);
                continue;
            }
            // The page may use a protocol-relative address ("//host/path"); give it a scheme.
            if (imageUrl.startsWith("//")) {
                imageUrl = "https:" + imageUrl;
            }

            download(imageUrl, path + heroName + ".jpg");
            downloaded++;
        }
    }

    /**
     * Returns the text between {@code url(} and the next {@code )} in a CSS
     * style string, with surrounding whitespace and one pair of matching
     * quotes removed, or {@code null} when no such non-empty value exists.
     */
    private static String extractBackgroundUrl(String style) {
        if (style == null) {
            return null;
        }
        int open = style.indexOf("url(");
        if (open < 0) {
            return null;
        }
        int start = open + "url(".length();
        int close = style.indexOf(')', start);
        if (close < 0) {
            return null;
        }
        String raw = style.substring(start, close).trim();
        if (raw.length() >= 2) {
            char first = raw.charAt(0);
            char last = raw.charAt(raw.length() - 1);
            if ((first == '\'' || first == '"') && first == last) {
                raw = raw.substring(1, raw.length() - 1).trim();
            }
        }
        return raw.isEmpty() ? null : raw;
    }

path即为文件要保存的文件夹

url即为英雄列表页面的地址

下载函数

   /**
     * Downloads the resource at {@code image} and writes it to the file
     * {@code path}.
     *
     * <p>Both streams are opened in a try-with-resources statement so they are
     * closed even when reading or writing fails. The data is streamed straight
     * to the file instead of being collected in memory first.
     *
     * <p>Failures are reported on standard error and not rethrown, so one
     * failed image does not stop the caller.
     *
     * @param image absolute URL of the image to download
     * @param path  destination file, including the file name
     */
    public static void download(String image,String path){
        try {
            URL url = new URL(image);
            // Resources are closed in reverse order: the file first, then the connection stream.
            try (DataInputStream dataInputStream = new DataInputStream(url.openStream());
                 FileOutputStream fileOutputStream = new FileOutputStream(path)) {
                byte[] buffer = new byte[4096];
                int length;
                // read() returns -1 once the end of the stream is reached.
                while ((length = dataInputStream.read(buffer)) != -1) {
                    fileOutputStream.write(buffer, 0, length);
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid image URL: " + image);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to download " + image + " to " + path);
            e.printStackTrace();
        }
    }

主函数

 /**
     * Program entry point: prints a start marker, runs {@code parser()}, then
     * prints an end marker.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if {@code parser()} fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        final String startMarker = "开始";
        final String endMarker = "结束";

        System.out.println(startMarker);
        parser();
        System.out.println(endMarker);
    }

全部代码

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;


/**
 * Small crawler that reads the hero list page of pvp.qq.com with Jsoup and
 * saves one background image per hero to a local folder.
 */
public class demo01 {

    /**
     * Program entry point: prints a start marker, runs {@link #parser()}, then
     * prints an end marker.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        System.out.println("开始");
        parser();
        System.out.println("结束");
    }

    /**
     * Downloads the resource at {@code image} and writes it to the file
     * {@code path}.
     *
     * <p>Both streams are opened in a try-with-resources statement so they are
     * closed even when reading or writing fails. The data is streamed straight
     * to the file instead of being collected in memory first.
     *
     * <p>Failures are reported on standard error and not rethrown, so one
     * failed image does not stop the caller.
     *
     * @param image absolute URL of the image to download
     * @param path  destination file, including the file name
     */
    public static void download(String image, String path) {
        try {
            URL url = new URL(image);
            // Resources are closed in reverse order: the file first, then the connection stream.
            try (DataInputStream dataInputStream = new DataInputStream(url.openStream());
                 FileOutputStream fileOutputStream = new FileOutputStream(path)) {
                byte[] buffer = new byte[4096];
                int length;
                // read() returns -1 once the end of the stream is reached.
                while ((length = dataInputStream.read(buffer)) != -1) {
                    fileOutputStream.write(buffer, 0, length);
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid image URL: " + image);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to download " + image + " to " + path);
            e.printStackTrace();
        }
    }

    /**
     * Crawls the hero list page and saves the background image of the first
     * ten heroes into the folder named by {@code path}.
     *
     * <p>For every {@code li} under the element whose class attribute is
     * {@code herolist clearfix}, the hero name and the relative detail-page
     * link are read from the {@code a} tag. The detail page is then fetched and
     * the image address is taken from the {@code style} attribute of the
     * element whose class attribute is {@code zk-con1 zk-con}.
     *
     * @throws IOException if the list page or a detail page cannot be fetched
     */
    public static void parser() throws IOException {
        String url = "https://pvp.qq.com/web201605/herolist.shtml";
        String path = "D://desk/爬取图片/";
        String userAgent = "Mozilla/4.0 (compatible;MSIE 9.0 ;Windows NT 6.1;Trident/5.0)";
        // Upper bound on the number of images fetched in one run.
        int maxImages = 10;
        int downloaded = 0;

        // Fetch the complete HTML of the hero list page.
        Document document = Jsoup.connect(url).userAgent(userAgent).get();
        Elements selectUL = document.select("[class = herolist clearfix]"); // the ul tag
        Elements selectLI = selectUL.select("li");
        for (Element e : selectLI) {
            // Check the limit before doing any work so exactly maxImages are fetched.
            if (downloaded >= maxImages) {
                break;
            }
            Elements link = e.select("a");
            String heroName = link.text();      // hero name, used as the file name
            String heroHref = link.attr("href"); // detail page location, relative to the site folder
            if (heroName.isEmpty() || heroHref.isEmpty()) {
                // Nothing usable in this list entry; skip it instead of requesting a bogus page.
                continue;
            }

            // Fetch the hero's detail page.
            String heroURL = "https://pvp.qq.com/web201605/" + heroHref;
            Document doc = Jsoup.connect(heroURL).userAgent(userAgent).get();
            String style = doc.select("[class=zk-con1 zk-con]").attr("style");

            String imageUrl = extractBackgroundUrl(style);
            if (imageUrl == null) {
                System.err.println("No background image found for " + heroName + " at " + heroURL);
                continue;
            }
            // The page may use a protocol-relative address ("//host/path"); give it a scheme.
            if (imageUrl.startsWith("//")) {
                imageUrl = "https:" + imageUrl;
            }

            download(imageUrl, path + heroName + ".jpg");
            downloaded++;
        }
    }

    /**
     * Returns the text between {@code url(} and the next {@code )} in a CSS
     * style string, with surrounding whitespace and one pair of matching
     * quotes removed, or {@code null} when no such non-empty value exists.
     */
    private static String extractBackgroundUrl(String style) {
        if (style == null) {
            return null;
        }
        int open = style.indexOf("url(");
        if (open < 0) {
            return null;
        }
        int start = open + "url(".length();
        int close = style.indexOf(')', start);
        if (close < 0) {
            return null;
        }
        String raw = style.substring(start, close).trim();
        if (raw.length() >= 2) {
            char first = raw.charAt(0);
            char last = raw.charAt(raw.length() - 1);
            if ((first == '\'' || first == '"') && first == last) {
                raw = raw.substring(1, raw.length() - 1).trim();
            }
        }
        return raw.isEmpty() ? null : raw;
    }
}