使用jsoup简单爬取微信公众号一些图片

最新推荐文章于 2023-07-07 11:27:00 发布

我哪会这个啊

最新推荐文章于 2023-07-07 11:27:00 发布

阅读量678

点赞数 1

文章标签： java 爬虫

本文链接：https://blog.csdn.net/qq_45243783/article/details/119303190

版权

1.新建一个maven项目，在pom.xml中导入如下依赖

<!-- jsoup: provides the org.jsoup classes (Jsoup, Document, Element, Elements) imported by the Java example. -->
<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
 </dependency>

2.准备一个微信公众号文章的在线url地址，例如“https://mp.weixin.qq.com/s/YPrqMOYYrAtCni2VT8c4jA”。打开该网页，按F12进行调试，找到图片所在的位置。
如下图所示，可以发现图片都包含在p标签里面，而p标签又包含在一个大的div元素中：
在这里插入图片描述
因此，编写如下java代码进行解析

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads every image referenced inside the {@code js_content} element of a
 * WeChat official-account article and stores them as sequentially numbered
 * {@code .jpg} files in a local directory.
 */
public class Test {

    /** Directory the downloaded images are written to. */
    private static final String OUTPUT_DIR = "E:\\jsoup_picture\\";

    /** Timeout, in milliseconds, used for the page fetch and for each image download. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Size of the buffer used when copying an image to disk, in bytes. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Fetches the article page, collects the {@code <img>} elements found under the
     * element with id {@code js_content}, and downloads each one.
     *
     * @param args unused
     * @throws IOException if the page or an image cannot be fetched, or a file cannot be written
     * @throws IllegalStateException if the page has no element with id {@code js_content}
     */
    public static void main(String[] args) throws IOException {

        // URL of the article to scrape
        String url = "https://mp.weixin.qq.com/s/YPrqMOYYrAtCni2VT8c4jA";
        // Fetch the page and parse it into a document
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);
        // The article body is the element whose id is js_content
        Element jsContent = document.getElementById("js_content");
        if (jsContent == null) {
            // Fail with a clear message instead of a bare NullPointerException
            throw new IllegalStateException("No element with id \"js_content\" found at " + url);
        }

        // Make sure the target directory exists before opening any output file
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory " + outputDir);
        }

        // All <img> elements inside the article body
        Elements imgs = jsContent.getElementsByTag("img");
        int id = 0;
        for (Element img : imgs) {
            // The image address is read from the data-src attribute
            String attr = img.attr("data-src");
            if (attr.isEmpty()) {
                // No data-src on this <img>; new URL("") would throw, so skip it
                continue;
            }
            id++;
            download(new URL(attr), new File(outputDir, id + ".jpg"));
            System.out.println(id + ".jpg下载完毕");
        }

    }

    /**
     * Copies the content served at {@code source} into {@code destination}.
     *
     * @param source      address of the image to download
     * @param destination file to write; overwritten if it already exists
     * @throws IOException if the connection fails or the file cannot be written
     */
    private static void download(URL source, File destination) throws IOException {
        URLConnection connection = source.openConnection();
        // Without explicit timeouts a stalled server would block the program forever
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        // try-with-resources closes both streams even when the copy fails midway
        try (InputStream in = connection.getInputStream();
             FileOutputStream out = new FileOutputStream(destination)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

}