java使用htmlunit + jsoup 爬网站图片案例(爬虫学习)

本文链接：https://blog.csdn.net/weixin_44684303/article/details/131935240

该文章演示了如何使用htmlunit模拟浏览器行为，结合jsoup解析HTML，从百度图片搜索“风景”关键词并下载图片到本地。主要步骤包括设置htmlunit客户端、输入搜索词、执行滚屏加载更多图片，然后用jsoup提取图片链接并下载。

摘要由CSDN通过智能技术生成

声明

该文章用于自己学习爬虫使用

案例分析

目的: 从百度图片中搜索"风景"并下载图片到本地
思路: 使用htmlunit进行模拟用户操作, 并使用jsoup对数据进行解析,获取到需要的数据后,再下载到本地保存
htmlunit官网
 jsoup官网

操作步骤

使用谷歌浏览器打开百度图片网站 https://image.baidu.com
输入"风景", 点击"百度一下"按钮
页面进行跳转
对当前页面中的图片地址进行获取, 并保存到本地
需要进行鼠标滚轮向下滑动

找网页中对应标签的方式

通过F12
在这里插入图片描述

引入依赖

<dependency>
          <!-- jsoup HTML parser library @ https://jsoup.org/ -->
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.16.1</version>
        </dependency>

        <dependency>
            <!-- HtmlUnit headless browser; this artifact provides the com.gargoylesoftware.htmlunit classes imported by the code below -->
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.41.0</version>
        </dependency>

代码

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Console;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * desc:
 *
 * @author qts
 * @date 2023/7/20 0020
 */
public class CrawlerTest {

    public static void main(String[] args) throws IOException {

        // ======== htmlunit 操作 start ============
        
        // 创建htmlunit 客户端, 指定浏览器(当前为谷歌浏览器)
        WebClient webClient = new WebClient(BrowserVersion.CHROME);

        // 设置客户端配置
        webClient.getOptions().setCssEnabled(false);//（屏蔽)css 因为css并不影响我们抓取数据 反而影响网页渲染效率
        webClient.getOptions().setThrowExceptionOnScriptError(false);//（屏蔽)异常
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//（屏蔽)日志
        webClient.getOptions().setJavaScriptEnabled(true);//加载js脚本
        webClient.getOptions().setTimeout(5000);//设置超时时间
        webClient.getOptions().setRedirectEnabled(true); //允许重定向
        webClient.getCookieManager().setCookiesEnabled(true);//允许cookie
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());//设置ajax

        // 请求网站获取HtmlPage
        HtmlPage htmlPage = webClient.getPage("https://image.baidu.com/");
        // 获取输入框对应的表单
        HtmlForm homeSearchForm = (HtmlForm) htmlPage.getElementById("homeSearchForm");
        // 获取对应输入框
        HtmlTextInput searchInput = (HtmlTextInput) homeSearchForm.getElementsByAttribute("input", "id", "kw").get(0);
        // 设置输入框内容"风景"
        searchInput.setValueAttribute("风景");

        // 获取搜索按钮"百度一下"
        HtmlSubmitInput searchSubmitInput = htmlPage.querySelector("input.s_newBtn");
        // 点击按钮
        HtmlPage resultHome = searchSubmitInput.click();

        // 执行js 向下滚屏(因为页面的图片是通过滚屏进行刷新的,不滚屏后面的图片没有对应的地址数据) 
        // window.scrollTo(0, document.documentElement.scrollHeight) 滑动到底部,可以在页面F12控制台中执行代码测试
        resultHome.executeJavaScript("window.scrollTo(0,6000)");// 执行js 向下滚屏,自行设置对应值,当前仅做测试使用
		
        //主线程休眠10秒 让客户端有时间执行js代码
        webClient.waitForBackgroundJavaScript(10000);

        // ======== htmlunit 操作 end ============

        // ======== jsoup 解析 start ============

        // 解析html页面得到 Document
        Document doc = Jsoup.parse(resultHome.asXml());

        // 获取图片 img元素
        Elements elements = doc.select("img[class='main_img img-hover']");


        // 处理图片
        for (int i = 0; i < elements.size(); i++) {
            Element element = elements.get(i);
            String url = element.attr("src");

            if (!url.startsWith("http")) {
                // 有些广告是用的base64数据,进行排除
                continue;
            }

            InputStream inputStream = getFileInputStream(url);

            if (inputStream != null) {
                FileUtil.writeBytes(readBytes(inputStream),"D:\\baidu_pic\\pic_"+i+".png");
            }
        }

        // ======== jsoup 解析 end ============
    }

    /*读取网络文件*/
    public static InputStream getFileInputStream(String path) {
        URL url;
        try {
            url = new URL(path);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            //设置超时间为3秒
            conn.setConnectTimeout(3 * 1000);
            //防止屏蔽程序抓取而返回403错误
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            //得到输入流
            return conn.getInputStream();
        } catch (Exception e) {
            Console.error("读取网络文件异常:" + path);
        }
        return null;
    }

    /**
     * 读取输入流到字节数组
     *
     * @param in
     * @return
     * @throws IOException
     */
    public static byte[] readBytes(InputStream in) throws IOException {
        //读取字节的缓冲
        byte[] buffer = new byte[1024];
        //最终的数据
        byte[] result = new byte[0];
        int size = 0;
        while ((size = in.read(buffer)) != -1) {
            int oldLen = result.length;
            byte[] tmp = new byte[oldLen + size];
            if (oldLen > 0) {//copy 旧字节
                System.arraycopy(result, 0, tmp, 0, oldLen);
            }
            //copy 新字节
            System.arraycopy(buffer, 0, tmp, oldLen, size);

            result = tmp;
        }
        return result;
    }

}

htmlunit使用参考: https://blog.csdn.net/weixin_44787678/article/details/106994485