htmlUnit的读取js渲染的页面

Wzy000001

于 2022-04-21 15:38:16 发布

阅读量3.1k

点赞数

分类专栏： java 文章标签： spring boot java

本文链接：https://blog.csdn.net/Wzy000001/article/details/124323364

版权

java 专栏收录该内容

25 篇文章 0 订阅

订阅专栏

htmlUnit的读取js渲染的页面

我这边有个这样的需求：要求前端每次发版本的时候，都带有一个版本号。然后后端去爬取前端页面，与禅道的版本发布计划做比较，如果发布计划和生产上的版本号不一致，就告警。

这个跟后端的jar包，每次打包，读取该分支的git号，然后打包到jar包一样，这样每次发版就知道当前的版本是什么版本了。

前端那边摸索出来了，但是读取的时候出了点问题。前端提供的页面，如果用浏览器去浏览，是很正常的。然后用httpclient去读取，发现没有对应的标签。

很明显，这是因为该页面结构是js渲染后才生成的，而httpclient只会拿到原始的html，不会执行js，所以就得换工具了。

百度了下，发现java用的是htmlunit。

工具类是这样。

/**
 * Helper for fetching pages whose DOM is produced by JavaScript.
 *
 * <p>A plain HTTP client only sees the initial HTML, so this class drives a headless
 * HtmlUnit browser, lets the page's scripts run, and hands the rendered markup to jsoup.
 */
public class MyHtmlUnitHelper {

    /** Upper bound, in milliseconds, to wait for background (asynchronous) JavaScript. */
    private static final long BACKGROUND_JS_TIMEOUT_MS = 30000L;

    /**
     * Loads a dynamically rendered page in a headless browser and parses the result.
     *
     * @param requestUrl URL of the page to load
     * @return the rendered page parsed as a jsoup {@code Document}
     * @throws MalformedURLException if {@code requestUrl} is not a valid URL
     * @throws InterruptedException kept in the signature for backward compatibility with
     *         existing callers
     * @throws IllegalStateException if the page cannot be loaded or is not an HTML page
     */
    public static Document accordingToURLGetBrowserHtml(String requestUrl) throws InterruptedException, MalformedURLException {

        // try-with-resources: the client must stay open until the background JavaScript
        // has finished and the DOM has been serialized, and must be closed afterwards.
        try (final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_ESR)) { // emulate Firefox ESR

            webClient.getOptions().setThrowExceptionOnScriptError(false); // do not fail on JS errors
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not fail on non-200 responses
            webClient.getOptions().setActiveXNative(false); // ActiveX is not needed
            webClient.getOptions().setCssEnabled(false); // nothing is displayed, so CSS is not needed
            webClient.getOptions().setJavaScriptEnabled(true); // essential: the page is rendered by JS
            webClient.getOptions().setDownloadImages(false); // images are not needed
            webClient.getOptions().setUseInsecureSSL(true); // accept any SSL certificate
            webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // essential: AJAX support

            // Receive the page as Object first so that a non-HTML response is reported
            // explicitly instead of failing with a ClassCastException.
            final Object loaded;
            try {
                loaded = webClient.getPage(requestUrl);
            } catch (MalformedURLException e) {
                throw e;
            } catch (IOException e) {
                throw new IllegalStateException("Failed to load page: " + requestUrl, e);
            }
            if (!(loaded instanceof HtmlPage)) {
                throw new IllegalStateException("Response is not an HTML page: " + requestUrl);
            }
            final HtmlPage page = (HtmlPage) loaded;

            // Asynchronous JS takes time; block (up to the timeout) until it has finished.
            webClient.waitForBackgroundJavaScript(BACKGROUND_JS_TIMEOUT_MS);

            // Serialize the rendered DOM and let jsoup parse it.
            final String pageXml = page.asXml();
            return Jsoup.parse(pageXml);
        }
    }

    /**
     * Demo entry point: loads the version page and prints the text of its {@code <big>} elements.
     *
     * @param args unused
     */
    public static void main(String[] args) throws InterruptedException, MalformedURLException {
        Document document = MyHtmlUnitHelper.accordingToURLGetBrowserHtml("https://xjczgl.zjxj.gov.cn/#/version");
        Elements big = document.getElementsByTag("big");
        String text = big.text();
        System.out.println(text);
    }
}

      <!-- jsoup: an HTML parser for Java -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- HtmlUnit: simulates a headless browser -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.60.0</version>
        </dependency>