HtmlUnit 模拟浏览器请求:Java 可以调用的内置浏览器

引入包:

<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.42.0</version>
        </dependency>

请求类:

package com.xxx;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ImmediateRefreshHandler;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.UnexpectedPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;

/**
 * Small wrapper around a single HtmlUnit {@link WebClient} that fetches HTML pages,
 * non-HTML text resources and arbitrary files.
 *
 * <p>The URL of the last page fetched through {@link #getHtml(String)} is remembered and
 * sent as the {@code Referer} header of every following request. {@link #getText(String)}
 * and {@link #download(String, String)} send the header but do not update it.
 *
 * <p>Call {@link #close()} (or use try-with-resources) when done to release the client.
 */
public class Gaher implements AutoCloseable {

    /** Timeout passed to the WebClient options, in milliseconds. */
    private static final int HTTP_TIMEOUT_MS = 600 * 1000;
    /** Maximum run time allowed for a script, in milliseconds. */
    private static final int JAVASCRIPT_TIMEOUT_MS = 600 * 1000;
    /** Wait passed to {@code waitForBackgroundJavaScript}, in milliseconds. */
    private static final int BACKGROUND_JS_WAIT_MS = 60 * 1000;

    private final WebClient webclient;
    /** URL of the last page loaded via {@link #getHtml(String)}; {@code null} until then. */
    private String referer;

    /**
     * Creates the client emulating Chrome, with JavaScript enabled, CSS disabled,
     * redirects followed, insecure SSL accepted, and no exceptions thrown for failing
     * HTTP status codes or script errors.
     */
    public Gaher() {
        // Uncomment to replace the commons-logging implementation with the no-op logger:
        //LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
        webclient = new WebClient(BrowserVersion.CHROME); // browser version to emulate
        webclient.getOptions().setTimeout(HTTP_TIMEOUT_MS);
        webclient.getOptions().setRedirectEnabled(true);
        webclient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webclient.getOptions().setUseInsecureSSL(true);
        webclient.getOptions().setJavaScriptEnabled(true); // enable JavaScript
        webclient.getOptions().setThrowExceptionOnScriptError(false); // do not throw on script errors
        webclient.getOptions().setCssEnabled(false); // do not load CSS files
        //webclient.getCookieManager().clearCookies();
        //webclient.getCache().clear();
        webclient.setJavaScriptTimeout(JAVASCRIPT_TIMEOUT_MS);
        // NOTE(review): this runs before any page has been loaded, so there is presumably
        // nothing to wait for here - confirm whether it was meant to follow getPage().
        webclient.waitForBackgroundJavaScript(BACKGROUND_JS_WAIT_MS);
        webclient.setAjaxController(new NicelyResynchronizingAjaxController());
        webclient.setRefreshHandler(new ImmediateRefreshHandler());
    }

    /**
     * Requests an HTML page and returns the content of its response as a string.
     * On success the URL becomes the {@code Referer} of subsequent requests.
     *
     * @param url address of the HTML page
     * @return the response content as a string
     * @throws IOException if the URL is malformed or the request fails
     */
    public String getHtml(String url) throws IOException {
        // The result is assigned to HtmlPage, so this method is for HTML responses only.
        HtmlPage page = webclient.getPage(newRequest(url));
        WebResponse response = page.getWebResponse();
        referer = url;
        return response.getContentAsString();
    }

    /**
     * Requests a non-HTML text resource (for example one served as
     * {@code text/plain; charset=gb2312}) and returns its content as a string.
     * The remembered {@code Referer} is sent but not changed.
     *
     * @param url address of the text resource
     * @return the response content as a string
     * @throws IOException if the URL is malformed or the request fails
     */
    public String getText(String url) throws IOException {
        // The result is assigned to TextPage, so this method is for text responses only.
        TextPage page = webclient.getPage(newRequest(url));
        WebResponse response = page.getWebResponse();
        return response.getContentAsString();
    }

    /**
     * Downloads a resource file such as an image or a video and writes it to disk.
     * The remembered {@code Referer} is sent but not changed.
     *
     * @param url address of the resource
     * @param filename path of the file to write; an existing file is overwritten
     * @throws IOException if the URL is malformed, the request fails or the file
     *         cannot be written
     */
    public void download(String url, String filename) throws IOException {
        UnexpectedPage page = webclient.getPage(newRequest(url));
        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream is = page.getWebResponse().getContentAsStream();
             FileOutputStream output = new FileOutputStream(filename)) {
            IOUtils.copy(is, output);
        }
    }

    /** Closes the underlying {@link WebClient}. */
    @Override
    public void close() {
        webclient.close();
    }

    /**
     * Builds the request for {@code url}, adding the {@code Referer} header when a
     * previous HTML page has been recorded.
     */
    private WebRequest newRequest(String url) throws IOException {
        WebRequest request = new WebRequest(new URL(url));
        if (referer != null) {
            request.setAdditionalHeader("Referer", referer);
        }
        return request;
    }
}

测试:

/**
 * Demo: fetches an HTML page, a text resource and two binary files with one
 * {@code Gaher} instance, then closes it.
 *
 * @param args unused
 * @throws IOException if any request or file write fails
 */
public static void main(String[] args) throws IOException {
    Gaher gaher = new Gaher();
    try {
        String html = gaher.getHtml("https://www.baidu.com/"); // fails if used to fetch a txt resource
        String text = gaher.getText("https://xxx.txt"); // fails if used to fetch an HTML page
        gaher.download("https://xxx.jpg", "E:\\temp\\tmp\\logs\\a.jpg");
        gaher.download("http://xxx.mp4", "E:\\temp\\tmp\\logs\\b.mp4");
    } finally {
        // Always release the client, including when one of the requests throws.
        gaher.close();
    }
}

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值