引入包:
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.42.0</version>
</dependency>
请求类:
package com.xxx;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ImmediateRefreshHandler;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.UnexpectedPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;
/**
 * Small HtmlUnit-based fetcher that shares a single {@link WebClient} across requests.
 *
 * <p>The URL of the last page fetched through {@link #getHtml(String)} is remembered and sent
 * as the {@code Referer} header on every subsequent request made through this instance.
 *
 * <p>Instances hold a {@link WebClient} and must be released with {@link #close()}; the class
 * implements {@link AutoCloseable} so it can be used in a try-with-resources statement.
 */
public class Gaher implements AutoCloseable {
    private final WebClient webclient;

    /** URL of the last page fetched via {@link #getHtml(String)}; {@code null} until then. */
    private String referer;

    /** Creates a fetcher backed by a Chrome-emulating {@link WebClient}. */
    public Gaher() {
        // Uncomment to route commons-logging output to NoOpLog:
        //LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
        webclient = new WebClient(BrowserVersion.CHROME); // browser version to emulate
        webclient.getOptions().setTimeout(600 * 1000);
        webclient.getOptions().setRedirectEnabled(true);
        webclient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webclient.getOptions().setUseInsecureSSL(true);
        webclient.getOptions().setJavaScriptEnabled(true); // enable JavaScript
        webclient.getOptions().setThrowExceptionOnScriptError(false); // do not throw on script errors
        webclient.getOptions().setCssEnabled(false); // do not load CSS
        //webclient.getCookieManager().clearCookies();
        //webclient.getCache().clear();
        webclient.setJavaScriptTimeout(600 * 1000);
        // NOTE(review): no page has been loaded yet, so this call presumably has nothing to
        // wait for here; kept to preserve the original construction sequence - confirm intent.
        webclient.waitForBackgroundJavaScript(60 * 1000);
        webclient.setAjaxController(new NicelyResynchronizingAjaxController());
        webclient.setRefreshHandler(new ImmediateRefreshHandler());
    }

    /**
     * Requests an HTML page and returns the response body as a string.
     *
     * <p>On success the given URL becomes the {@code Referer} for later requests.
     *
     * @param url absolute URL of the HTML page
     * @return the response content as a string
     * @throws IOException if the URL is malformed or the request fails
     */
    public String getHtml(String url) throws IOException {
        HtmlPage page = webclient.getPage(newRequest(url));
        WebResponse response = page.getWebResponse();
        referer = url;
        return response.getContentAsString();
    }

    /**
     * Requests a non-HTML text resource (for example one served as
     * {@code text/plain; charset=gb2312}) and returns its content.
     *
     * <p>Does not change the remembered {@code Referer}.
     *
     * @param url absolute URL of the text resource
     * @return the response content as a string
     * @throws IOException if the URL is malformed or the request fails
     */
    public String getText(String url) throws IOException {
        TextPage page = webclient.getPage(newRequest(url));
        WebResponse response = page.getWebResponse();
        return response.getContentAsString();
    }

    /**
     * Downloads a resource such as an image or a video and writes it to a local file.
     *
     * <p>Does not change the remembered {@code Referer}.
     *
     * @param url absolute URL of the resource
     * @param filename path of the file to write
     * @throws IOException if the URL is malformed, the request fails, or the file cannot be written
     */
    public void download(String url, String filename) throws IOException {
        UnexpectedPage page = webclient.getPage(newRequest(url));
        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream is = page.getWebResponse().getContentAsStream();
                FileOutputStream output = new FileOutputStream(filename)) {
            IOUtils.copy(is, output);
        }
    }

    /** Closes the underlying {@link WebClient}. */
    @Override
    public void close() {
        webclient.close();
    }

    /** Builds a request for {@code url}, adding the remembered Referer header when one is set. */
    private WebRequest newRequest(String url) throws IOException {
        WebRequest request = new WebRequest(new URL(url));
        if (referer != null) {
            request.setAdditionalHeader("Referer", referer);
        }
        return request;
    }
}
测试:
/**
 * Demo: fetches one HTML page, one text resource and two binary files with a single
 * {@code Gaher}, then releases it.
 *
 * @param args unused
 * @throws IOException if any request or file write fails
 */
public static void main(String[] args) throws IOException {
    Gaher gaher = new Gaher();
    try {
        String html = gaher.getHtml("https://www.baidu.com/"); // fails if the URL serves plain text
        String text = gaher.getText("https://xxx.txt"); // fails if the URL serves HTML
        gaher.download("https://xxx.jpg", "E:\\temp\\tmp\\logs\\a.jpg");
        gaher.download("http://xxx.mp4", "E:\\temp\\tmp\\logs\\b.mp4");
    } finally {
        // Always release the underlying web client, even when a request throws.
        gaher.close();
    }
}