maven
<!-- 模拟浏览器请求 -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.46.0</version>
</dependency>
Ip.class
public class Ip {
protected int hash;
protected String supply;
protected String ip;
protected int port;
protected long expire = -1;
protected long errors = 0;
public Ip() {
super();
}
public Ip(String supply, String ip, int port, long expire, long errors) {
super();
this.supply = supply;
this.ip = ip;
this.port = port;
this.expire = expire;
this.errors = errors;
this.hash = (this.ip + this.port).hashCode();
}
public synchronized void error() {
this.errors += 1;
}
public String key() {
return this.ip + ":" + this.port;
}
public String getSupply() {
return supply;
}
public void setSupply(String supply) {
this.supply = supply;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
this.hash = (this.ip + this.port).hashCode();
}
public int getPort() {
return port;
}
public void setPort(int port) {
this.port = port;
this.hash = (this.ip + this.port).hashCode();
}
public long getExpire() {
return expire;
}
public void setExpire(long expire) {
this.expire = expire;
}
public long getErrors() {
return errors;
}
public void setErrors(long errors) {
this.errors = errors;
}
@Override
public Ip clone() {
return new Ip(supply, ip, port, expire, errors);
}
@Override
public int hashCode() {
return (this.ip + this.port).hashCode();
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof Ip)) {
return false;
}
Ip _ip = (Ip) obj;
return (this.ip + this.port).equals(_ip.ip + _ip.port);
}
@Override
public String toString() {
return this.ip + ":" + this.port;
}
}
HtmlUnitUtils
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.HtmlImage;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.NameValuePair;
import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.Logger;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import java.awt.Graphics;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Helper methods for simulating browser operations with HtmlUnit's {@link WebClient}:
 * creating configured clients, building and sending GET/POST requests, extracting page
 * content and reading images.
 */
public class HtmlUnitUtils {
    /** User-Agent header value attached to every request built by this class. */
    protected static final String User_Agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/62.0.3202.89 Safari/537.36";

    /**
     * Creates a new {@link WebClient} with this class's default options.
     *
     * @param ip proxy endpoint to route through, or {@code null} for a direct connection
     * @return the configured client; the caller is responsible for closing it
     */
    public static WebClient getConnection(Ip ip) {
        return getConnection(ip, null);
    }

    /**
     * Returns a {@link WebClient} that routes through the given proxy.
     *
     * @param ip        proxy endpoint to route through, or {@code null} to leave the proxy
     *                  settings untouched
     * @param webClient an existing client to reuse; its options are left as they are and
     *                  only the proxy is applied. When {@code null}, a new client with
     *                  this class's default options is created.
     * @return {@code webClient}, or the newly created client
     */
    public static WebClient getConnection(Ip ip, WebClient webClient) {
        if (webClient == null) {
            webClient = new WebClient();
            init(webClient);
        }
        if (ip != null) {
            setProxy(webClient, ip.getIp(), ip.getPort());
        }
        return webClient;
    }

    /**
     * Sends a GET request and returns the resulting page.
     *
     * @param webClient client used to perform the request
     * @param url       absolute URL to fetch
     * @return the loaded page
     * @throws Exception if the URL is malformed or the request fails
     */
    public static HtmlPage sendGetRequest(WebClient webClient, String url) throws Exception {
        return webClient.getPage(GetRequest(url));
    }

    /**
     * Builds (but does not send) a GET request carrying the default User-Agent header.
     *
     * @param url absolute URL to request
     * @return the prepared request
     * @throws Exception if the URL is malformed
     */
    public static WebRequest GetRequest(String url) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setAdditionalHeader("User-Agent", User_Agent);
        webRequest.setHttpMethod(HttpMethod.GET);
        return webRequest;
    }

    /**
     * Sends a POST request and returns the resulting page.
     *
     * @param webClient client used to perform the request
     * @param url       absolute URL to post to
     * @param params    request parameters, see {@link #PostRequest(String, Map)}; may be {@code null}
     * @return the loaded page
     * @throws Exception if the URL is malformed or the request fails
     */
    public static HtmlPage sendPostRequest(WebClient webClient, String url, Map<String, Object> params) throws Exception {
        return webClient.getPage(PostRequest(url, params));
    }

    /**
     * Builds (but does not send) a POST request carrying the default User-Agent header.
     *
     * <p>Each map entry becomes one request parameter whose value is
     * {@code String.valueOf(value)}. When the value is a {@link List}, one parameter per
     * element is emitted under the same key instead.
     *
     * @param url    absolute URL to post to
     * @param params request parameters; {@code null} or empty leaves the request without parameters
     * @return the prepared request
     * @throws Exception if the URL is malformed
     */
    public static WebRequest PostRequest(String url, Map<String, Object> params) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.POST);
        webRequest.setAdditionalHeader("User-Agent", User_Agent);
        if (params != null && !params.isEmpty()) {
            List<NameValuePair> pairs = new ArrayList<>();
            for (Map.Entry<String, Object> param : params.entrySet()) {
                String key = param.getKey();
                Object value = param.getValue();
                if (value instanceof List<?>) {
                    for (Object element : (List<?>) value) {
                        pairs.add(new NameValuePair(key, String.valueOf(element)));
                    }
                } else {
                    // Scalars only: a list was already expanded above and must not be
                    // sent a second time as its "[a, b]" string form.
                    pairs.add(new NameValuePair(key, String.valueOf(value)));
                }
            }
            webRequest.setRequestParameters(pairs);
        }
        return webRequest;
    }

    /**
     * Returns the page content as a string.
     *
     * @param page page to read
     * @return the content decoded as UTF-8, or {@code null} when
     *         {@link #getPageToByte(HtmlPage)} yields no content
     * @throws IOException if reading the response body fails
     */
    public static String getPageToString(HtmlPage page) throws IOException {
        byte[] content = getPageToByte(page);
        return content == null ? null : new String(content, StandardCharsets.UTF_8);
    }

    /**
     * Returns the page content as bytes and releases the underlying response.
     *
     * <p>For an HTML page the current DOM is serialized with {@code asXml()} and encoded
     * as UTF-8 (matching {@link #getPageToString(HtmlPage)}); otherwise the raw response
     * body is returned.
     *
     * @param page page to read
     * @return the content, or {@code null} when the response status is not 200
     * @throws IOException if reading the response body fails
     */
    public static byte[] getPageToByte(HtmlPage page) throws IOException {
        WebResponse webResponse = page.getWebResponse();
        try {
            if (webResponse.getStatusCode() != 200) {
                return null;
            }
            if (page.isHtmlPage()) {
                return page.asXml().getBytes(StandardCharsets.UTF_8);
            }
            try (InputStream bodyStream = webResponse.getContentAsStream()) {
                return IOUtils.toByteArray(bodyStream);
            }
        } finally {
            // Release the response's resources whether or not reading succeeded.
            webResponse.cleanUp();
        }
    }

    /**
     * Applies this class's default options to a freshly created client.
     */
    private static void init(WebClient webClient) {
        // 1. JavaScript execution is disabled.
        webClient.getOptions().setJavaScriptEnabled(false);
        // 2. Disable CSS, which avoids the extra requests made to fetch stylesheets.
        webClient.getOptions().setCssEnabled(false);
        // 3. Follow redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // 4. Do not throw when a script fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // 5. Timeout, in milliseconds.
        webClient.getOptions().setTimeout(60000);
        // 6. Skip SSL certificate validation.
        webClient.getOptions().setUseInsecureSSL(true);
        // 7. Ajax controller.
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        // 8. Enable cookies.
        webClient.getCookieManager().setCookiesEnabled(true);
        // Further options previously experimented with and left disabled:
        // webClient.waitForBackgroundJavaScript(600*1000);
        // webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        // webClient.getOptions().setActiveXNative(false);
        // webClient.getOptions().setMaxInMemory(1024*1024*1024);
        // webClient.getOptions().setHistoryPageCacheLimit(1024);
        // webClient.getOptions().setHistorySizeLimit(1000);
    }

    /**
     * Points the client at the given proxy and registers the proxy credentials.
     *
     * <p>The cast assumes the client still uses a {@link DefaultCredentialsProvider};
     * a client configured with a different provider fails here with a
     * {@link ClassCastException}.
     */
    private static void setProxy(WebClient webClient, String address, int port) {
        ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
        proxyConfig.setProxyHost(address);
        proxyConfig.setProxyPort(port);
        DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
                .getCredentialsProvider();
        // NOTE(review): credentials are hard-coded in source and registered without a
        // host/port scope; consider loading them from configuration instead.
        credentialsProvider.addCredentials("YsProxy", "YsProxy@0023");
    }

    /**
     * Reads a captcha image and returns it re-encoded as PNG bytes.
     *
     * @param ta     currently unused
     * @param image  image element to read
     * @param logger currently unused
     * @return the first frame of the image, PNG-encoded
     * @throws Exception if the image cannot be read or encoded
     */
    public static byte[] getCode(String ta, HtmlImage image, Logger logger) throws Exception {
        ImageReader imageReader = image.getImageReader();
        BufferedImage bufferedImage = imageReader.read(0);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ImageIO.write(bufferedImage, "png", out);
        return out.toByteArray();
    }

    /**
     * Scales an image to 160x60 and saves it as a PNG on the local disk.
     *
     * <p>The target directory is hard-coded to {@code C:\\users\\Desktop\\}.
     *
     * @param img  image element to read
     * @param name file name without extension
     * @throws IOException if the image cannot be read or written
     */
    public void saveImage(HtmlImage img, String name) throws IOException {
        ImageReader imageReader = img.getImageReader();
        BufferedImage bufferedImage = imageReader.read(0);
        BufferedImage inputbig = new BufferedImage(160, 60, BufferedImage.TYPE_INT_BGR);
        Graphics graphics = inputbig.getGraphics();
        try {
            graphics.drawImage(bufferedImage, 0, 0, 160, 60, null);
        } finally {
            // Graphics contexts hold native resources and must be released explicitly.
            graphics.dispose();
        }
        File file2 = new File("C:\\users\\Desktop\\" + name + ".png");
        ImageIO.write(inputbig, "png", file2);
    }
}
- 如需使用代理,new 一个 Ip 对象并设置 ip 和 port;不需要代理时,将 ip 保持为 null 传入即可
// Snippet: build a proxy descriptor only when proxying is enabled.
// A null Ip makes getConnection skip the proxy setup.
Ip ip = null;
if (proxyEnable) {
    Ip proxy = new Ip();
    proxy.setIp(proxyHostname);
    proxy.setPort(proxyPort);
    ip = proxy;
}
webClient = HtmlUnitUtils.getConnection(ip);
- 自定义WebClient
// Snippet
// Emulate a Chrome browser. try-with-resources closes the client when the snippet is done.
try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
    // Configure the client.
    webClient.setCssErrorHandler(new SilentCssErrorHandler());
    // Ajax controller
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    // Enable JavaScript
    webClient.getOptions().setJavaScriptEnabled(true);
    // Disable CSS
    webClient.getOptions().setCssEnabled(false);
    // Skip SSL certificate validation
    webClient.getOptions().setUseInsecureSSL(true);
    // Timeout, in milliseconds
    webClient.getOptions().setTimeout(50000);
    // Do not throw on script errors
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    // Follow redirects
    webClient.getOptions().setRedirectEnabled(true);
    // Enable cookies
    webClient.getCookieManager().setCookiesEnabled(true);

    // No proxy is wanted here, so pass null. An empty Ip would still run the proxy setup
    // (null host, port 0) and register the proxy credentials on this client.
    Ip ip = null;
    WebClient connection = HtmlUnitUtils.getConnection(ip, webClient);

    // Open the target URL.
    // HtmlPage page = connection.getPage("https://api.twitter.com/oauth/authorize?oauth_token=JrbVIAAAAAABASakAAABdsxFCaI");
    HtmlPage page = connection.getPage("https://botometer.osome.iu.edu/");
    // Optionally wait for background JavaScript to finish:
    // connection.waitForBackgroundJavaScript(10000*3);
    System.out.println(page.asText());
    System.out.println(page.asXml());

    String anchorXPath = "/html/body/nav/div/div[2]/ul/li[7]/ul/li/a";
    HtmlAnchor a = page.getFirstByXPath(anchorXPath);
    if (a == null) {
        // getFirstByXPath returns null when nothing matches, e.g. after a page layout change.
        throw new IllegalStateException("No anchor found for XPath " + anchorXPath);
    }
    System.out.println(a.asXml());
    HtmlPage click = a.click();
    webClient.waitForBackgroundJavaScript(2000);
    System.out.println("-------------------------------");
    System.out.println(click.asXml());
    System.out.println("-------------------------------");
    System.out.println(click.asText());
    System.out.println("-------------------------------");
    System.out.println(click.getUrl().toString());
}