HtmlUnit 2.14:获取动态网页经JS处理后的网页内容
编写不易,转载请注明(http://shihlei.iteye.com/blog/2067707)!
一 概述
HttpClient适合处理静态资源,网络爬虫等类似应用很大程度需要处理动态网页(内容由js填充,如百度图片,body里基本没有数据,碰到最麻烦的是新浪微博列表页)。将网页下载后,结合JS和Dom模型还原网页,我目前还未攻破,但在下载层还原网页,HtmlUnit是一种解决方案,虽然对JS的支持还是不完美。
HtmlUnit其实是自动化测试工具,集成了下载(HttpClient),Dom(NekoHtml),驱动JS(Rhino)。有一定的网页渲染能力,由于会驱动Dom,会消耗些CPU,内存。
本文描述HTMLUnit请求响应,设置cookies,设置代理,驱动JS等方法。
二 版本
Xml代码
- <dependency>
- <groupId>net.sourceforge.htmlunit</groupId>
- <artifactId>htmlunit</artifactId>
- <version>2.14</version>
- </dependency>
三 典型功能
1) 打开google搜索百度
Java代码
- /**
- * 打开google 搜索百度
- *
- * @param args
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- String url = "http://www.google.com.hk";
- final WebClient webClient = new WebClient();
- HtmlPage htmlPage = webClient.getPage(url);
- // HtmlUnit dom模型
- // 获取表单 ,获得form标签name属性=f
- HtmlForm form = htmlPage.getFormByName("f");
- // 获取输入框, 获取 input标签 ,name属性=q
- HtmlTextInput text = form.getInputByName("q");
- // 搜索百度
- text.setText("baidu");
- // 获取提交按钮
- HtmlSubmitInput button = form.getInputByName("btnG");
- // 提交表单
- HtmlPage listPage = button.click();
- System.out.println(listPage.asXml());
- webClient.closeAllWindows();
- }
2)获取动态页面
Java代码
- /**
- * 获取百度图片js后的内容
- *
- * @throws Exception
- */
- public void demo2() throws Exception {
- String url = "http://image.baidu.com/i?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1400328281672_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=html";
- final WebClient webClient = new WebClient();
- // 1 启动JS
- webClient.getOptions().setJavaScriptEnabled(true);
- // 2 禁用Css,可避免自动二次请求CSS进行渲染
- webClient.getOptions().setCssEnabled(false);
- // 3 启动客户端重定向
- webClient.getOptions().setRedirectEnabled(true);
- // 4 js运行错误时,是否抛出异常
- webClient.getOptions().setThrowExceptionOnScriptError(false);
- // 5 设置超时
- webClient.getOptions().setTimeout(50000);
- HtmlPage htmlPage = webClient.getPage(url);
- // 等待JS驱动dom完成获得还原后的网页
- webClient.waitForBackgroundJavaScript(10000);
- // 网页内容
- System.out.println(htmlPage.asXml());
- webClient.closeAllWindows();
- }
四 样例
(1)请求响应
Java代码
- /**
- * Get请求
- * @param url
- * @return
- * @throws Exception
- */
- public static byte[] sendGetRequest(String url) throws Exception{
- WebClient webClient = new WebClient();
- WebRequest webRequest = new WebRequest(new URL(url));
- webRequest.setHttpMethod(HttpMethod.GET);
- return sendRequest(webClient,webRequest);
- }
- /**
- * Post 请求
- *
- * @param url
- * @param params
- * @return
- * @throws Exception
- */
- public static byte[] sendPostRequest(String url,Map<String,String> params) throws Exception{
- WebClient webClient = new WebClient();
- WebRequest webRequest = new WebRequest(new URL(url));
- webRequest.setHttpMethod(HttpMethod.POST);
- if (params != null && params.size() > 0) {
- for (Entry<String, String> param : params.entrySet()) {
- webRequest.getRequestParameters().add(new NameValuePair(param.getKey(), param.getValue()));
- }
- }
- return sendRequest(webClient,webRequest);
- }
- //底层请求
- private static byte[] sendRequest(WebClient webClient,WebRequest webRequest) throws Exception{
- byte[] responseContent = null;
- Page page = webClient.getPage(webRequest);
- WebResponse webResponse = page.getWebResponse();
- int status = webResponse.getStatusCode();
- System.out.println("Charset : " + webResponse.getContentCharset());
- System.out.println("ContentType : " + webResponse.getContentType());
- // 读取数据内容
- if (status==200) {
- if (page.isHtmlPage()) {
- <strong>// 等待JS执行完成,包括远程JS文件请求,Dom处理
- webClient.waitForBackgroundJavaScript(10000);</strong>
- <strong> // 使用JS还原网页
- responseContent = ((HtmlPage) page).asXml().getBytes();</strong>
- } else {
- InputStream bodyStream = webResponse.getContentAsStream();
- responseContent = ByteStreams.toByteArray(bodyStream);
- bodyStream.close();
- }
- }
- // 关闭响应流
- webResponse.cleanUp();
- return responseContent;
- }
(2)配置JS,CSS,超时,重定向
Java代码
- private void configWebClient(WebClient webClient) {
- // 设置webClient的相关参数
- // 1 启动JS
- webClient.getOptions().setJavaScriptEnabled(true);
- // 2 禁用Css,可避免自动二次请求CSS进行渲染
- webClient.getOptions().setCssEnabled(false);
- // 3 启动客户端重定向
- webClient.getOptions().setRedirectEnabled(true);
- // 4 js运行错误时,是否抛出异常
- webClient.getOptions().setThrowExceptionOnScriptError(false);
- // 5 设置超时
- webClient.getOptions().setTimeout(timeout);
- }
(3)代理
Java代码
- private void setProxy(WebClient webClient,HttpProxy proxy) {
- ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
- proxyConfig.setProxyHost(proxy.getHost());
- proxyConfig.setProxyPort(proxy.getPort());
- DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
- .getCredentialsProvider();
- credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
- }
辅助类:
Java代码
- package x.http.core;
/**
 * Mutable holder for HTTP proxy settings: host, port and optional
 * user name / password.
 *
 * @author shilei
 */
public class HttpProxy {

    // Defaults to "http"; presumably the proxy scheme/type - confirm with callers.
    private String proxy = "http";
    private String host;
    private int port;
    private String user;
    private String password;

    /** @return the proxy value, {@code "http"} unless changed */
    public String getProxy() {
        return this.proxy;
    }

    /** @param proxy the new proxy value */
    public void setProxy(String proxy) {
        this.proxy = proxy;
    }

    /** @return the proxy host, or {@code null} if never set */
    public String getHost() {
        return this.host;
    }

    /** @param host the proxy host */
    public void setHost(String host) {
        this.host = host;
    }

    /** @return the proxy port, {@code 0} if never set */
    public int getPort() {
        return this.port;
    }

    /** @param port the proxy port */
    public void setPort(int port) {
        this.port = port;
    }

    /** @return the user name, or {@code null} if never set */
    public String getUser() {
        return this.user;
    }

    /** @param user the user name */
    public void setUser(String user) {
        this.user = user;
    }

    /** @return the password, or {@code null} if never set */
    public String getPassword() {
        return this.password;
    }

    /** @param password the password */
    public void setPassword(String password) {
        this.password = password;
    }
}
(4)Cookies:可以用于认证数据设置
1)设置Cookies
Java代码
- private void setCookies(WebClient webClient,String domain, Map<String, String> cookies) {
- if (cookies != null && cookies.size() > 0) {
- webClient.getCookieManager().setCookiesEnabled(true);// enable
- // cookies
- for (Entry<String, String> c : cookies.entrySet()) {
- Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
- webClient.getCookieManager().addCookie(cookie);
- }
- }
- }
2)获取响应Cookies
Java代码
- private Map<String, String> getResponseCookies(WebClient webClient) {
- Set<Cookie> cookies = webClient.getCookieManager().getCookies();
- Map<String, String> responseCookies = Maps.newHashMap();
- for (Cookie c : cookies) {
- responseCookies.put(c.getName(), c.getValue());
- }
- return responseCookies;
- }
3)删除所有Cookies
Java代码
- /**
- * 清除所有cookie
- */
- public void clearCookies(WebClient webClient) {
- webClient.getCookieManager().clearCookies();
- }
(5)驱动JS:
可实现自动化流程,如驱动表单提交,获取表单提交后的页面
如登录后页面:
Java代码
- public void doWeb(Page page) {
- if (page instanceof HtmlPage) {
- StringBuilder js = new StringBuilder();
- js.append("document.getElementsByName('username')[1].value='").append(WeiboAccount.USERNAME)
- .append("';");
- js.append("document.getElementsByName('password')[1].value='").append(WeiboAccount.PASSWORD)
- .append("';");
- js.append("document.getElementsByClassName('W_btn_g')[1].click();");
- HtmlPage htmlPage = (HtmlPage) page;
- htmlPage.executeJavaScript(js.toString());
- }
- }
附录:完整代码
Java代码
- package x.http.simple.htmlunit;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.gargoylesoftware.htmlunit.util.Cookie;
import com.gargoylesoftware.htmlunit.util.NameValuePair;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;

import x.http.core.HttpProxy;
- public class HtmlUnitDemo {
- private WebClient webClient = null;
- private int timeout = 50000;
- public HtmlUnitDemo() {
- this(null);
- }
- /**
- * Get请求
- *
- * @param url
- * @return
- * @throws Exception
- */
- public byte[] sendGetRequest(String url) throws Exception {
- WebRequest webRequest = new WebRequest(new URL(url));
- webRequest.setHttpMethod(HttpMethod.GET);
- return sendRequest(webRequest);
- }
- /**
- * Post 请求
- *
- * @param url
- * @param params
- * @return
- * @throws Exception
- */
- public byte[] sendPostRequest(String url, Map<String, String> params) throws Exception {
- WebRequest webRequest = new WebRequest(new URL(url));
- webRequest.setHttpMethod(HttpMethod.POST);
- if (params != null && params.size() > 0) {
- for (Entry<String, String> param : params.entrySet()) {
- webRequest.getRequestParameters().add(new NameValuePair(param.getKey(), param.getValue()));
- }
- }
- return sendRequest(webRequest);
- }
- // 底层请求
- private byte[] sendRequest(WebRequest webRequest) throws Exception {
- byte[] responseContent = null;
- Page page = webClient.getPage(webRequest);
- WebResponse webResponse = page.getWebResponse();
- int status = webResponse.getStatusCode();
- System.out.println("Charset : " + webResponse.getContentCharset());
- System.out.println("ContentType : " + webResponse.getContentType());
- // 读取数据内容
- if (status == 200) {
- if (page.isHtmlPage()) {
- // 等待JS执行完成
- webClient.waitForBackgroundJavaScript(100000);
- responseContent = ((HtmlPage) page).asXml().getBytes();
- } else {
- InputStream bodyStream = webResponse.getContentAsStream();
- responseContent = ByteStreams.toByteArray(bodyStream);
- bodyStream.close();
- }
- }
- // 关闭响应流
- webResponse.cleanUp();
- return responseContent;
- }
- public HtmlUnitDemo(HttpProxy proxy) {
- webClient = new WebClient();
- configWebClient();
- // 设置代理
- if (proxy != null) {
- setProxy(proxy);
- }
- }
- private void configWebClient() {
- // 设置webClient的相关参数
- // 1 启动JS
- webClient.getOptions().setJavaScriptEnabled(true);
- // 2 禁用Css,可避免自动二次请求CSS进行渲染
- webClient.getOptions().setCssEnabled(false);
- // 3 启动客户端重定向
- webClient.getOptions().setRedirectEnabled(true);
- // 4 js运行错误时,是否抛出异常
- webClient.getOptions().setThrowExceptionOnScriptError(false);
- // 5 设置超时
- webClient.getOptions().setTimeout(timeout);
- }
- private void setProxy(HttpProxy proxy) {
- ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
- proxyConfig.setProxyHost(proxy.getHost());
- proxyConfig.setProxyPort(proxy.getPort());
- DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
- .getCredentialsProvider();
- credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
- }
- @SuppressWarnings("unused")
- private Map<String, String> getResponseCookies() {
- Set<Cookie> cookies = webClient.getCookieManager().getCookies();
- Map<String, String> responseCookies = Maps.newHashMap();
- for (Cookie c : cookies) {
- responseCookies.put(c.getName(), c.getValue());
- }
- return responseCookies;
- }
- @SuppressWarnings("unused")
- private void setCookies(String domain, Map<String, String> cookies) {
- if (cookies != null && cookies.size() > 0) {
- webClient.getCookieManager().setCookiesEnabled(true);// enable
- // cookies
- for (Entry<String, String> c : cookies.entrySet()) {
- Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
- webClient.getCookieManager().addCookie(cookie);
- System.out.println("Set Cookies : " + c.getKey() + " | " + c.getValue());
- }
- }
- }
- /**
- * 清除所有cookie
- */
- public void clearCookies() {
- webClient.getCookieManager().clearCookies();
- }
- public void shutdown() throws IOException {
- webClient.closeAllWindows();
- }
- /**
- * 打开google 搜索百度
- *
- * @param args
- * @throws Exception
- */
- public void demo() throws Exception{
- String url = "http://www.google.com.hk";
- final WebClient webClient = new WebClient();
- HtmlPage htmlPage = webClient.getPage(url);
- // HtmlUnit dom模型
- // 获取表单 ,获得form标签name属性=f
- HtmlForm form = htmlPage.getFormByName("f");
- // 获取输入框, 获取 input标签 ,name属性=q
- HtmlTextInput text = form.getInputByName("q");
- // 搜索百度
- text.setText("baidu");
- // 获取提交按钮
- HtmlSubmitInput button = form.getInputByName("btnG");
- // 提交表单
- HtmlPage listPage = button.click();
- System.out.println(listPage.asXml());
- webClient.closeAllWindows();
- }
- /**
- * 打开google 搜索百度
- *
- * @param args
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- String url = "http://www.google.com.hk";
- HtmlUnitDemo htmlUnit = new HtmlUnitDemo();
- byte[] getResponse = htmlUnit.sendGetRequest(url);
- System.out.println("Get Body : " + new String(getResponse, "utf-8"));
- byte[] postResponse = htmlUnit.sendPostRequest(url, null);
- System.out.println("Get Body : " + new String(postResponse, "utf-8"));
- htmlUnit.shutdown();
- }
- }