HtmlUnit 的功能比 Jsoup 更强大一些:它是一个没有界面的浏览器,可以模拟登录。
此外,它还支持 XPath。
下面是一个模拟百度搜索的例子:
package com.swpu;
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
/**
 * HtmlUnit demo: submits a search on Baidu and probes the result pages with XPath.
 *
 * <p>The program loads the Baidu home page, fills the search form with a fixed query,
 * submits it, prints the results of several XPath queries against the first result
 * page, then follows the first {@code a[@class='n']} link and prints the results of
 * further XPath queries against that page. Everything is written to standard output.
 */
public class WorldBankCrawl {

    /** XPath prefix shared by the queries scoped to the grandparent of the titled anchor. */
    private static final String TITLED_ANCHOR_SCOPE = "//a[@title='万圣公主']/../..";

    /** XPath selecting the anchors with class {@code n} on a result page. */
    private static final String NEXT_LINK_XPATH = "//a[@class='n']";

    /**
     * Runs the search demo.
     *
     * @param args ignored
     * @throws FailingHttpStatusCodeException propagated from HtmlUnit page loads
     * @throws MalformedURLException propagated from HtmlUnit page loads
     * @throws IOException propagated from HtmlUnit page loads and clicks
     * @throws IllegalStateException if the first result page has no {@code a[@class='n']} link
     */
    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException,
            IOException {
        // try-with-resources closes the client (and its windows) even when a step fails.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {
            webClient.getOptions().setCssEnabled(false);
            // The original author marked this setting as mandatory for this example.
            webClient.getOptions().setJavaScriptEnabled(false);

            final HtmlPage page = webClient
                    .getPage("https://www.baidu.com/?tn=96010190_dg");

            // Fill in and submit the search form.
            final HtmlForm form = page.getFormByName("f");
            final HtmlSubmitInput submitInput = form.getInputByValue("百度一下");
            final HtmlTextInput input = form.getInputByName("wd");
            input.setValueAttribute("西游记");
            final HtmlPage nextPage = submitInput.click();

            // Images located under the grandparent of the titled anchor.
            final java.util.List<?> images = nextPage
                    .getByXPath(TITLED_ANCHOR_SCOPE + "//img");
            for (Object image : images) {
                System.out.println(image);
            }

            System.out.println("src:\n");
            System.out.println(nextPage.getByXPath(TITLED_ANCHOR_SCOPE + "//@src"));

            // Evaluate the title query once and reuse it for both the list and its size.
            final java.util.List<?> titles = nextPage
                    .getByXPath(TITLED_ANCHOR_SCOPE + "//@title");
            System.out.println(titles);
            System.out.println(titles.size());

            final java.util.List<?> table = nextPage
                    .getByXPath("//table[@class='c-table opr-toplist-table']");
            System.out.println("test table:");
            System.out.println(table);

            final java.util.List<?> link = nextPage
                    .getByXPath(NEXT_LINK_XPATH + "//@href");
            System.out.println("link:" + link);

            // Guard the lookup: an empty result would otherwise surface as a bare
            // IndexOutOfBoundsException from get(0).
            final java.util.List<?> nextAnchors = nextPage.getByXPath(NEXT_LINK_XPATH);
            if (nextAnchors.isEmpty()) {
                throw new IllegalStateException(
                        "No element matching " + NEXT_LINK_XPATH
                                + " was found on the search result page");
            }
            final HtmlAnchor next2Anchor = (HtmlAnchor) nextAnchors.get(0);
            final HtmlPage next2Page = next2Anchor.click();

            System.out.println("NEXT PAGE:\n");
            System.out.println(next2Page.asText());

            System.out.println("测试XPath函数:");
            System.out.println(next2Page
                    .getByXPath("//span[@title='《西游记》人物']/text()"));
            System.out.println(next2Page.getByXPath("//span[@title='《西游记》人物']"));
            System.out.println(next2Page.getByXPath("//div[2]"));

            System.out.println("函数设置:");
            System.out.println(next2Page.getByXPath(NEXT_LINK_XPATH));
            // Original author's note: when no specific position/order is required,
            // the XPath must use "//".
        }
    }
}
下面是登录新浪的例子:
package com.swpu;
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * HtmlUnit demo: submits the Sina SSO login form and then dumps a Weibo page as XML.
 *
 * <p>The user name and password are placeholder string literals that must be replaced
 * before the example can log in.
 */
public class ParseSinaBlog {

    /**
     * Loads the Sina login page, fills in and submits the login form, then fetches a
     * Weibo home page with the same client and prints its XML to standard output.
     *
     * <p>Failures are no longer reduced to a one-line message on standard output; they
     * propagate to the caller so the exception type and stack trace are preserved.
     *
     * @param args ignored
     * @throws FailingHttpStatusCodeException propagated from HtmlUnit page loads
     * @throws MalformedURLException propagated from HtmlUnit page loads
     * @throws IOException propagated from HtmlUnit page loads and clicks
     * @throws IllegalStateException if the submit button cannot be located on the login page
     */
    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException,
            IOException {
        // try-with-resources closes the client (and its windows) even when a step fails.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {
            // The original author marked JavaScript as mandatory for this example.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient
                    .setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.getOptions().setThrowExceptionOnScriptError(false);

            final HtmlPage page = webClient
                    .getPage("http://login.sina.com.cn/sso/login.php?"
                            + "client=ssologin.js(v1.3.16)");

            final HtmlInput ln = page.getHtmlElementById("username");
            final HtmlInput pwd = page.getHtmlElementById("password");
            final String submitXPath = ".//*[@id='vForm']/"
                    + "div[3]/ul/li[6]/div[2]/input";
            final HtmlInput btn = page.getFirstByXPath(submitXPath);
            // getFirstByXPath yields null when nothing matches; fail with a clear message
            // instead of a NullPointerException on click().
            if (btn == null) {
                throw new IllegalStateException(
                        "Login submit button not found with XPath: " + submitXPath);
            }

            // Placeholders: replace with a real user name and password.
            ln.setAttribute("value", "你的用户名");
            pwd.setAttribute("value", "你的密码");
            btn.click();

            // Original author's note: once the login has been submitted, any page can be
            // crawled through this same client.
            final HtmlPage firstPage = webClient.getPage("http://weibo.com/"
                    + "u/2795493364/home?leftnav=1");
            System.out.println(firstPage.asXml());
        }
    }
}
遇到不明白的地方,就动手写一个例子试试。