HtmlUnit的使用

  HtmlUnit的功能比Jsoup要强大一些,它是一个没有界面的浏览器,可以模拟登录。

  还可以支持XPath。

  下面是一个模拟百度搜索的例子:

 

 

package com.swpu;

import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;


/**
 * Example: submit a search on Baidu with HtmlUnit and inspect the result
 * pages with XPath.
 *
 * <p>The program loads the Baidu home page, fills in the search form, clicks
 * the submit button, prints several XPath query results from the first result
 * page, then follows the first anchor with class {@code n} and prints more
 * XPath query results from the page that click returns.
 */
public class WorldBankCrawl {

	/** XPath of the grandparent of the anchor whose title is "万圣公主". */
	private static final String CARD_XPATH = "//a[@title='万圣公主']/../..";

	/**
	 * Runs the search example and prints the results to standard output.
	 *
	 * @param args ignored
	 * @throws FailingHttpStatusCodeException if a request returns a failing
	 *         HTTP status code
	 * @throws MalformedURLException if a requested URL is malformed
	 * @throws IOException if a network error occurs
	 */
	public static void main(String[] args)
			throws FailingHttpStatusCodeException, MalformedURLException,
			IOException {
		// try-with-resources closes the client (and its windows) even when a
		// lookup below throws.
		try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {
			webClient.getOptions().setCssEnabled(false);
			// The original author notes this setting is required for the example.
			webClient.getOptions().setJavaScriptEnabled(false);
			HtmlPage page = webClient
					.getPage("https://www.baidu.com/?tn=96010190_dg");

			// Fill in and submit the search form.
			final HtmlForm form = page.getFormByName("f");
			final HtmlSubmitInput submitInput = form.getInputByValue("百度一下");
			final HtmlTextInput input = form.getInputByName("wd");
			input.setValueAttribute("西游记");

			final HtmlPage nextPage = (HtmlPage) submitInput.click();

			// "//" matches descendants at any depth, so the exact position of
			// the <img> below the container does not have to be known.
			final java.util.List<?> images = nextPage
					.getByXPath(CARD_XPATH + "//img");
			for (Object image : images) {
				System.out.println(image);
			}
			System.out.println("src:\n");
			System.out.println(nextPage.getByXPath(CARD_XPATH + "//@src"));
			// Evaluate the title query once and reuse it for both printouts.
			final java.util.List<?> titles = nextPage
					.getByXPath(CARD_XPATH + "//@title");
			System.out.println(titles);
			System.out.println(titles.size());

			final java.util.List<?> table = nextPage
					.getByXPath("//table[@class='c-table opr-toplist-table']");
			System.out.println("test table:");
			System.out.println(table);

			final java.util.List<?> link = nextPage
					.getByXPath("//a[@class='n']//@href");
			System.out.println("link:" + link);

			// getFirstByXPath returns null instead of throwing when the page
			// has no matching anchor.
			final HtmlAnchor next2Anchor = nextPage
					.getFirstByXPath("//a[@class='n']");
			if (next2Anchor == null) {
				System.err.println("No anchor with class 'n' found; stopping.");
				return;
			}
			HtmlPage next2Page = next2Anchor.click();
			System.out.println("NEXT PAGE:\n");
			System.out.println(next2Page.asText());
			System.out.println("测试XPath函数:");
			System.out.println(next2Page
					.getByXPath("//span[@title='《西游记》人物']/text()"));
			System.out.println(next2Page
					.getByXPath("//span[@title='《西游记》人物']"));
			System.out.println(next2Page.getByXPath("//div[2]"));
			System.out.println("函数设置:");
			System.out.println(next2Page.getByXPath("//a[@class='n']"));
		}
	}
}


 

下面是登录新浪的例子:

package com.swpu;

import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;


/**
 * Example: log in to Sina through its SSO login page with HtmlUnit, then
 * fetch a Weibo page using the same client and print it as XML.
 */
public class ParseSinaBlog {

	/**
	 * Runs the login example. Any failure is reported on standard error
	 * rather than propagated.
	 *
	 * @param args ignored
	 * @throws FailingHttpStatusCodeException declared for compatibility; the
	 *         body catches all exceptions
	 * @throws MalformedURLException declared for compatibility; the body
	 *         catches all exceptions
	 * @throws IOException declared for compatibility; the body catches all
	 *         exceptions
	 */
	public static void main(String[] args)
			throws FailingHttpStatusCodeException, MalformedURLException,
			IOException {
		// try-with-resources closes the client (and its windows) on every
		// exit path, including failures.
		try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {

			// The original author notes JavaScript must be enabled here.
			webClient.getOptions().setJavaScriptEnabled(true);
			webClient.getOptions().setCssEnabled(false);
			webClient
					.setAjaxController(new NicelyResynchronizingAjaxController());
			webClient.getOptions().setThrowExceptionOnScriptError(false);

			final HtmlPage page = (HtmlPage) webClient
					.getPage("http://login.sina.com.cn/sso/login.php?"
							+ "client=ssologin.js(v1.3.16)");

			HtmlInput ln = page.getHtmlElementById("username");
			HtmlInput pwd = page.getHtmlElementById("password");
			HtmlInput btn = page.getFirstByXPath(".//*[@id='vForm']/"
					+ "div[3]/ul/li[6]/div[2]/input");
			// getFirstByXPath returns null when nothing matches; fail with a
			// clear message instead of a NullPointerException on click().
			if (btn == null) {
				System.err.println(
						"Login button not found on the login page; aborting.");
				return;
			}

			// Placeholders: replace with a real user name and password.
			ln.setAttribute("value", "你的用户名");
			pwd.setAttribute("value", "你的密码");

			// The page returned by the click is not needed; later requests
			// are made through the same client.
			btn.click();

			// After the login click, other pages can be fetched with the
			// same client.
			HtmlPage firstPage = webClient.getPage("http://weibo.com/"
					+ "u/2795493364/home?leftnav=1");
			System.out.println(firstPage.asXml());

		} catch (Exception ex) {
			// Report the exception type and stack as well: getMessage() alone
			// can be null and hides where the failure happened.
			System.err.println("Sina login example failed: " + ex);
			ex.printStackTrace();
		}

	}
}


不会的就用一个例子试试。

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值