HtmlUnit 是一款开源的 Java 页面分析工具,读取页面后,可以有效地分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。这个没有界面的浏览器,运行速度也非常快。
相关文件下载地址:
http://sourceforge.net/projects/htmlunit/files/
http://jaist.dl.sourceforge.net/project/htmlunit/htmlunit/2.15/htmlunit-2.15-bin.zip
我的需求是使用百度的高级新闻搜索,抓取指定站点的新闻。
手动搜索的设置如图所示:
package com.html580;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import com.gargoylesoftware.htmlunit.ElementNotFoundException;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlHiddenInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlRadioButtonInput;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
/**
 * Fetches Baidu search results by driving the advanced-search form with HtmlUnit.
 *
 * <p>The program loads the advanced-search page, fills in the form named
 * {@code f1}, submits it, and prints the resulting page as XML to standard output.
 *
 * @author html580
 * @site http://www.html580.com
 * @version 2014-7-23
 */
public class BaiduSpider {

    /**
     * Runs one advanced search restricted to a single site and prints the result page.
     *
     * <p>HTTP, URL, missing-element and I/O failures are reported via their stack
     * trace; the web client is always closed afterwards.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final WebClient webclient = new WebClient();
        try {
            final HtmlPage htmlpage = webclient.getPage("http://www.baidu.com/gaoji/advanced.html");

            // The search form and its submit button.
            final HtmlForm form = htmlpage.getFormByName("f1");
            final HtmlSubmitInput button = form.getInputByValue("百度一下");

            // Search keywords.
            final HtmlTextInput textField = form.getInputByName("q1");
            textField.setValueAttribute("HTML我帮您");

            // Number of results per page.
            final HtmlSelect resultsPerPage = form.getSelectByName("rn");
            resultsPerPage.setSelectedAttribute("10", true);

            // Page time filter. The original looked up "rn" a second time here,
            // which overwrote the results-per-page choice instead of setting the
            // time filter.
            final HtmlSelect pageTime = form.getSelectByName("lm");
            pageTime.setSelectedAttribute("0", true);

            // Language: pick the first option of the group.
            selectFirstRadioButton(form, "ct");

            // Document format: the empty value is submitted, as in the original.
            final HtmlSelect documentFormat = form.getSelectByName("ft");
            documentFormat.setSelectedAttribute("", true);

            // Keyword position: pick the first option of the group.
            selectFirstRadioButton(form, "q5");

            // Site search: restrict results to the given site.
            final HtmlTextInput siteField = form.getInputByName("q6");
            siteField.setValueAttribute("html580.com");

            // Hidden field submitted along with the form.
            final HtmlHiddenInput hiddenInputtn = form.getInputByName("tn");
            hiddenInputtn.setValueAttribute("baiduadv");

            final HtmlPage page2 = button.click();
            String result = page2.asXml();
            System.out.println(result);
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (ElementNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the client's windows even when the search fails part-way.
            webclient.closeAllWindows();
        }
    }

    /**
     * Checks the first radio button of the named group and unchecks all the others.
     *
     * @param form the form that contains the radio group
     * @param name the {@code name} attribute shared by the radio buttons
     * @throws IllegalStateException if the form has no radio button with that name
     */
    private static void selectFirstRadioButton(HtmlForm form, String name) {
        final List<HtmlRadioButtonInput> buttons = form.getRadioButtonsByName(name);
        if (buttons.isEmpty()) {
            throw new IllegalStateException("No radio buttons named '" + name + "' in the form");
        }
        for (int i = 0; i < buttons.size(); i++) {
            buttons.get(i).setChecked(i == 0);
        }
    }
}