一、爬取页面效果图
点击"百度一下"按钮前页面
点击"百度一下"按钮后页面
天涯社区登录页面
登录进去之后个人主页
二、具体实现代码
以下代码使用 HtmlUnit(其底层同样基于 HttpClient)和 jsoup API 实现:
package com.yuanhai.test; import java.io.IOException; import java.net.MalformedURLException; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.safety.Whitelist; import org.jsoup.select.Elements; import org.junit.Assert; import org.junit.Test; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlAnchor; import com.gargoylesoftware.htmlunit.html.HtmlButton; import com.gargoylesoftware.htmlunit.html.HtmlDivision; import com.gargoylesoftware.htmlunit.html.HtmlForm; import com.gargoylesoftware.htmlunit.html.HtmlInput; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput; import com.gargoylesoftware.htmlunit.html.HtmlTextInput; //参考博文 //1.http://blog.csdn.net/zstu_cc/article/details/39250903 //2.http://blog.csdn.net/cslie/article/details/48735261 public class HtmlUnitAndJsoup { /* * 首先说说HtmlUnit相对于HttpClient的最明显的一个好处, * 是HtmlUnit不仅保存了这个网页对象,更难能可贵的是它还存有这个网页的所有基本操作甚至事件。 * 现在很多网站使用大量ajax,普通爬虫无法获取js生成的内容。 */ /* * 依赖的jar包 commons-lang3-3.1.jar htmlunit-2.13.jar htmlunit-core-js-2.13.jar * httpclient-4.3.1.jar httpcore-4.3.jar httpmime-4.3.1.jar sac-1.3.jar * xml-apis-1.4.01.jar commons-collections-3.2.1.jar commo