利用开源插件html-unit
https://github.com/xautlx/nutch-htmlunit
把插件导入到 Nutch 环境中
但是在执行过程中会出现各种错误,原因是 lib-htmlunit 的 HttpWebClient 有问题,
因此对它作了如下修改:
package org.apache.nutch.protocol.htmlunit;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.AjaxController;
import com.gargoylesoftware.htmlunit.BrowserVersion;
/**
 * HtmlUnit {@link WebClient} helper.
 *
 * <p>One {@code WebClient} instance is kept per thread in a {@link ThreadLocal} so that
 * several fetcher threads can use this helper concurrently without sharing a client.
 * The first call made on a thread creates the client, logs in to the Baidu cloud drive
 * site and installs a {@code RegexHttpWebConnection}; later calls on the same thread
 * reuse that client.
 */
public class HttpWebClient {
    private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");

    /** Page that hosts the login form. */
    private static final String LOGIN_URL = "http://yun.baidu.com";
    /** DOM ids of the login form controls on {@link #LOGIN_URL}. */
    private static final String LOGIN_USER_FIELD_ID = "TANGRAM__PSP_4__userName";
    private static final String LOGIN_PASSWORD_FIELD_ID = "TANGRAM__PSP_4__password";
    private static final String LOGIN_SUBMIT_ID = "TANGRAM__PSP_4__submit";
    // NOTE(review): credentials are hard-coded in source; they should be moved to
    // configuration that is kept out of version control.
    private static final String LOGIN_USER = "280889189";
    private static final String LOGIN_PASSWORD = "123578951";

    /** Per-thread client; typed so that {@code get()} yields a {@code WebClient} without a cast. */
    private static final ThreadLocal<WebClient> threadWebClient = new ThreadLocal<WebClient>();

    /**
     * Loads {@code url} with the calling thread's {@code WebClient}, creating and
     * initialising that client on first use.
     *
     * @param url  address of the page to load
     * @param conf configuration handed to {@code RegexHttpWebConnection} when the client is
     *             created; the single-argument overload passes {@code null}
     * @return the page returned by {@code WebClient.getPage(url)}
     * @throws RuntimeException wrapping any exception raised while creating the client,
     *                          logging in or loading the page
     */
    public static HtmlPage getHtmlPage(String url, Configuration conf) {
        try {
            WebClient webClient = threadWebClient.get();
            if (webClient == null) {
                webClient = createWebClient(conf);
                // Cache only a fully initialised client.
                threadWebClient.set(webClient);
            }
            HtmlPage page = webClient.getPage(url);
            return page;
        } catch (Exception e) {
            throw new RuntimeException("Failed to fetch page: " + url, e);
        }
    }

    /**
     * Loads {@code url} without a configuration; equivalent to
     * {@code getHtmlPage(url, null)}.
     *
     * @param url address of the page to load
     * @return the loaded page
     */
    public static HtmlPage getHtmlPage(String url) {
        return getHtmlPage(url, null);
    }

    /** Builds, configures and logs in a new client; closes it again if any step fails. */
    private static WebClient createWebClient(Configuration conf) throws Exception {
        LOG.info("Initing web client for thread: {}", Thread.currentThread().getId());
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            AjaxController ajaxController = new NicelyResynchronizingAjaxController();
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(ajaxController);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setPrintContentOnFailingStatusCode(false);
            webClient.getOptions().setRedirectEnabled(true);
            webClient.getOptions().setPopupBlockerEnabled(true);
            webClient.setCache(new ExtHtmlunitCache());

            // The login runs before the custom connection below is installed,
            // preserving the original statement order.
            login(webClient);

            // Enhanced WebConnection based on urlfilter
            webClient.setWebConnection(new RegexHttpWebConnection(webClient, conf));
            return webClient;
        } catch (Exception e) {
            // Do not leak a half-initialised client and its open windows.
            webClient.closeAllWindows();
            throw e;
        }
    }

    /**
     * Submits the login form on {@link #LOGIN_URL}. The Baidu cloud drive is implemented
     * almost entirely with Ajax, so an account/password login is performed up front.
     */
    private static void login(WebClient webClient) throws Exception {
        HtmlPage loginPage = webClient.getPage(LOGIN_URL);
        loginPage.getElementById(LOGIN_USER_FIELD_ID).setAttribute("value", LOGIN_USER);
        loginPage.getElementById(LOGIN_PASSWORD_FIELD_ID).setAttribute("value", LOGIN_PASSWORD);
        ((HtmlInput) loginPage.getElementById(LOGIN_SUBMIT_ID)).click();
    }
}