Nutch爬取Ajax请求的动态网页

利用开源插件html-unit

https://github.com/xautlx/nutch-htmlunit

把插件导入到nutch环境中

但是在执行过程中,会出现各种错误。原因是lib-htmlunit的HttpWebClient有问题,

作了如下修改:

package org.apache.nutch.protocol.htmlunit;

import org.apache.hadoop.conf.Configuration;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.net.URL;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;

import com.gargoylesoftware.htmlunit.WebClient;

import com.gargoylesoftware.htmlunit.html.HtmlPage;

import com.gargoylesoftware.htmlunit.html.HtmlInput;

import com.gargoylesoftware.htmlunit.WebRequest;

import com.gargoylesoftware.htmlunit.AjaxController;

import com.gargoylesoftware.htmlunit.BrowserVersion;

/**
 * HtmlUnit {@link WebClient} helper.
 *
 * <p>Keeps one {@code WebClient} instance per thread in a {@link ThreadLocal} so that
 * several fetcher threads can call {@link #getHtmlPage(String, Configuration)} without
 * sharing a client. The client for a thread is created lazily on that thread's first call.
 */
public class HttpWebClient {

    private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");

    /** Page that hosts the login form submitted once per newly created client. */
    private static final String LOGIN_URL = "http://yun.baidu.com";

    // NOTE(review): account credentials are hard-coded in source. The values are kept
    // unchanged to preserve behavior, but they should be moved to external configuration.
    private static final String LOGIN_USER_NAME = "280889189";
    private static final String LOGIN_PASSWORD = "123578951";

    /**
     * Per-thread client cache. Typed as {@code ThreadLocal<WebClient>} so that
     * {@code get()} yields a {@code WebClient} without a cast (a raw {@code ThreadLocal}
     * returns {@code Object}, which cannot be assigned to a {@code WebClient} variable).
     */
    private static final ThreadLocal<WebClient> threadWebClient = new ThreadLocal<WebClient>();

    /**
     * Loads {@code url} with the calling thread's {@code WebClient}, creating and
     * logging in that client first if the thread does not have one yet.
     *
     * @param url  address of the page to load
     * @param conf configuration handed to {@code RegexHttpWebConnection} when a new client
     *             is created; the single-argument overload passes {@code null}
     * @return the page returned by {@code WebClient.getPage(url)}
     * @throws RuntimeException wrapping any exception raised while creating the client
     *                          or loading the page
     */
    public static HtmlPage getHtmlPage(String url, Configuration conf) {
        try {
            WebClient webClient = threadWebClient.get();
            if (webClient == null) {
                webClient = createWebClient(conf);
                // Cache only a fully initialised client.
                threadWebClient.set(webClient);
            }
            HtmlPage page = webClient.getPage(url);
            return page;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Convenience overload that calls {@link #getHtmlPage(String, Configuration)} with a
     * {@code null} configuration.
     *
     * @param url address of the page to load
     * @return the loaded page
     */
    public static HtmlPage getHtmlPage(String url) {
        return getHtmlPage(url, null);
    }

    /**
     * Builds and configures a new client, performs the login, then installs the
     * url-filter based web connection.
     *
     * <p>If any step after construction fails, the client's windows are closed before the
     * exception is rethrown, so a failed initialisation does not leave an unreferenced
     * client behind on every retry.
     */
    private static WebClient createWebClient(Configuration conf) throws Exception {
        LOG.info("Initing web client for thread: {}", Thread.currentThread().getId());
        AjaxController ajaxController = new NicelyResynchronizingAjaxController();
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(ajaxController);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setPrintContentOnFailingStatusCode(false);
            webClient.getOptions().setRedirectEnabled(true);
            webClient.getOptions().setPopupBlockerEnabled(true);
            webClient.setCache(new ExtHtmlunitCache());

            // The login runs before the connection is replaced, exactly as in the
            // original statement order, so it goes through the client's default connection.
            login(webClient);

            // Enhanced WebConnection based on urlfilter
            webClient.setWebConnection(new RegexHttpWebConnection(webClient, conf));
            return webClient;
        } catch (Exception e) {
            webClient.closeAllWindows();
            throw e;
        }
    }

    /**
     * Fills in and submits the login form on {@link #LOGIN_URL}.
     *
     * <p>Baidu cloud drive is implemented almost entirely with Ajax, so the crawler signs
     * in with an account name and password before fetching pages.
     */
    private static void login(WebClient webClient) throws Exception {
        HtmlPage loginPage = webClient.getPage(LOGIN_URL);
        loginPage.getElementById("TANGRAM__PSP_4__userName").setAttribute("value", LOGIN_USER_NAME);
        loginPage.getElementById("TANGRAM__PSP_4__password").setAttribute("value", LOGIN_PASSWORD);
        // The page returned by the click was never read by the original code; it is discarded.
        ((HtmlInput) loginPage.getElementById("TANGRAM__PSP_4__submit")).click();
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值