Java WebClient 总结

/***

* 自定义的getpage,遇到验证码页面识别直至成功

**/

privateHtmlPage MyGetPage(StringBuffer URL) {

HtmlPage page= null;boolean flag = true;int TryTimeCnt = 1;int UnknowHostTryTimeCnt = 1;while(flag) {

flag= false;try{

logger.info(Thread.currentThread().getName()+ " webClient.getPage : " + URL + ",CrawlURL_id:"

+crawlURLId);

page=webClient.getPage(URL.toString());

Document doc=Jsoup.parse(page.asXml());int robotchecknum = 1;while (doc.select("title").text().equals("Robot Check")) {

logger.info(Thread.currentThread().getName()+ " " +dayformat1.format(System.currentTimeMillis())+ " [Robot Check,URL:" + URL + "]");

String captcha_str= AmazonGetCaptcha.GetCaptcha(newStringBuilder(doc.toString()));

logger.info(Thread.currentThread().getName()+ " " +dayformat1.format(System.currentTimeMillis())+ " end AmazonGetCaptcha.GetCaptcha");

logger.info(dayformat1.format(new Date()) + " " + Thread.currentThread().getName() + " : "

+captcha_str);

HtmlForm form= null;

logger.info(Thread.currentThread().getName()+ " page.getForms().get(0) Start");

form= page.getForms().get(0);

logger.info(Thread.currentThread().getName()+ " page.getForms().get(0) End");

HtmlButton button= null;

logger.info(Thread.currentThread().getName()+ " form.getElementsByTagName(button).get(0) Start");

button= (HtmlButton) form.getElementsByTagName("button").get(0);

logger.info(Thread.currentThread().getName()+ " form.getElementsByTagName(button).get(0) End");

logger.info(Thread.currentThread().getName()+ " setValueAttribute Start");

form.getInputByName("field-keywords").setValueAttribute(captcha_str);

logger.info(Thread.currentThread().getName()+ " setValueAttribute End");

logger.info(Thread.currentThread().getName()+ " button.click Start");boolean click_flag = false;while (!click_flag) {try{

click_flag= true;

page=button.click();

}catch(Exception e1) {

logger.error(Thread.currentThread().getName()+ " button.click出错了: " +e1);//e1.printStackTrace();

click_flag = false;

}

}

logger.info(Thread.currentThread().getName()+ " button.click end");while (page.asXml() == null) {

logger.info(Thread.currentThread().getName()+ " page xml null");

logger.info(Thread.currentThread().getName()+" "+page.asXml());

page.refresh();

logger.info(Thread.currentThread().getName()+ " refresh End!");

}

logger.info(Thread.currentThread().getName()+ " button.click End");

logger.info(Thread.currentThread().getName()+ " Start ParsePage!");

doc=Jsoup.parse(page.asXml());if (!doc.select("title").text().equals("Robot Check")) {

logger.info(Thread.currentThread().getName()+ " " + doc.select("title").text());

logger.info(Thread.currentThread().getName()+ " "

+ dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"

+ captcha_str + ",try num:" + robotchecknum + "]");

}

robotchecknum++;

}

}catch(FailingHttpStatusCodeException e) {

logger.error(Thread.currentThread().getName()+" "+e);

flag= true;

}catch(MalformedURLException e) {

logger.error(Thread.currentThread().getName()+" "+e);

flag= true;

}catch(UnknownHostException e) {

logger.error(Thread.currentThread().getName()+" "+e);

flag= true;

logger.info("found UnknownHostException,start sleep 20 min");try{

Thread.sleep(1000*60*Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));

}catch(InterruptedException e1) {

logger.error(Thread.currentThread().getName()+" "+e1);

}

logger.info("found UnknownHostException,end sleep 20 min");

UnknowHostTryTimeCnt++;//访问异常数加一

logger.info(Thread.currentThread().getName() + " " +dayformat1.format(System.currentTimeMillis())+ " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {return null;

}

}catch(Exception eq) {

logger.error(Thread.currentThread().getName()+ " "+eq);

TryTimeCnt++;//访问异常数加一

logger.info(Thread.currentThread().getName() + " " +dayformat1.format(System.currentTimeMillis())+ " [TryTimeCnt:" + TryTimeCnt + "]");if (TryTimeCnt > 5) {return null;

}try{

Thread.sleep(1000);

}catch(InterruptedException e) {

e.printStackTrace();

logger.error(Thread.currentThread().getName()+e);

}

flag= true;

}try{

Thread.sleep(random.nextInt(500) + 1500);

}catch(InterruptedException e) {

logger.error(Thread.currentThread().getName()+e);

flag= true;

}

}returnpage;

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值