/***
* 自定义的getpage,遇到验证码页面识别直至成功
**/
privateHtmlPage MyGetPage(StringBuffer URL) {
HtmlPage page= null;boolean flag = true;int TryTimeCnt = 1;int UnknowHostTryTimeCnt = 1;while(flag) {
flag= false;try{
logger.info(Thread.currentThread().getName()+ " webClient.getPage : " + URL + ",CrawlURL_id:"
+crawlURLId);
page=webClient.getPage(URL.toString());
Document doc=Jsoup.parse(page.asXml());int robotchecknum = 1;while (doc.select("title").text().equals("Robot Check")) {
logger.info(Thread.currentThread().getName()+ " " +dayformat1.format(System.currentTimeMillis())+ " [Robot Check,URL:" + URL + "]");
String captcha_str= AmazonGetCaptcha.GetCaptcha(newStringBuilder(doc.toString()));
logger.info(Thread.currentThread().getName()+ " " +dayformat1.format(System.currentTimeMillis())+ " end AmazonGetCaptcha.GetCaptcha");
logger.info(dayformat1.format(new Date()) + " " + Thread.currentThread().getName() + " : "
+captcha_str);
HtmlForm form= null;
logger.info(Thread.currentThread().getName()+ " page.getForms().get(0) Start");
form= page.getForms().get(0);
logger.info(Thread.currentThread().getName()+ " page.getForms().get(0) End");
HtmlButton button= null;
logger.info(Thread.currentThread().getName()+ " form.getElementsByTagName(button).get(0) Start");
button= (HtmlButton) form.getElementsByTagName("button").get(0);
logger.info(Thread.currentThread().getName()+ " form.getElementsByTagName(button).get(0) End");
logger.info(Thread.currentThread().getName()+ " setValueAttribute Start");
form.getInputByName("field-keywords").setValueAttribute(captcha_str);
logger.info(Thread.currentThread().getName()+ " setValueAttribute End");
logger.info(Thread.currentThread().getName()+ " button.click Start");boolean click_flag = false;while (!click_flag) {try{
click_flag= true;
page=button.click();
}catch(Exception e1) {
logger.error(Thread.currentThread().getName()+ " button.click出错了: " +e1);//e1.printStackTrace();
click_flag = false;
}
}
logger.info(Thread.currentThread().getName()+ " button.click end");while (page.asXml() == null) {
logger.info(Thread.currentThread().getName()+ " page xml null");
logger.info(Thread.currentThread().getName()+" "+page.asXml());
page.refresh();
logger.info(Thread.currentThread().getName()+ " refresh End!");
}
logger.info(Thread.currentThread().getName()+ " button.click End");
logger.info(Thread.currentThread().getName()+ " Start ParsePage!");
doc=Jsoup.parse(page.asXml());if (!doc.select("title").text().equals("Robot Check")) {
logger.info(Thread.currentThread().getName()+ " " + doc.select("title").text());
logger.info(Thread.currentThread().getName()+ " "
+ dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
+ captcha_str + ",try num:" + robotchecknum + "]");
}
robotchecknum++;
}
}catch(FailingHttpStatusCodeException e) {
logger.error(Thread.currentThread().getName()+" "+e);
flag= true;
}catch(MalformedURLException e) {
logger.error(Thread.currentThread().getName()+" "+e);
flag= true;
}catch(UnknownHostException e) {
logger.error(Thread.currentThread().getName()+" "+e);
flag= true;
logger.info("found UnknownHostException,start sleep 20 min");try{
Thread.sleep(1000*60*Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));
}catch(InterruptedException e1) {
logger.error(Thread.currentThread().getName()+" "+e1);
}
logger.info("found UnknownHostException,end sleep 20 min");
UnknowHostTryTimeCnt++;//访问异常数加一
logger.info(Thread.currentThread().getName() + " " +dayformat1.format(System.currentTimeMillis())+ " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {return null;
}
}catch(Exception eq) {
logger.error(Thread.currentThread().getName()+ " "+eq);
TryTimeCnt++;//访问异常数加一
logger.info(Thread.currentThread().getName() + " " +dayformat1.format(System.currentTimeMillis())+ " [TryTimeCnt:" + TryTimeCnt + "]");if (TryTimeCnt > 5) {return null;
}try{
Thread.sleep(1000);
}catch(InterruptedException e) {
e.printStackTrace();
logger.error(Thread.currentThread().getName()+e);
}
flag= true;
}try{
Thread.sleep(random.nextInt(500) + 1500);
}catch(InterruptedException e) {
logger.error(Thread.currentThread().getName()+e);
flag= true;
}
}returnpage;
}