使用watij和xpath实现自动spider(完善中)

Watij 本来是用于 Web 自动化测试的工具,但我发现用它来做垂直爬虫,效果也很好。
以下代码依次抓取了三个网站:去哪儿(hotel.qunar.com)、携程旅行网和艺龙旅行网。

package com.example.tests;

import watij.runtime.ie.IE;
import watij.finders.AttributeFinder;
import watij.finders.Finder;
import watij.finders.NameFinder;
import watij.finders.XPathFinder;
import watij.finders.FinderFactory.*;
import watij.elements.*;


/**
 * Demo crawler that drives Internet Explorer through Watij.
 *
 * <p>Flow, as implemented in {@link #main(String[])}:
 * <ol>
 *   <li>open {@code http://hotel.qunar.com}, tick the {@code hchkParaSeachElong} checkbox and
 *       press the {@code hbtnSearch} button;</li>
 *   <li>click the first link whose {@code target} is {@code _blank} and attach to the child
 *       browser that opens;</li>
 *   <li>read the text of the {@code detailInfoLinks} div, split it on {@code ')'} and, for each
 *       fragment mentioning Ctrip or eLong, follow the booking link into another child browser;</li>
 *   <li>print text scraped from the eLong and Ctrip pages.</li>
 * </ol>
 *
 * <p>All selectors are tied to the page markup at the time of writing and are not validated.
 */
public class WatijHotel {

    /** Site name searched for in the detail-page link text (Ctrip). */
    private static final String CTRIP = "携程旅行网";

    /** Site name searched for in the detail-page link text (eLong). */
    private static final String ELONG = "艺龙旅行网";

    /**
     * Runs the crawl once and prints the scraped text to standard output.
     *
     * <p>Failures are reported with {@code printStackTrace()} and never propagate; every browser
     * window that was opened is closed in the {@code finally} block.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        IE ie = new IE();
        IE detailPage = null;
        IE ctripPage = null;
        IE elongPage = null;
        try {
            ie.start("http://hotel.qunar.com");
            ie.checkbox(new AttributeFinder("id", "hchkParaSeachElong")).click();
            ie.button(new AttributeFinder("id", "hbtnSearch")).click();

            // The first target="_blank" link opens the hotel detail page in a child browser.
            Links resultLinks = ie.links(new AttributeFinder("target", "_blank"));
            resultLinks.link(0).click();
            detailPage = ie.childBrowser();
            detailPage.waitUntilReady(1000);

            String text = detailPage.div(new AttributeFinder("class", "detailInfoLinks")).text();
            System.out.println(text);
            // Splitting on ')' drops the parenthesis; it is appended again to rebuild link text.
            String[] fragments = text.split("\\)");
            System.out.println(detailPage.childBrowserCount());

            for (String fragment : fragments) {
                if (fragment.indexOf(CTRIP) >= 0) {
                    try {
                        ctripPage = openBookingSite(detailPage, fragment + ")");
                        System.out.println(detailPage.childBrowserCount());
                    } catch (Exception e) {
                        // Best effort: one failing site must not abort the other.
                        e.printStackTrace();
                    }
                } else if (fragment.indexOf(ELONG) >= 0) {
                    try {
                        elongPage = openBookingSite(detailPage, fragment + ")");
                    } catch (Exception e) {
                        // Best effort: one failing site must not abort the other.
                        e.printStackTrace();
                    }
                }
            }

            if (elongPage != null) {
                elongPage.waitUntilReady(10000);
                // NOTE(review): second, much shorter wait kept from the original; the unit of
                // the argument is not visible here, so it is not merged with the call above.
                elongPage.waitUntilReady(20);
                elongPage.div(new AttributeFinder("class", "taL left10_dbk2")).link(0).click();
                elongPage.waitUntilReady(10000);
                System.out.println(elongPage.text());
            }
            if (ctripPage != null) {
                ctripPage.waitUntilReady(10000);
                String ctripPrice =
                        ctripPage.table(new AttributeFinder("class", "pubGlobal_romList01")).text();
                System.out.println(ctripPrice);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Each window is closed independently so that a failure while closing one of them
            // cannot leave the remaining windows open.
            closeQuietly(ie);
            closeQuietly(detailPage);
            closeQuietly(elongPage);
            closeQuietly(ctripPage);
        }
    }

    /**
     * Follows one booking-site entry on the detail page and returns the browser it opens.
     *
     * <p>Clicks the {@code 预订网站} link, then the link identified by {@code siteLinkText},
     * prints the text of the {@code bookingTable} table, clicks that table's first link and
     * returns the child browser with the highest index.
     *
     * @param detailPage browser showing the hotel detail page
     * @param siteLinkText text used to locate the site's link on the detail page
     * @return the child browser at index {@code childBrowserCount() - 1}
     * @throws Exception if any Watij lookup or click fails
     */
    private static IE openBookingSite(IE detailPage, String siteLinkText) throws Exception {
        detailPage.link("预订网站").click();
        detailPage.link(siteLinkText).click();

        String quotedPrice =
                detailPage.table(new AttributeFinder("class", "bookingTable")).text();
        System.out.println(quotedPrice);

        detailPage.table(new AttributeFinder("class", "bookingTable")).links().get(0).click();
        // Presumably the window just opened is the last child browser -- TODO confirm.
        int count = detailPage.childBrowserCount();
        return detailPage.childBrowser(count - 1);
    }

    /**
     * Closes {@code browser} if it is non-null, reporting (not propagating) any failure.
     *
     * @param browser browser to close; may be {@code null}
     */
    private static void closeQuietly(IE browser) {
        if (browser == null) {
            return;
        }
        try {
            browser.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值