Web Scraping with Web-Harvest
(2012-02-11 10:34:24)
1. Set up the project and import the required libraries
Write the configuration file:
<?xml version="1.0" encoding="UTF-8"?>
<config charset="UTF-8">
  <!-- write the extracted apartment listings to an output XML file -->
  <file action="write" path="archstone/listings.xml" charset="UTF-8">
    <![CDATA[ <catalog> ]]>
    <!-- loop through all states collected from the search drop-down on the home page -->
    <loop item="state">
      <list>
        <xpath expression="//select[@id='ctl00_ctl09_m_findAnApartment_cboStateSearchAsyncAction']/option[position()>1]/@value">
          <html-to-xml>
            <http url="http://www.archstoneapartments.com"></http>
          </html-to-xml>
        </xpath>
      </list>
      <body>
        <empty>
          <var-def name="listings" id="listings">
            <xpath expression="//div[@id='ctl00_DefaultContent_vBodyResults']//tr[(@class='item' or @class='altitem') and (position() mod 2 = 0)]">
              <html-to-xml>
                <http url="http://www.archstoneapartments.com/Search_Results.htm?state=${state}"></http>
              </html-to-xml>
            </xpath>
          </var-def>
        </empty>
        <!-- iterate over the collected apartment rows and extract the desired fields -->
        <loop item="item" index="i">
          <list>
            <var name="listings"></var>
          </list>
          <body>
            <xquery>
              <xq-param name="item" type="node()">
                <var name="item"></var>
              </xq-param>
              <xq-expression>
                <![CDATA[
                  declare variable $item as node() external;
                  let $headline := data($item//td[@class='details-column']/div/div[1]/a/text())
                  let $addr1 := data($item//td[@class='details-column']/div/div[2]/text())
                  let $area := data($item//td[@class='details-column']/div/div[3]/text())
                  return
                    <listing>
                      <headline>{normalize-space($headline)}</headline>
                      <address>{normalize-space($addr1)}</address>
                      <area>{normalize-space($area)}</area>
                    </listing>
                ]]>
              </xq-expression>
            </xquery>
          </body>
        </loop>
      </body>
    </loop>
    <![CDATA[ </catalog> ]]>
  </file>
</config>
2. Write the Java code
import java.io.IOException;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;

public class Test {
    public static void main(String[] args) throws IOException {
        // load the scraping configuration and create a scraper with c:/tmp/ as its working directory
        ScraperConfiguration config = new ScraperConfiguration("c:/archstone.xml");
        Scraper scraper = new Scraper(config, "c:/tmp/");
        scraper.setDebug(true);
        System.out.println("running...");
        long startTime = System.currentTimeMillis();
        scraper.execute();
        System.out.println("time elapsed: " + (System.currentTimeMillis() - startTime));
    }
}
3. View the execution results
<catalog>
<listing>
<headline>Archstone Arrowhead</headline>
<address>7701 W. St. John Rd.</address>
<area>Glendale, AZ 85308</area>
</listing>
<listing>
<headline>Archstone Desert Harbor</headline>
<address>8885 W Thunderbird Rd.</address>
<area>Peoria, AZ 85381</area>
</listing>
<listing>
<headline>Archstone Tempe Groves</headline>
<address>909 W. Grove Pkwy.</address>
<area>Tempe, AZ 85283</area>
</listing>
<listing>
<headline>Ironwood Apartments at SanTan</headline>
<address>2910 S. Greenfield Rd.</address>
<area>Gilbert, AZ 85295</area>
</listing>
...
</catalog>
4. Conclusion
Web-Harvest is quite handy, and the results can be processed and analyzed further. How do you get at the extracted data from Java afterwards? See the simple snippet below: retrieve a variable defined in the configuration file through the scraper's context and convert it into a Java value:
// Variable is Web-Harvest's wrapper type for values held in the scraper context
Variable title = (Variable) scraper.getContext().get("headline");
return title.toString();
After you are done with a Scraper, remember to release it with scraper.dispose(); otherwise the memory it holds keeps growing and you will eventually hit an OutOfMemoryError!
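A minimal sketch of that pattern, assuming the configuration exposes a variable named "headline" as in the snippet above; the class name RunOnce and the import path for Variable are assumptions based on Web-Harvest 1.x and worth checking against your jar. Wrapping the release in finally keeps dispose() from being skipped if execute() throws:
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.variables.Variable;   // assumed package for the Variable type

public class RunOnce {
    public static void main(String[] args) throws Exception {
        ScraperConfiguration config = new ScraperConfiguration("c:/archstone.xml");
        Scraper scraper = new Scraper(config, "c:/tmp/");
        try {
            scraper.execute();
            // read a variable produced by the configuration out of the scraper context
            Variable headline = (Variable) scraper.getContext().get("headline");
            System.out.println(headline == null ? "(not set)" : headline.toString());
        } finally {
            // always release the scraper, otherwise memory accumulates across runs
            scraper.dispose();
        }
    }
}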
Chinese content is handled well too; the key is to choose the character set sensibly (the charset attributes on <config> and <file> in the configuration above)!
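As a small illustration of keeping character sets consistent end to end, the sketch below reads the generated listings back with the same UTF-8 charset declared on the <file> element. The path c:/tmp/archstone/listings.xml is an assumption that the relative path in the configuration is resolved against the working directory passed to the Scraper:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class ReadListings {
    public static void main(String[] args) throws IOException {
        // decode with the same charset that <file charset="UTF-8"> used when writing
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("c:/tmp/archstone/listings.xml"), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }
    }
}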