一、页面的爬取
如何获取网页内容?原来用一个 HttpClient 就能搞定。比如下面获取聚划算首页的例子,简洁完整地展示了 HttpClient 的使用过程。
import java.io.IOException;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class HttpClientTest {
    public static void main(String[] args) {
        spiderPage("http://ju.taobao.com/");
    }

    /**
     * Fetches the page at {@code url} with an HTTP GET and prints the body
     * to stdout, decoded as GBK (the encoding used by the target site).
     * Errors are reported to stderr; the connection is always released.
     */
    private static void spiderPage(String url) {
        // 1. Create an HttpClient instance.
        HttpClient httpClient = new HttpClient();
        // 2. Create a GET method for the target URL.
        GetMethod getMethod = new GetMethod(url);
        // 3. Install the library's default retry handler.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // 4. Execute the request.
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                // Bug fix: the original fell through and read the body even on
                // a non-200 status; bail out instead (finally still runs).
                return;
            }
            // 5. Read the response body and print it.
            byte[] responseBody = getMethod.getResponseBody();
            System.out.println(new String(responseBody, "GBK"));
        } catch (HttpException e) {
            System.err.println("HTTP protocol error fetching " + url);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("I/O error fetching " + url);
            e.printStackTrace();
        } finally {
            // 6. Always release the connection back to the pool.
            getMethod.releaseConnection();
        }
    }
}
爬取到的数据很长,截取了其中有用的数据部分,截图如下(为了紧凑把行首空格去掉了)
从上图可以看到,能得到商品名称、价格、折扣等信息。其实在<a href>标签里有商品的item_id,据此就可得到商品的所有信息。
为了展示直观,我们以取商品的名称列表为例。商品数据位于"class"属性值为"clearfix"的<ul>标签里,<ul>标签下有多个<li>标签。具体标签结构如下:
<ul class="clearfix">
<li>
<div>
<a href>
<h3 title>
</div>
</li>
<li>
<div>
<a href>
<h3 title>
</div>
</li>
</ul>
二、数据的处理
通过第一步我们已经爬取到了数据——String 形式的页面源代码。下面介绍如何处理数据,拆分出需要的部分。同样也很简单,只需要使用 org.htmlparser.Parser 即可。
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class HttpClientTest2 {
    /**
     * Fetches the page at {@code url} with an HTTP GET and returns its body
     * decoded as GBK (the encoding used by the target site).
     *
     * @param url page to fetch
     * @return the page source, or "" if the request failed
     */
    private static String spiderPage(String url) {
        // 1. Create an HttpClient instance.
        HttpClient httpClient = new HttpClient();
        // 2. Create a GET method for the target URL.
        GetMethod getMethod = new GetMethod(url);
        // 3. Install the library's default retry handler.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // 4. Execute the request.
        String pageStr = "";
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                // Bug fix: the original fell through and read the body even on
                // a non-200 status; return "" instead (finally still runs).
                return pageStr;
            }
            // 5. Read and decode the response body.
            // (Removed the debug println that dumped the whole page to stdout.)
            byte[] responseBody = getMethod.getResponseBody();
            pageStr = new String(responseBody, "GBK");
        } catch (HttpException e) {
            System.err.println("HTTP protocol error fetching " + url);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("I/O error fetching " + url);
            e.printStackTrace();
        } finally {
            // 6. Always release the connection back to the pool.
            getMethod.releaseConnection();
        }
        return pageStr;
    }

    /**
     * Extracts the item titles from the page source: finds the first
     * {@code <ul class="clearfix">}, then for each {@code <li>} reads the
     * {@code title} attribute of the nested {@code <h3>}
     * (structure: ul > li > div > a > h3).
     *
     * @param pageStr raw HTML page source
     * @return list of item titles; empty if nothing matched or parsing failed
     */
    private static List<String> processData(String pageStr) {
        List<String> itemTitles = new ArrayList<String>();
        // 1. Create the parser and feed it the page source.
        Parser parser = new Parser();
        try {
            parser.setInputHTML(pageStr);
            // 2. Filter: <ul> tags whose class attribute is "clearfix".
            AndFilter itemFilter = new AndFilter(new TagNameFilter("ul"),
                    new HasAttributeFilter("class", "clearfix"));
            // 3. Collect all matching <ul> nodes.
            NodeList ulList = parser.extractAllNodesThatMatch(itemFilter);
            if (ulList != null && ulList.size() > 0) {
                Tag ulTag = (Tag) ulList.elementAt(0);
                // Walk each <li> under the <ul>.
                List<Tag> liTags = getChildren(ulTag, "li");
                for (Tag liTag : liTags) {
                    // Descend li > div > a > h3, skipping malformed entries
                    // instead of throwing IndexOutOfBoundsException.
                    List<Tag> divTags = getChildren(liTag, "div");
                    if (divTags.isEmpty()) {
                        continue;
                    }
                    List<Tag> ahrefTags = getChildren(divTags.get(0), "a");
                    if (ahrefTags.isEmpty()) {
                        continue;
                    }
                    List<Tag> h3Tags = getChildren(ahrefTags.get(0), "h3");
                    if (h3Tags.isEmpty()) {
                        continue;
                    }
                    String title = h3Tags.get(0).getAttribute("title");
                    if (title != null) {
                        itemTitles.add(title);
                    }
                }
            }
        } catch (ParserException e) {
            System.err.println("Failed to parse page source");
            e.printStackTrace();
        }
        return itemTitles;
    }

    /**
     * Returns the direct child tags of {@code parent} whose tag name equals
     * {@code tagname} (case-insensitive). Non-tag children (text nodes,
     * comments) are skipped via an instanceof check rather than the original
     * empty catch around a ClassCastException.
     *
     * @param parent  tag whose children are inspected
     * @param tagname tag name to match, e.g. "li"
     * @return matching child tags; empty list if there are none
     */
    public static List<Tag> getChildren(Tag parent, String tagname) {
        List<Tag> list = new ArrayList<Tag>();
        NodeList nList = parent.getChildren();
        if (nList != null) {
            for (int i = 0; i < nList.size(); i++) {
                if (nList.elementAt(i) instanceof Tag) {
                    Tag tag = (Tag) nList.elementAt(i);
                    if (tag.getTagName().equalsIgnoreCase(tagname)) {
                        list.add(tag);
                    }
                }
            }
        }
        return list;
    }

    public static void main(String[] args) {
        String pageStr = spiderPage("http://ju.taobao.com/");
        List<String> titleList = processData(pageStr);
        for (String title : titleList) {
            System.out.println(title);
        }
    }
}
程序运行结果如下: