四十一、页面爬取处理

程序媛征男友

于 2014-10-11 17:19:45 发布

阅读量705

点赞数

分类专栏： Java

本文链接：https://blog.csdn.net/zhaonanemail/article/details/39997023

版权

Java 专栏收录该内容

19 篇文章 0 订阅

订阅专栏

一、页面的爬取

    如何获取网页内容，不明觉厉呀，原来就是一个HttpClient搞定。 比如下面获取聚划算首页的例子，简洁完整地展示了HttpClient的使用过程。

import java.io.IOException;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
 * Minimal demonstration of fetching a web page with Commons HttpClient 3.x:
 * issue a GET request, decode the response bytes and print the page source.
 */
public class HttpClientTest {
	/** Charset name used to decode the raw response bytes. */
	private static final String PAGE_CHARSET = "GBK";

	public static void main(String[] args){
		spiderPage("http://ju.taobao.com/");
	}

	/**
	 * Fetches {@code url} with an HTTP GET and prints the decoded body to standard output.
	 * <p>
	 * Nothing is printed to standard output when the server answers with a status other
	 * than 200 or when no response body is available; the problem is reported on
	 * standard error instead. The connection is always released.
	 *
	 * @param url absolute URL of the page to fetch
	 */
	private static void spiderPage(String url) {
		// 1. Create the HttpClient instance.
		HttpClient httpClient = new HttpClient();
		// 2. Create the GET method for the target URL.
		GetMethod getMethod = new GetMethod(url);
		// 3. Install the library's default retry handler.
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
		try {
			// 4. Execute the request.
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				// An error page is not the content we asked for, so do not process it.
				System.err.println("Method failed: " + getMethod.getStatusLine());
				return;
			}
			// 5. Read and process the content.
			byte[] responseBody = getMethod.getResponseBody();
			if (responseBody == null) {
				// getResponseBody() yields null when no body is available.
				System.err.println("No response body for: " + url);
				return;
			}
			System.out.println(new String(responseBody, PAGE_CHARSET));
		} catch (HttpException e) {
			System.err.println("Fatal protocol violation while fetching " + url);
			e.printStackTrace();
		} catch (IOException e) {
			System.err.println("Fatal transport error while fetching " + url);
			e.printStackTrace();
		} finally {
			// 6. Release the connection (also runs on the early returns above).
			getMethod.releaseConnection();
		}
	}
}

爬取到的数据很长，截取了其中有用的数据部分，截图如下（为了紧凑把行首空格去掉了）

从上图可以看到，能得到商品名称、价格、折扣等信息。其实在<a href>标签里还有商品的item_id，据此就可以得到商品的所有信息。

为了展示直观，我们以取商品的名称列表为例。商品数据位于"class"属性值为"clearfix"的<ul>标签里，<ul>标签下有多个<li>标签。具体标签结构如下：

二、数据的处理

通过第一步我们已经爬取到了数据，即String形式的页面源代码。下面介绍如何处理这些数据，从中拆分出需要的内容。同样也很简单，只需要使用org.htmlparser.Parser。


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Fetches a page with Commons HttpClient 3.x and extracts the item titles from it
 * with org.htmlparser: the titles are read from the {@code title} attribute of the
 * {@code <h3>} found under {@code ul.clearfix > li > div > a}.
 */
public class HttpClientTest2 {

	/** Charset name used to decode the raw response bytes. */
	private static final String PAGE_CHARSET = "GBK";

	/**
	 * Fetches {@code url} with an HTTP GET, prints the decoded page to standard output
	 * and returns it.
	 *
	 * @param url absolute URL of the page to fetch
	 * @return the decoded page source, or an empty string when the request fails,
	 *         the status is not 200, or no response body is available
	 */
	private static String spiderPage(String url) {
		// 1. Create the HttpClient instance.
		HttpClient httpClient = new HttpClient();
		// 2. Create the GET method for the target URL.
		GetMethod getMethod = new GetMethod(url);
		// 3. Install the library's default retry handler.
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
		String pageStr = "";
		try {
			// 4. Execute the request.
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				// An error page is not the content we asked for, so do not hand it to the parser.
				System.err.println("Method failed: " + getMethod.getStatusLine());
				return pageStr;
			}
			// 5. Read the content; decode it once and reuse the result.
			byte[] responseBody = getMethod.getResponseBody();
			if (responseBody == null) {
				// getResponseBody() yields null when no body is available.
				System.err.println("No response body for: " + url);
				return pageStr;
			}
			pageStr = new String(responseBody, PAGE_CHARSET);
			System.out.println(pageStr);
		} catch (HttpException e) {
			System.err.println("Fatal protocol violation while fetching " + url);
			e.printStackTrace();
		} catch (IOException e) {
			System.err.println("Fatal transport error while fetching " + url);
			e.printStackTrace();
		} finally {
			// 6. Release the connection (also runs on the early returns above).
			getMethod.releaseConnection();
		}

		return pageStr;
	}

	/**
	 * Extracts the item titles from the page source.
	 * <p>
	 * Only the first {@code <ul class="clearfix">} is inspected. An {@code <li>} that does
	 * not have the expected {@code div > a > h3[title]} structure is skipped, so one
	 * malformed entry does not discard the entries that follow it.
	 *
	 * @param pageStr page source as returned by {@link #spiderPage(String)}; may be null or empty
	 * @return the titles in document order; empty when nothing matches or parsing fails
	 */
	private static List<String> processData(String pageStr){
		List<String> itemTitles = new ArrayList<String>();
		if (pageStr == null || pageStr.length() == 0) {
			return itemTitles;
		}
		// 1. Create the parser.
		Parser parser = new Parser();
		try {
			parser.setInputHTML(pageStr);
			// 2. Build the filter: tag name "ul" AND attribute class="clearfix".
			AndFilter itemFilter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class","clearfix"));
			// 3. Collect every <ul> node whose "class" attribute is "clearfix".
			NodeList ulList = parser.extractAllNodesThatMatch(itemFilter);
			if (ulList == null || ulList.size() == 0) {
				return itemTitles;
			}
			Tag ulTag = (Tag) ulList.elementAt(0);
			// Walk the <li> children of the <ul>, one item per <li>.
			for (Tag liTag : getChildren(ulTag, "li")) {
				String title = extractTitle(liTag);
				if (title != null) {
					itemTitles.add(title);
				}
			}
		} catch (ParserException e) {
			System.err.println("Failed to parse page source");
			e.printStackTrace();
		}
		return itemTitles;
	}

	/**
	 * Descends {@code li > div > a > h3} (first match at each level) and reads the
	 * {@code title} attribute of the {@code <h3>}.
	 *
	 * @return the title, or null when any level of the structure or the attribute is missing
	 */
	private static String extractTitle(Tag liTag) {
		Tag divTag = firstChild(liTag, "div");
		Tag linkTag = firstChild(divTag, "a");
		Tag headingTag = firstChild(linkTag, "h3");
		return headingTag == null ? null : headingTag.getAttribute("title");
	}

	/** Returns the first direct child tag of {@code parent} named {@code tagname}, or null if there is none. */
	private static Tag firstChild(Tag parent, String tagname) {
		List<Tag> matches = getChildren(parent, tagname);
		return matches.isEmpty() ? null : matches.get(0);
	}

	/**
	 * Returns the direct children of {@code parent} that are tags named {@code tagname}
	 * (case-insensitive). Child nodes that are not tags are ignored.
	 *
	 * @param parent  tag whose children are inspected; null yields an empty list
	 * @param tagname tag name to match
	 * @return the matching child tags in document order, never null
	 */
	public static List<Tag> getChildren(Tag parent, String tagname) {
		List<Tag> list = new ArrayList<Tag>();
		if (parent == null) {
			return list;
		}

		NodeList nList = parent.getChildren();
		if (nList == null) {
			return list;
		}
		for (int i = 0; i < nList.size(); i++) {
			// Test the type explicitly instead of casting and swallowing the ClassCastException.
			Object child = nList.elementAt(i);
			if (child instanceof Tag) {
				Tag tag = (Tag) child;
				if (tag.getTagName().equalsIgnoreCase(tagname)) {
					list.add(tag);
				}
			}
		}
		return list;
	}

	public static void main(String[] args){
		String pageStr = spiderPage("http://ju.taobao.com/");
		List<String> titleList = processData(pageStr);
		for (String title : titleList) {
			System.out.println(title);
		}
	}
}

程序运行结果如下：