利用HttpClient、HtmlParser简单实现网络爬虫

  1. 下载必备的jar包

  2. 新建一个java项目

    • 解压刚才下载的压缩文件,将以下包导入新建项目

  3. 封装工具包,便于代码复用

    • 这是我自己封装的工具包
    • HttpUtil.java
    package com.yinzhengfei.spider.util;
    
    import java.io.BufferedReader;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    
    import org.apache.commons.io.IOUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.util.EntityUtils;
    
    /**
     * @author	尹正飞
     * @Email	feld_yin@163.com
     * @qq		657812595 
     * @version 2013-3-28 下午4:28:56 
     */
    
    public class HttpUtil {

    	/**
    	 * Fetches the given URL and returns the response body as one string.
    	 * Line separators are dropped (readLine strips them), matching the
    	 * original behavior.
    	 *
    	 * @param httpClient client used to execute the request (shared/reused)
    	 * @param url        absolute URL to fetch
    	 * @return the page body, or {@code null} if the request failed
    	 */
    	public static String getHtml(HttpClient httpClient, String url){
    		BufferedReader br = null;
    		try {
    			HttpGet httpGet = new HttpGet(url);
    			HttpResponse ht = httpClient.execute(httpGet);
    			HttpEntity entity = ht.getEntity();
    			// Some sites declare no charset; fall back to UTF-8 instead of
    			// passing null to InputStreamReader (which would throw NPE).
    			String charset = EntityUtils.getContentCharSet(entity);
    			if (charset == null) {
    				charset = "utf-8";
    			}
    			InputStream in = entity.getContent();
    			br = new BufferedReader(new InputStreamReader(in, charset));
    			StringBuffer html = new StringBuffer();
    			String lineString;
    			while ((lineString = br.readLine()) != null) {
    				html.append(lineString);
    			}
    			return html.toString();
    		} catch (Exception e) {
    			e.printStackTrace();
    			// The original dereferenced a possibly-null buffer after the
    			// catch and threw NPE on any failure; signal failure explicitly.
    			return null;
    		} finally {
    			// Always release the connection's stream, even on error paths.
    			if (br != null) {
    				try {
    					br.close();
    				} catch (Exception ignored) {
    					// best-effort close
    				}
    			}
    		}
    	}

    	/**
    	 * Downloads a binary resource (e.g. an image).
    	 *
    	 * @param httpClient client used to execute the request
    	 * @param url        absolute URL of the resource
    	 * @return the raw bytes, or {@code null} if the request failed
    	 */
    	public static byte[] getImage(HttpClient httpClient, String url){
    		InputStream in = null;
    		try {
    			HttpGet httpGet = new HttpGet(url);
    			HttpResponse ht = httpClient.execute(httpGet);
    			HttpEntity entity = ht.getEntity();
    			in = entity.getContent();
    			return IOUtils.toByteArray(in);
    		} catch (Exception e) {
    			e.printStackTrace();
    			return null;
    		} finally {
    			// Close the stream on every path (original leaked it).
    			if (in != null) {
    				try {
    					in.close();
    				} catch (Exception ignored) {
    					// best-effort close
    				}
    			}
    		}
    	}

    }
    

     
    • ParserHtmlUtil.java

    package com.yinzhengfei.spider.util;
    
    import java.util.ArrayList;
    import java.util.List;
    
    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.nodes.TagNode;
    import org.htmlparser.util.NodeList;
    
    /**
     * @author	尹正飞
     * @Email	feld_yin@163.com
     * @qq		657812595 
     * @version 2013-3-28 下午3:11:13 
     */
    
    public class ParserHtmlUtil {

    	/**
    	 * Extracts every tag of the given type whose attribute matches the
    	 * given value.
    	 *
    	 * @param html	HTML text to scan
    	 * @param tagType	concrete htmlparser tag class to match (exact class, not subclasses)
    	 * @param attributeName	attribute to compare, or null for no attribute filter
    	 * @param attributeValue	required attribute value, or null for no attribute filter
    	 * @return matching tags in document order (possibly empty), or null when
    	 *         {@code html} or {@code tagType} is null
    	 */
    	@SuppressWarnings("unchecked")
    	public static <T extends TagNode> List<T> parserTags(String html, final Class<T> tagType, final String attributeName, final String attributeValue){
    		
    		if(html == null || tagType == null)
    			return null;
    		
    		Parser parser = new Parser();
    		NodeList nodeList = null;
    		try {
    			parser.setInputHTML(html);
    			nodeList = parser.parse(new NodeFilter() {
    				private static final long serialVersionUID = 1L;
    				@Override
    				public boolean accept(Node node) {
    					// Exact class match only, mirroring getClass() != tagType.
    					if(node.getClass() != tagType)
    						return false;
    					
    					// No (complete) attribute constraint: accept every tag of
    					// this type. Using || here also prevents the NPE the
    					// original hit when only one of the two was null.
    					if(attributeName == null || attributeValue == null)
    						return true;
    					
    					T tt = (T)node;
    					return attributeValue.equals(tt.getAttribute(attributeName));
    				}
    			});
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		
    		List<T> list = new ArrayList<T>();
    		// nodeList stays null when parsing failed; the original then threw
    		// NPE at nodeList.size(). Return an empty list instead.
    		if(nodeList != null){
    			for(int i=0; i<nodeList.size(); i++){
    				list.add((T) nodeList.elementAt(i));
    			}
    		}
    		
    		return list;
    	}
    	
    	/**
    	 * Extracts every tag of the given type, with no attribute filter.
    	 *
    	 * @param html	HTML text to scan
    	 * @param tagType	concrete htmlparser tag class to match
    	 * @return matching tags (possibly empty), or null for null arguments
    	 */
    	public static <T extends TagNode> List<T> parserTags(String html, final Class<T> tagType){
    		return parserTags(html, tagType, null, null);
    	}
    	
    	/**
    	 * Extracts the first tag of the given type whose attribute matches the
    	 * given value.
    	 *
    	 * @param html	HTML text to scan
    	 * @param tagType	concrete htmlparser tag class to match
    	 * @param attributeName	attribute to compare
    	 * @param attributeValue	required attribute value
    	 * @return the first match, or null if none
    	 */
    	public static <T extends TagNode> T parserTag(String html, final Class<T> tagType, final String attributeName, final String attributeValue){
    		List<T> list = parserTags(html, tagType, attributeName, attributeValue);
    		if(list != null && list.size() > 0)
    			return list.get(0);
    		return null;
    	}
    	
    	/**
    	 * Extracts the first tag of the given type.
    	 *
    	 * @param html	HTML text to scan
    	 * @param tagType	concrete htmlparser tag class to match
    	 * @return the first match, or null if none
    	 */
    	public static <T extends TagNode> T parserTag(String html, final Class<T> tagType){
    		List<T> list = parserTags(html, tagType, null, null);
    		if(list != null && list.size() > 0)
    			return list.get(0);
    		return null;
    	}

    }
    


  4. 抓取一个网站信息(抓取时,先要了解此网站页面信息的规律,这个可以用firebug。我这里测试抓取的是薄荷网食物库常见菜肴)

  • 根据网站页面封装一下抓取信息(我这里只演示抓取菜肴分类、菜肴名称、菜肴能量、菜肴制作烹饪类型(其中菜肴分类、烹饪类型是可以再封装,我这里就不再详细封装了))

Dish.java

/**
 * @author	尹正飞
 * @Email	feld_yin@163.com
 * @qq		657812595 
 * @version 2013-4-2 下午4:40:49 
 */

/**
 * Value holder for one scraped dish: its category, name, energy
 * (calories per 100g) and cooking type. Plain mutable bean so the
 * crawler can fill fields in as it discovers them.
 */
public class Dish {

	// Category link text the dish was found under, e.g. a cuisine group.
	private String category;
	// Dish name as shown on its detail page link.
	private String name;
	// Energy text, including the unit suffix appended by the crawler.
	private String energy;
	// Cooking/preparation description, when the page provides one.
	private String cookType;

	public String getCategory() {
		return category;
	}

	public void setCategory(String category) {
		this.category = category;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getEnergy() {
		return energy;
	}

	public void setEnergy(String energy) {
		this.energy = energy;
	}

	public String getCookType() {
		return cookType;
	}

	public void setCookType(String cookType) {
		this.cookType = cookType;
	}

	/** Renders all four fields in the fixed "Dish [...]" format. */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append("Dish [category=").append(category);
		sb.append(", name=").append(name);
		sb.append(", energy=").append(energy);
		sb.append(", cookType=").append(cookType);
		sb.append("]");
		return sb.toString();
	}

}

  • 这里只是简单的抓取,我就没太多的封装,大家可以根据自己需求封装

Test.java

import java.util.ArrayList;
import java.util.List;


import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.Span;

import com.yinzhengfei.spider.util.HttpUtil;
import com.yinzhengfei.spider.util.ParserHtmlUtil;


/**
 * @author	尹正飞
 * @Email	feld_yin@163.com
 * @qq		657812595 
 * @version 2013-3-28 上午10:50:59 
 */

/**
 * Demo crawler for the boohee.com food database: walks every category
 * linked from /food, pages through each category's dish list, and prints
 * one Dish per detail page.
 */
public class Test {

	/**
	 * Entry point: crawls all "green2" category links, then every dish in
	 * each category, printing the filled Dish objects.
	 */
	public static void main(String[] args) {
		try {

			HttpClient httpClient = new DefaultHttpClient();

			String baseUrl = "http://www.boohee.com";
			String html = HttpUtil.getHtml(httpClient, baseUrl + "/food");
			List<LinkTag> linkList = ParserHtmlUtil.parserTags(html, LinkTag.class, "class", "green2");
			for (LinkTag lt : linkList) {
				System.out.println(lt.getLink() + "------------" + lt.getLinkText());
				for (LinkTag l : pageFood(httpClient, baseUrl, lt)) {
					Dish dish = new Dish();
					dish.setCategory(lt.getLinkText());
					dish.setName(l.getLinkText());
					System.out.println(foodInfo(httpClient, baseUrl, l, dish));
				}
				System.out.println("+++++++++++++++++++++++++++++++++++++++++++++");
			}

		} catch (Exception e) {
			e.printStackTrace();
		}

	}

	/**
	 * Collects all dish links of one category, following "next_page" links
	 * recursively. NOTE(review): recursion depth equals the page count, so a
	 * huge category could overflow the stack — rewrite iteratively if needed.
	 *
	 * @param httpClient shared client
	 * @param baseUrl    site root, prepended to the relative link
	 * @param lt         link to a category (or pager) page
	 * @return all "gray1" dish links across every page of the category
	 */
	public static List<LinkTag> pageFood(HttpClient httpClient, String baseUrl, LinkTag lt) {

		List<LinkTag> list = new ArrayList<LinkTag>();

		String foodListHtml = HttpUtil.getHtml(httpClient, baseUrl + lt.getLink());
		// Dish links on the current page.
		list.addAll(ParserHtmlUtil.parserTags(foodListHtml, LinkTag.class, "class", "gray1"));

		// Follow the pager until there is no "next page" link.
		LinkTag pageNextPageNum = ParserHtmlUtil.parserTag(foodListHtml, LinkTag.class, "class", "next_page");
		if (pageNextPageNum != null) {
			list.addAll(pageFood(httpClient, baseUrl, pageNextPageNum));
		}

		return list;
	}

	/**
	 * Fills in energy and cooking type from a dish's detail page; prints any
	 * section heading this demo does not recognize so new sections show up.
	 *
	 * @param httpClient shared client
	 * @param baseUrl    site root, prepended to the relative link
	 * @param lt         link to the dish detail page
	 * @param dish       dish to fill in (category/name already set by caller)
	 * @return the same dish instance, for printing convenience
	 */
	public static Dish foodInfo(HttpClient httpClient, String baseUrl, LinkTag lt, Dish dish) {

		String foodInfoHtml = HttpUtil.getHtml(httpClient, baseUrl + lt.getLink());

		// Energy ("热量"). Guard against a missing span: the original code
		// dereferenced it unconditionally and threw NPE on pages without it.
		Span energySpan = ParserHtmlUtil.parserTag(foodInfoHtml, Span.class, "class", "stress red1");
		if (energySpan != null) {
			dish.setEnergy(energySpan.getStringText() + " 大卡(100克)");
		}

		// Cooking instructions ("做法").
		ParagraphTag cookingP = ParserHtmlUtil.parserTag(foodInfoHtml, ParagraphTag.class, "style", "border:0;");
		if (cookingP != null) {
			dish.setCookType(cookingP.getStringText().trim());
		}

		// Walk the info sections and print only headings we do not recognize.
		List<Div> ingredientsDivList = ParserHtmlUtil.parserTags(foodInfoHtml, Div.class, "class", "part divide10");
		for (Div div : ingredientsDivList) {
			HeadingTag hTag = ParserHtmlUtil.parserTag(div.getChildrenHTML(), HeadingTag.class);
			if (hTag == null) {
				break; // original behavior: stop at the first div without a heading
			}
			String hStr = hTag.getStringText();
			if (!isKnownHeading(hStr, dish.getName())) {
				System.out.println(hStr);
			}
		}

		return dish;

	}

	/**
	 * True for section headings this demo deliberately skips (ingredients,
	 * seasoning, etc. — not scraped here), including the two headings derived
	 * from the dish name ("X做法", "X相关食物"). Replaces the original
	 * ten-branch empty if/else chain.
	 */
	private static boolean isKnownHeading(String hStr, String dishName) {
		String base = dishName.split(",")[0];
		return "主料".equals(hStr) || "原料".equals(hStr) || "辅料".equals(hStr)
				|| "详细说明".equals(hStr) || "调料".equals(hStr) || "类别".equals(hStr)
				|| "口味".equals(hStr) || "食用效果".equals(hStr)
				|| (base + "做法").equals(hStr) || (base + "相关食物").equals(hStr);
	}

}

  • 运行结果(数据老多,我就截个屏,基本上它上面的家常菜肴都给它抓下来了)

  • 上面有些菜肴信息我是没有抓取的,如图

有兴趣的话可以将上述菜肴信息封装到Dish.java文件中,并在上图空白处实现抓取页面相应信息代码。

注:上述抓取代码用到递归,大数据时有可能内存溢出,有兴趣者可以再完善下


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值