-
下载必备的jar包
- HttpClient官方地址:http://hc.apache.org/downloads.cgi
- HtmlParser官方地址:http://sourceforge.net/projects/htmlparser/files/
-
新建一个java项目
- 解压刚才下载的压缩文件,将以下包导入新建项目
-
封装工具包,便于代码复用
- 这是我自己封装的工具包
- HttpUtil.java
package com.yinzhengfei.spider.util;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

/**
 * Helpers that issue HTTP GET requests through a caller-supplied
 * {@link HttpClient} and return the response body as text or as raw bytes.
 *
 * @author 尹正飞
 * @Email feld_yin@163.com
 * @qq 657812595
 * @version 2013-3-28 4:28:56 PM
 */
public class HttpUtil {

    /** Charset used when the response does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Fetches {@code url} with a GET request and returns the body as text.
     * <p>
     * The body is read line by line and the lines are concatenated without
     * their line terminators. The charset declared by the response is used
     * when it is present and supported by this JVM; otherwise utf-8 is used.
     *
     * @param httpClient client used to execute the request
     * @param url        address to fetch
     * @return the response body, or {@code null} if the request fails or the
     *         response carries no entity (the failure is printed to stderr)
     */
    public static String getHtml(HttpClient httpClient, String url) {
        InputStream in = null;
        BufferedReader reader = null;
        try {
            HttpResponse response = httpClient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            in = entity.getContent();
            reader = new BufferedReader(new InputStreamReader(in, resolveCharset(entity)));

            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } catch (Exception e) {
            e.printStackTrace();
            // Previously this path fell through to html.toString() on a null
            // buffer and threw a NullPointerException.
            return null;
        } finally {
            // Close on every path, including failures while reading.
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Fetches {@code url} with a GET request and returns the raw body bytes.
     *
     * @param httpClient client used to execute the request
     * @param url        address to fetch
     * @return the response bytes, or {@code null} if the request fails or the
     *         response carries no entity (the failure is printed to stderr)
     */
    public static byte[] getImage(HttpClient httpClient, String url) {
        InputStream in = null;
        try {
            HttpResponse response = httpClient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            in = entity.getContent();
            return IOUtils.toByteArray(in);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Picks the charset for decoding {@code entity}: the one declared by the
     * response when it names a charset this JVM supports, utf-8 otherwise.
     */
    private static String resolveCharset(HttpEntity entity) {
        String declared = EntityUtils.getContentCharSet(entity);
        if (declared == null) {
            // Some sites send no charset at all.
            return DEFAULT_CHARSET;
        }
        try {
            if (Charset.isSupported(declared)) {
                return declared;
            }
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name; fall back to the default.
        }
        return DEFAULT_CHARSET;
    }
}
- ParserHtmlUtil.java
package com.yinzhengfei.spider.util;

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;

/**
 * Helpers that extract tags of a given class from an HTML string using
 * HTML Parser, optionally filtered by one attribute.
 *
 * @author 尹正飞
 * @Email feld_yin@163.com
 * @qq 657812595
 * @version 2013-3-28 3:11:13 PM
 */
public class ParserHtmlUtil {

    /**
     * Extracts every tag whose runtime class is exactly {@code tagType} and
     * whose attribute matches.
     * <ul>
     * <li>Both {@code attributeName} and {@code attributeValue} null: every
     *     tag of the type is returned.</li>
     * <li>Both non-null: the attribute must equal {@code attributeValue}.</li>
     * <li>Only {@code attributeName} given: the attribute merely has to be
     *     present.</li>
     * <li>Only {@code attributeValue} given: nothing matches.</li>
     * </ul>
     *
     * @param html           HTML text to search
     * @param tagType        tag class to look for
     * @param attributeName  attribute name, may be null
     * @param attributeValue required attribute value, may be null
     * @return the matching tags in the order the parser reports them; an
     *         empty list if {@code html} or {@code tagType} is null or if
     *         parsing fails (the failure is printed to stderr). Never null.
     */
    public static <T extends TagNode> List<T> parserTags(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        List<T> list = new ArrayList<T>();
        if (html == null || tagType == null) {
            return list;
        }

        NodeList nodeList;
        try {
            Parser parser = new Parser();
            parser.setInputHTML(html);
            nodeList = parser.parse(new NodeFilter() {
                private static final long serialVersionUID = 1L;

                @Override
                public boolean accept(Node node) {
                    // Exact class match, so subclasses of tagType are rejected.
                    if (node.getClass() != tagType) {
                        return false;
                    }
                    if (attributeName == null) {
                        // No name: accept everything when no filter was
                        // requested, nothing when only a value was given.
                        return attributeValue == null;
                    }
                    String actual = ((TagNode) node).getAttribute(attributeName);
                    if (attributeValue == null) {
                        return actual != null;
                    }
                    return attributeValue.equals(actual);
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
            // Previously nodeList stayed null here and the loop below threw
            // a NullPointerException.
            return list;
        }

        if (nodeList != null) {
            for (int i = 0; i < nodeList.size(); i++) {
                // Safe: the filter only accepts nodes whose class is tagType.
                list.add(tagType.cast(nodeList.elementAt(i)));
            }
        }
        return list;
    }

    /**
     * Extracts every tag whose runtime class is exactly {@code tagType}.
     *
     * @param html    HTML text to search
     * @param tagType tag class to look for
     * @return the matching tags; empty if none, never null
     */
    public static <T extends TagNode> List<T> parserTags(String html, final Class<T> tagType) {
        return parserTags(html, tagType, null, null);
    }

    /**
     * Extracts the first tag of {@code tagType} whose attribute matches; see
     * {@link #parserTags(String, Class, String, String)} for the matching rules.
     *
     * @param html           HTML text to search
     * @param tagType        tag class to look for
     * @param attributeName  attribute name, may be null
     * @param attributeValue required attribute value, may be null
     * @return the first matching tag, or {@code null} if there is none
     */
    public static <T extends TagNode> T parserTag(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        List<T> list = parserTags(html, tagType, attributeName, attributeValue);
        return list.isEmpty() ? null : list.get(0);
    }

    /**
     * Extracts the first tag whose runtime class is exactly {@code tagType}.
     *
     * @param html    HTML text to search
     * @param tagType tag class to look for
     * @return the first matching tag, or {@code null} if there is none
     */
    public static <T extends TagNode> T parserTag(String html, final Class<T> tagType) {
        return parserTag(html, tagType, null, null);
    }
}
-
抓取一个网站信息(抓取时,先要了解此网站页面信息的规律,这个可以用firebug,我这里测试抓取的是薄荷网食物库常见菜肴)
- 根据网站页面封装一下抓取信息(我这里只演示抓取菜肴分类、菜肴名称、菜肴能量、菜肴制作烹饪类型(其中菜肴分类、烹饪类型是可以再封装,我这里就不再详细封装了))
Dish.java
/**
 * Mutable value holder for one scraped dish: its category, name, energy
 * text and cooking type. All fields are plain strings and start as null.
 *
 * @author 尹正飞
 * @Email feld_yin@163.com
 * @qq 657812595
 * @version 2013-4-2 4:40:49 PM
 */
public class Dish {

    private String category;
    private String name;
    private String energy;
    private String cookType;

    /** @return the dish category, or null if not set */
    public String getCategory() {
        return this.category;
    }

    /** @return the dish name, or null if not set */
    public String getName() {
        return this.name;
    }

    /** @return the energy text, or null if not set */
    public String getEnergy() {
        return this.energy;
    }

    /** @return the cooking type, or null if not set */
    public String getCookType() {
        return this.cookType;
    }

    /** @param category the dish category to store */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @param name the dish name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param energy the energy text to store */
    public void setEnergy(String energy) {
        this.energy = energy;
    }

    /** @param cookType the cooking type to store */
    public void setCookType(String cookType) {
        this.cookType = cookType;
    }

    /**
     * Renders all four fields in the form
     * {@code Dish [category=..., name=..., energy=..., cookType=...]};
     * unset fields are printed as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Dish [category=");
        text.append(category);
        text.append(", name=").append(name);
        text.append(", energy=").append(energy);
        text.append(", cookType=").append(cookType);
        text.append("]");
        return text.toString();
    }
}
- 这里只是简单的抓取,我就没太多的封装,大家可以根据自己需求封装
Test.java
import java.util.ArrayList; import java.util.List; import org.apache.http.client.HttpClient; import org.apache.http.impl.client.DefaultHttpClient; import org.htmlparser.tags.Div; import org.htmlparser.tags.HeadingTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.ParagraphTag; import org.htmlparser.tags.Span; import com.yinzhengfei.spider.util.HttpUtil; import com.yinzhengfei.spider.util.ParserHtmlUtil; /** * @author 尹正飞 * @Email feld_yin@163.com * @qq 657812595 * @version 2013-3-28 上午10:50:59 */ public class Test { /** * @param args */ public static void main(String[] args) { try { HttpClient httpClient = new DefaultHttpClient(); String baseUrl = "http://www.boohee.com"; String html = HttpUtil.getHtml(httpClient, baseUrl+"/food"); List<LinkTag> linkList = ParserHtmlUtil.parserTags(html, LinkTag.class, "class", "green2"); for(LinkTag lt:linkList){ System.out.println(lt.getLink()+"------------"+lt.getLinkText()); for(LinkTag l:pageFood(httpClient, baseUrl, lt)){ Dish dish = new Dish(); dish.setCategory(lt.getLinkText()); dish.setName(l.getLinkText()); System.out.println(foodInfo(httpClient, baseUrl, l,dish)); } System.out.println("+++++++++++++++++++++++++++++++++++++++++++++"); } } catch (Exception e) { e.printStackTrace(); } } public static List<LinkTag> pageFood(HttpClient httpClient,String baseUrl, LinkTag lt){ List<LinkTag> list = new ArrayList<LinkTag>(); String foodListHtml = HttpUtil.getHtml(httpClient, baseUrl+lt.getLink()); //采集当前页的信息 list.addAll(ParserHtmlUtil.parserTags(foodListHtml, LinkTag.class, "class", "gray1")); //读取有多少页 LinkTag pageNextPageNum = ParserHtmlUtil.parserTag(foodListHtml, LinkTag.class,"class","next_page"); if(pageNextPageNum != null){ list.addAll(pageFood(httpClient, baseUrl, pageNextPageNum)); } return list; } public static Dish foodInfo(HttpClient httpClient, String baseUrl, LinkTag lt, Dish dish){ String foodInfoHtml = HttpUtil.getHtml(httpClient, baseUrl+lt.getLink()); //热量 Span energySpan = 
ParserHtmlUtil.parserTag(foodInfoHtml, Span.class, "class", "stress red1"); dish.setEnergy(energySpan.getStringText() + " 大卡(100克)"); //做法 ParagraphTag cookingP = ParserHtmlUtil.parserTag(foodInfoHtml, ParagraphTag.class, "style", "border:0;"); if(cookingP != null) dish.setCookType(cookingP.getStringText().trim()); List<Div> ingredientsDivList = ParserHtmlUtil.parserTags(foodInfoHtml, Div.class, "class", "part divide10"); for(Div div:ingredientsDivList){ HeadingTag hTag = ParserHtmlUtil.parserTag(div.getChildrenHTML(), HeadingTag.class); if(hTag == null) break; String hStr = hTag.getStringText(); String [] names = dish.getName().split(","); if("主料".equals(hStr)){ continue; }else if("原料".equals(hStr)){ continue; } else if("辅料".equals(hStr)){ continue; }else if("详细说明".equals(hStr)){ continue; }else if("调料".equals(hStr)){ continue; }else if("类别".equals(hStr)){ continue; }else if("口味".equals(hStr)){ continue; }else if("食用效果".equals(hStr)){ continue; }else if((names[0]+"做法").equals(hStr)){ continue; }else if((names[0]+"相关食物").equals(hStr)){ continue; } System.out.println(hStr); } return dish; } }
- 运行结果(数据老多,我就截个屏,基本上它上面的家常菜肴都给它抓下来了)
- 上面有些菜肴信息我是没有抓取的,如图
有兴趣的话可以将上述菜肴信息封装到Dish.java文件中,并在上图空白处实现抓取页面相应信息代码。
注:上述抓取代码用到递归(每翻一页递归一层),并且会把所有结果一次性放进内存,数据量很大时有可能栈溢出或内存溢出,有兴趣者可以再完善下。