Java之网络爬虫WebCollector+selenium+phantomjs(三)

经过前面两篇《Java之网络爬虫WebCollector+selenium+phantomjs(一)》和《Java之网络爬虫WebCollector+selenium+phantomjs(二)》的学习后,我们来做一个小例子。我们所要做的是:爬取京东列表页面,在页面上抽取出商品信息(名称、价格、评价),然后打印出抽取到的商品信息。

贴出代码:

Goods.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler;

/**
 * Plain data holder for one product extracted from an e-commerce listing page.
 *
 * <p>All properties are nullable and freely mutable through the setters.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-21
 */
public class Goods {
	/** Identifier of the platform the product belongs to. */
	private String platform;
	/** Link to the product. */
	private String url;
	/** Display name of the product. */
	private String name;
	/** Listed price. */
	private Float price;
	/** Review/comment count. */
	private Integer commit;

	/** Creates an instance with every property unset ({@code null}). */
	public Goods(){
	}

	/** @return the platform identifier, possibly {@code null} */
	public String getPlatform() {
		return platform;
	}

	/** @param platform the platform identifier to store */
	public void setPlatform(String platform) {
		this.platform = platform;
	}

	/** @return the product link, possibly {@code null} */
	public String getUrl() {
		return url;
	}

	/** @param url the product link to store */
	public void setUrl(String url) {
		this.url = url;
	}

	/** @return the product name, possibly {@code null} */
	public String getName() {
		return name;
	}

	/** @param name the product name to store */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the price, possibly {@code null} */
	public Float getPrice() {
		return price;
	}

	/** @param price the price to store */
	public void setPrice(Float price) {
		this.price = price;
	}

	/** @return the review/comment count, possibly {@code null} */
	public Integer getCommit() {
		return commit;
	}

	/** @param commit the review/comment count to store */
	public void setCommit(Integer commit) {
		this.commit = commit;
	}

	/**
	 * Renders all properties as
	 * <code>{platform=..,url=..,name=..,price=..,commit=..}</code>;
	 * unset properties appear as {@code null}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("{");
		text.append("platform=").append(platform);
		text.append(",url=").append(url);
		text.append(",name=").append(name);
		text.append(",price=").append(price);
		text.append(",commit=").append(commit);
		return text.append("}").toString();
	}

}

上面类为封装的商品信息。

ECCrawler.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler;

import java.util.concurrent.atomic.AtomicInteger;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;

/**
 * Base crawler for e-commerce listing pages.
 *
 * <p>Before crawling starts, one seed URL per listing page is generated from a
 * format string; subclasses report how many listing pages exist and decide what
 * to do with each fetched page.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-20
 */
public abstract class ECCrawler extends DeepCrawler {

	/** Format string of the seed URL; the page number is substituted into it. */
	private String seedFormat;

	/** Rules applied when collecting follow-up links from an HTML document. */
	protected RegexRule regexRule;

	/* Auto-increment id meant for generating unique file names (not used in this class). */
	AtomicInteger id = new AtomicInteger(0);

	/**
	 * @param crawlPath  forwarded unchanged to the {@code DeepCrawler} constructor
	 * @param seedFormat format string used by {@link #getSeed(String, Object...)}
	 */
	public ECCrawler(String crawlPath,String seedFormat ){
		super(crawlPath);
		this.seedFormat = seedFormat;
		this.regexRule = new RegexRule();
	}

	/** @return the rule set used for link extraction */
	public RegexRule getRegexRule() {
		return regexRule;
	}

	/** @param regexRule the rule set to use for link extraction */
	public void setRegexRule(RegexRule regexRule) {
		this.regexRule = regexRule;
	}

	/** @param urlRegex a rule to append to the current rule set */
	public void addRegex(String urlRegex) {
		this.regexRule.addRule(urlRegex);
	}

	/**
	 * Collects the links of an HTML page that pass {@link #regexRule}, then hands
	 * the page and those links to {@link #visit(Page, Links)}.
	 *
	 * <p>Any exception thrown by {@code visit} is logged and does not propagate.
	 *
	 * @param page the fetched page
	 * @return the links gathered for this page (possibly modified by {@code visit})
	 */
	@Override
	public Links visitAndGetNextLinks(Page page) {
		Links nextLinks = new Links();

		String contentType = page.getResponse().getContentType();
		boolean isHtml = contentType != null && contentType.contains("text/html");
		if (isHtml && page.getDoc() != null) {
			nextLinks.addAllFromDocument(page.getDoc(), regexRule);
		}

		try {
			visit(page, nextLinks);
		} catch (Exception ex) {
			LOG.info("Exception", ex);
		}
		return nextLinks;
	}

	/**
	 * Registers the seed URLs and then starts the underlying crawler.
	 *
	 * @param depth forwarded unchanged to {@code DeepCrawler.start}
	 * @throws Exception if seeding or crawling fails
	 */
	@Override
	public void start(int depth) throws Exception {
		addSeed();
		super.start(depth);
	}

	/**
	 * Fetches listing page 1, asks {@link #getTotalPage(Page)} for the page
	 * count and adds one seed per page number from 1 up to that count.
	 *
	 * @throws Exception if the first page cannot be fetched
	 */
	private void addSeed() throws Exception{
		Page firstPage = getPage(getSeed(seedFormat, 1));
		int totalPage = getTotalPage(firstPage);
		int pageNo = 1;
		while (pageNo <= totalPage) {
			this.addSeed(getSeed(seedFormat, pageNo));
			pageNo++;
		}
	}

	/**
	 * Requests {@code url} and wraps the response in a {@link Page}.
	 *
	 * @param url address to fetch
	 * @return a page carrying the URL, the charset-detected HTML and the raw response
	 * @throws Exception if the request fails
	 */
	private Page getPage(String url) throws Exception {
		HttpResponse response = new HttpRequest(url).getResponse();
		String html = response.getHtmlByCharsetDetect();

		Page result = new Page();
		result.setUrl(url);
		result.setHtml(html);
		result.setResponse(response);
		return result;
	}

	/**
	 * Reports how many listing pages the product query spans.
	 *
	 * @param page the first listing page
	 * @return total number of listing pages
	 */
	public abstract int getTotalPage(Page page);

	/**
	 * Builds a seed URL by applying {@link String#format(String, Object...)}.
	 *
	 * @param seedFormat format string of the URL
	 * @param page       format arguments (the page number in this class)
	 * @return the formatted URL
	 */
	public String getSeed(String seedFormat,Object ... page){
		return String.format(seedFormat, page);
	}

	/**
	 * Callback invoked for every fetched page.
	 *
	 * @param page  the fetched page
	 * @param links links collected from the page; returned to the crawler afterwards
	 */
	public abstract void visit(Page page, Links links);
}

上面抽象类继承DeepCrawler,为爬取电商列表页的基类,爬取列表页html(包括js动态生成的html),并且可以抽取到列表页数,允许捕获所有页商品信息。

GoodsList.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler;

import java.util.ArrayList;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * Container of {@link Goods} built on {@link ArrayList}; concrete subclasses
 * know how to fill it from a crawled page.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public abstract class GoodsList extends ArrayList<Goods> {

	/**
	 * Adds the products found on the given page to this list.
	 *
	 * @param page the crawled page to extract products from
	 */
	public abstract void addGoods(Page page);

	/** Serialization version identifier. */
	private static final long serialVersionUID = -6935403464055289581L;
}


上面抽象类为存储商品信息的容器,继承自ArrayList,并且添加addGoods方法,用来添加商品信息到容器中。

JDCrawler.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.jd;

import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;

import com.zhao.crawler.ECCrawler;
import com.zhao.crawler.Goods;

/**
 * Crawler for JD listing pages; extracted products are accumulated in a
 * {@link JDGoodsList}.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-20
 */
public class JDCrawler extends ECCrawler {

	/** Products collected from every visited page. */
	private JDGoodsList goodsList;

	/**
	 * @param crawlPath  forwarded to {@link ECCrawler}
	 * @param seekFormat seed URL format string, forwarded to {@link ECCrawler}
	 */
	public JDCrawler(String crawlPath, String seekFormat) {
		super(crawlPath, seekFormat);
		this.goodsList = new JDGoodsList();
	}

	/**
	 * Always reports a single listing page.
	 *
	 * <p>An earlier draft read the count from the pager element
	 * ({@code div#J_bottomPage span.p-skip > em b}); that lookup is disabled and
	 * the value is fixed to 1.
	 */
	@Override
	public int getTotalPage(Page page) {
		return 1;
	}

	/** Prints the page URL and link count, then extracts the page's products. */
	@Override
	public void visit(Page page, Links links) {
		String summary = "url:" + page.getUrl() + "\tlinks size:" + links.size();
		System.out.println(summary);
		goodsList.addGoods(page);
	}

	/** Demo entry point: crawls one JD category listing and prints the products. */
	public static void main(String[] args) throws Exception {
		String crawlPath = "D:/test/crawler/jd/";
		String seedFormat = "http://list.jd.com/list.html?cat=1319,1523,7052&page=%s&go=0&JL=6_0_0";

		JDCrawler crawler = new JDCrawler(crawlPath, seedFormat);
		crawler.setThreads(100); // number of crawl threads
		crawler.start(1);        // crawl depth

		crawler.print();
	}

	/** Writes every collected product to standard output, one per line. */
	protected void print(){
		for (int i = 0; i < goodsList.size(); i++) {
			System.out.println(goodsList.get(i));
		}
	}
}

继承ECCrawler,实现京东平台专属爬取类。获取页码数利用浏览器审查元素,定位到页面信息即可,为了方便测试,这里只返回1。启动时我们直接爬取种子页面,所以设置为1即可,具体的抽取商品信息交给了下面JDGoodsList来处理。抓取结束后,执行一遍打印函数,打印出商品信息。

JDGoodsList.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.jd;

import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;

import cn.edu.hfut.dmic.webcollector.model.Page;

import com.zhao.crawler.Goods;
import com.zhao.crawler.GoodsList;
import com.zhao.crawler.util.PageUtils;
import com.zhao.crawler.util.Platform;
import com.zhao.crawler.util.Tools;

/**
 * {@link GoodsList} that extracts products from a JD listing page rendered
 * through a {@link WebDriver}.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public class JDGoodsList extends GoodsList {

	/** Serialization version identifier. */
	private static final long serialVersionUID = -7487110223660262262L;

	/** Stored when the price text is missing or not a number. */
	private static final float UNKNOWN_PRICE = -1f;

	/** Stored when the comment-count text is missing or not a number. */
	private static final int UNKNOWN_COMMIT = -1;

	/**
	 * Opens the page in a browser, converts every {@code li.gl-item} element
	 * into a {@link Goods} and appends it to this list.
	 *
	 * <p>An item that cannot be parsed (for example a missing sub-element) is
	 * reported and skipped, so it no longer discards the remaining items of the
	 * page. Failures while opening the page are reported and leave the list
	 * unchanged. The browser is always closed before returning.
	 *
	 * @param page the listing page to extract products from
	 */
	@Override
	public void addGoods(Page page) {
		WebDriver driver = null;
		try {
			driver = PageUtils.getWebDriver(page);
			List<WebElement> eles = driver.findElements(By.cssSelector("li.gl-item"));
			if (eles.isEmpty()) {
				System.out.println("else is empty");
				return;
			}
			for (WebElement ele : eles) {
				try {
					Goods g = parseGoods(ele);
					// ArrayList is not thread-safe and the crawler is configured
					// with many threads, so appends are serialized. Presumably
					// several of those threads can reach this method at once.
					synchronized (this) {
						add(g);
					}
				} catch (RuntimeException e) {
					// A single malformed item must not abort the rest of the page.
					e.printStackTrace();
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (driver != null) {
				driver.quit();
			}
		}
	}

	/**
	 * Builds a {@link Goods} from one listing item element.
	 *
	 * @param ele the {@code li.gl-item} element
	 * @return the populated product
	 * @throws RuntimeException if an expected sub-element is absent
	 */
	private Goods parseGoods(WebElement ele) {
		Goods g = new Goods();
		g.setPlatform(Platform.JD);

		// price
		String priceStr = ele.findElement(By.className("p-price"))
				.findElement(By.className("J_price"))
				.findElement(By.tagName("i"))
				.getText();
		g.setPrice(parsePrice(priceStr));

		// name and link live under the same container; look it up once
		WebElement nameBox = ele.findElement(By.className("p-name"));
		g.setName(nameBox.findElement(By.tagName("em")).getText());
		g.setUrl(nameBox.findElement(By.tagName("a")).getAttribute("href"));

		// comment count
		String commitStr = ele.findElement(By.className("p-commit"))
				.findElement(By.tagName("a"))
				.getText();
		g.setCommit(parseCommit(commitStr));

		return g;
	}

	/**
	 * Converts price text to a number.
	 *
	 * @param text raw text of the price element
	 * @return the parsed price, or {@code -1} when the text is empty or not numeric
	 */
	private static float parsePrice(String text) {
		if (!Tools.notEmpty(text)) {
			return UNKNOWN_PRICE;
		}
		try {
			return Float.parseFloat(text.trim());
		} catch (NumberFormatException e) {
			return UNKNOWN_PRICE;
		}
	}

	/**
	 * Converts comment-count text to a number.
	 *
	 * @param text raw text of the comment-count element
	 * @return the parsed count, or {@code -1} when the text is empty or not an integer
	 */
	private static int parseCommit(String text) {
		if (!Tools.notEmpty(text)) {
			return UNKNOWN_COMMIT;
		}
		try {
			// Integer.parseInt rejects surrounding whitespace, so trim first.
			return Integer.parseInt(text.trim());
		} catch (NumberFormatException e) {
			return UNKNOWN_COMMIT;
		}
	}
}


PageUtils.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * Factory helpers that open a crawled {@link Page} in a browser engine
 * (HtmlUnit or PhantomJS).
 *
 * <p>Every driver returned by this class owns browser resources; the caller
 * must call {@code quit()} on it when done.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-22
 */
public class PageUtils {

	/** System property naming the PhantomJS executable. */
	private static final String PHANTOMJS_PATH_PROPERTY = "phantomjs.binary.path";

	/** Executable used when {@link #PHANTOMJS_PATH_PROPERTY} is not set. */
	private static final String DEFAULT_PHANTOMJS_BINARY =
			"D:/Program Files/phantomjs-2.0.0-windows/bin/phantomjs.exe";

	/** Script handed to the raw PhantomJS process by {@link #getPhantomJSDriver(Page)}. */
	private static final String PARSER_SCRIPT =
			"D:/MyEclipseWorkSpace/WebCollectorDemo/src/main/resources/parser.js";

	/**
	 * Opens the page in an {@link HtmlUnitDriver} with JavaScript enabled,
	 * emulating the driver's default browser.
	 *
	 * @param page page whose URL is loaded
	 * @return a driver positioned on the page
	 */
	public static HtmlUnitDriver getDriver(Page page) {
		HtmlUnitDriver driver = new HtmlUnitDriver();
		driver.setJavascriptEnabled(true);
		return load(driver, page);
	}

	/**
	 * Opens the page in an {@link HtmlUnitDriver} with JavaScript enabled.
	 *
	 * @param page           page whose URL is loaded
	 * @param browserVersion browser to emulate
	 * @return a driver positioned on the page
	 */
	public static HtmlUnitDriver getDriver(Page page,
			BrowserVersion browserVersion) {
		HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion);
		driver.setJavascriptEnabled(true);
		return load(driver, page);
	}

	/**
	 * Opens the page in a {@link PhantomJSDriver}, which can render HTML that
	 * is generated by JavaScript.
	 *
	 * <p>The executable path comes from the {@code phantomjs.binary.path}
	 * system property; the built-in default is installed only when the
	 * property has not been set by the caller.
	 *
	 * @param page page whose URL is loaded
	 * @return a driver positioned on the page
	 */
	public static WebDriver getWebDriver(Page page) {
		if (System.getProperty(PHANTOMJS_PATH_PROPERTY) == null) {
			System.setProperty(PHANTOMJS_PATH_PROPERTY, DEFAULT_PHANTOMJS_BINARY);
		}
		WebDriver driver = new PhantomJSDriver();
		return load(driver, page);
	}

	/**
	 * Runs the native PhantomJS executable (bypassing selenium) with the parser
	 * script and the page URL, and returns what the process writes to stdout.
	 *
	 * <p>Output lines are concatenated without line separators.
	 *
	 * @param page page whose URL is passed to the script
	 * @return the collected output, or {@code null} if the process could not be
	 *         started or read
	 */
	public static String getPhantomJSDriver(Page page) {
		Process process = null;
		BufferedReader br = null;
		try {
			// Arguments are passed separately: a single command string would be
			// split on the space inside "Program Files", and the original
			// concatenation also glued the executable and script paths together.
			process = new ProcessBuilder(
					System.getProperty(PHANTOMJS_PATH_PROPERTY, DEFAULT_PHANTOMJS_BINARY),
					PARSER_SCRIPT,
					page.getUrl().trim()).start();
			br = new BufferedReader(new InputStreamReader(
					process.getInputStream(), "UTF-8"));
			StringBuilder output = new StringBuilder();
			String line;
			while ((line = br.readLine()) != null) {
				output.append(line);
			}
			return output.toString();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException ignored) {
					// nothing useful can be done if closing the pipe fails
				}
			}
			if (process != null) {
				process.destroy();
			}
		}

		return null;
	}

	/**
	 * Navigates the driver to the page URL; if navigation fails the driver is
	 * shut down before the failure is rethrown, so no browser is leaked.
	 */
	private static <T extends WebDriver> T load(T driver, Page page) {
		try {
			driver.get(page.getUrl());
			return driver;
		} catch (RuntimeException e) {
			try {
				driver.quit();
			} catch (RuntimeException ignored) {
				// keep the original navigation failure as the reported error
			}
			throw e;
		}
	}
}

获取WebDriver工具类,上篇有介绍。

Platform.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.util;

/**
 * Identifiers of the e-commerce platforms known to the crawler.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public interface Platform {
	/** JD (Jingdong). */
	String JD = "JD";
}

Tools.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.util;

import org.apache.commons.lang3.StringUtils;

/**
 * String helper predicates.
 *
 * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public class Tools {

	/**
	 * Tells whether a string carries no usable value: {@code null}, empty,
	 * whitespace only, or the literal text {@code "null"}.
	 *
	 * @param str the string to test
	 * @return {@code true} if the string is blank or equals {@code "null"}
	 */
	public static boolean isEmpty(String str){
		if (StringUtils.isBlank(str)) {
			return true;
		}
		return "null".equals(str);
	}

	/**
	 * Negation of {@link #isEmpty(String)}: the string is neither blank nor
	 * the literal text {@code "null"}.
	 *
	 * @param str the string to test
	 * @return {@code true} if the string holds a usable value
	 */
	public static boolean notEmpty(String str){
		return !(StringUtils.isBlank(str) || "null".equals(str));
	}

}
运行程序,控制台输出结果为:


ok,成功抽取商品信息。

至此,此次学习结束。源码下载地址(免费下载):WebCollectorDemo



  • 3
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值