java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS
/*
* filename getHtml.js
* phantomjs.exe 2.0.0
* author InJavaWeTrust
*/
/*
 * Loads the URL given on the command line in a PhantomJS web page and prints
 * the resulting page content (page.content) to stdout.
 *
 * Usage: phantomjs getHtml.js <url>
 */
var system = require('system');

// system.args[0] is the script name, so a length of 2 means exactly one
// user-supplied argument: the URL to load.
if (system.args.length != 2) {
    console.log('Try to pass two args when invoking this script!');
    phantom.exit();
} else {
    // All page work lives in this branch: phantom.exit() only requests
    // termination, so code placed after the argument check could still run
    // and try to open an empty URL.
    var address = system.args[1];
    var page = require('webpage').create();

    // Whoever reads this script's stdout must decode it as GBK.
    phantom.outputEncoding = 'GBK';

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Failed to get the page!');
        } else {
            console.log(page.content);
        }
        phantom.exit();
    });
}
package com.iteye.injavawetrust.phantomjs;
import java.util.List;
/**
 * A source of crawled product listings.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

    /**
     * Crawls the product list.
     *
     * @return the products that were crawled
     */
    List<ProductInfo> getProductList();
}
package com.iteye.injavawetrust.phantomjs;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls Taobao search result pages for a product keyword. Every page is
 * rendered through PhantomJS (via {@link PriceCheckUtil#getHtmlByPhantomjs})
 * and then parsed with jsoup.
 *
 * @author InJavaWeTrust
 */
public class TBProductList implements ProductList {

    /** Number of result pages requested per crawl. */
    private static final int PAGE_COUNT = 10;

    /** Step of the {@code s} offset parameter between consecutive pages. */
    private static final int OFFSET_STEP = 44;

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /** URL used for the first result page. */
    private String tbUrl;

    /** Keyword used to build the URLs of the following pages. */
    private String productName;

    /**
     * @param tbUrl       URL of the first result page
     * @param productName search keyword, URL-encoded here for pages 2 and later
     */
    public TBProductList(String tbUrl, String productName) {
        this.tbUrl = tbUrl;
        this.productName = productName;
    }

    /**
     * Fetches the result pages one after another and collects name and price
     * of every item found. A failure on one page is reported on stdout and
     * does not stop the remaining pages from being fetched.
     *
     * @return the collected products, possibly empty
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> collected = new ArrayList<ProductInfo>();
        for (int pageIndex = 0; pageIndex < PAGE_COUNT; pageIndex++) {
            try {
                System.out.println("TB Product 第[" + (pageIndex + 1) + "]页");
                String pageUrl;
                if (pageIndex == 0) {
                    pageUrl = tbUrl;
                } else {
                    int offset = pageIndex * OFFSET_STEP;
                    pageUrl = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + offset;
                }
                System.out.println(pageUrl);

                Document doc = Jsoup.parse(pcu.getHtmlByPhantomjs(pageUrl));
                for (Element listContainer : doc.select("div[class=m-itemlist]")) {
                    for (Element auction : listContainer.select("div[data-category=auctions]")) {
                        String priceText =
                                auction.select("div[class=price g_price g_price-highlight]>strong").text();
                        String titleText = auction.select("div[class=row row-2 title]>a").text();

                        ProductInfo info = new ProductInfo();
                        info.setProductName(titleText);
                        info.setProductPrice(priceText);
                        collected.add(info);
                    }
                }
            } catch (Exception e) {
                System.out.println("Get TB product has error");
                System.out.println(e.getMessage());
            }
        }
        return collected;
    }

    /** Demo entry point: crawls a fixed keyword and prints every name/price pair. */
    public static void main(String[] args) {
        try {
            String keyword = "铅笔";
            String firstPageUrl = Constants.TBURL + pcu.getUrlCode(keyword);
            List<ProductInfo> products = new TBProductList(firstPageUrl, keyword).getProductList();
            for (ProductInfo product : products) {
                System.out.println("[" + product.getProductName() + "] [" + product.getProductPrice() + "]");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package com.iteye.injavawetrust.phantomjs;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls JD search result pages for a product keyword using jsoup's HTTP
 * client directly.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList {

    private String jdUrl;
    private String productName;
    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * @param jdUrl       URL of the first result page
     * @param productName search keyword used to build the URLs of pages 2 and later
     */
    public JDProductList(String jdUrl, String productName) {
        this.jdUrl = jdUrl;
        this.productName = productName;
    }

    /**
     * Fetches ten result pages and collects name and price of every listed
     * item. A failure on one page is reported on stdout and does not stop the
     * remaining pages from being fetched.
     *
     * @return the collected products, possibly empty
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
        String url = "";
        for (int i = 0; i < 10; i++) {
            try {
                System.out.println("JD Product 第[" + (i + 1) + "]页");
                if (i == 0) {
                    url = jdUrl;
                } else {
                    url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC
                            + Constants.JDPAGE + (i + 1);
                }
                System.out.println(url);
                Document document = Jsoup.connect(url).timeout(5000).get();
                for (Element ul : document.select("ul[class=gl-warp clearfix]")) {
                    for (Element li : ul.select("li[data-sku]")) {
                        Element div = li.select("div[class=gl-i-wrap]").first();
                        if (div == null) {
                            // first() yields null when the wrapper is missing; skip just
                            // this item instead of letting a NullPointerException discard
                            // the rest of the page.
                            continue;
                        }
                        // Product name comes from the link's title attribute.
                        String name = div.select("div[class=p-name p-name-type-2]>a").attr("title");
                        // Product price comes from the data-price attribute.
                        String price = div.select(".p-price>strong").attr("data-price");

                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(name);
                        productInfo.setProductPrice(price);
                        jdProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                System.out.println("Get JD product has error [" + url + "]");
                System.out.println(e.getMessage());
            }
        }
        return jdProductList;
    }

    /** Demo entry point: crawls a fixed keyword and prints the result count and items. */
    public static void main(String[] args) {
        try {
            String productName = "书包";
            String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
            List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
            System.out.println(list.size());
            for (ProductInfo pi : list) {
                System.out.println(pi.getProductName() + " " + pi.getProductPrice());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package com.iteye.injavawetrust.phantomjs;
/**
 * Shared configuration values for the JD/Taobao price comparison crawler.
 * All fields are {@code final} so they cannot be reassigned at runtime.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** JD search URL prefix; the keyword is appended directly after it. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** JD query parameter declaring the encoding of the keyword. */
    public static final String JDENC = "&enc=utf-8";

    /** JD paging parameter; the page number is appended after it. */
    public static final String JDPAGE = "&page=";

    /** Taobao search URL prefix; the keyword is appended directly after it. */
    public static final String TBURL = "https://s.taobao.com/search?q=";

    /** Taobao paging parameter; the item offset is appended after it. */
    public static final String TBPAGE = "&s=";

    /** Timeout value handed to the htmlunit web client. */
    public static final int TIMEOUT = 50000;

    /**
     * Path of the PhantomJS script that prints a page's HTML. The trailing
     * space is part of the value because callers may concatenate it into a
     * command line.
     */
    public static final String SCRIPT = "E:\\InJavaWeTrust\\js\\getHtml.js ";

    /**
     * Path of the phantomjs executable. The trailing space is part of the
     * value because callers may concatenate it into a command line.
     */
    public static final String PHANTOMJSPATH = "D:\\Program Files\\phantomjs\\bin\\phantomjs.exe ";
}
package com.iteye.injavawetrust.phantomjs;
import java.io.Serializable;
import java.util.Date;
/**
 * Plain serializable data holder describing one crawled product. Every
 * property is optional and stays {@code null} until its setter is called.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable {

    private static final long serialVersionUID = 8179244535272774089L;

    /** Product ID. */
    private String productid;

    public String getProductid() {
        return this.productid;
    }

    public void setProductid(String productid) {
        this.productid = productid;
    }

    /** Product name. */
    private String productName;

    public String getProductName() {
        return this.productName;
    }

    public void setProductName(String productName) {
        this.productName = productName;
    }

    /** Product price, kept as the text that was scraped. */
    private String productPrice;

    public String getProductPrice() {
        return this.productPrice;
    }

    public void setProductPrice(String productPrice) {
        this.productPrice = productPrice;
    }

    /** Number of sales per month. */
    private String tradeNum;

    public String getTradeNum() {
        return this.tradeNum;
    }

    public void setTradeNum(String tradeNum) {
        this.tradeNum = tradeNum;
    }

    /** URL of the product page. */
    private String productUrl;

    public String getProductUrl() {
        return this.productUrl;
    }

    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    /** Name of the shop offering the product. */
    private String shopName;

    public String getShopName() {
        return this.shopName;
    }

    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    /** Name of the e-commerce site. */
    private String ecName;

    public String getEcName() {
        return this.ecName;
    }

    public void setEcName(String ecName) {
        this.ecName = ecName;
    }

    /** Date on which the product was crawled and stored. */
    private Date date;

    public Date getDate() {
        return this.date;
    }

    public void setDate(Date date) {
        this.date = date;
    }
}
package com.iteye.injavawetrust.phantomjs;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;
import org.apache.commons.logging.LogFactory;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
*
* @author InJavaWeTrust
*
*/
public class PriceCheckUtil {
private PriceCheckUtil() {
}
private static final PriceCheckUtil instance = new PriceCheckUtil();
public static PriceCheckUtil getInstance() {
return instance;
}
/**
* 商品汉字转码
* @param productName 商品名称
* @return
*/
public String getGbk(String productName){
String retGbk = "";
try {
retGbk = new String(productName.getBytes("UTF-8"), "GBK");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return retGbk;
}
/**
* 对淘宝浏览器汉字进行转换
* @param productName 商品名称
* @return
*/
public String getUrlCode(String productName){
String retUrlCode = "";
try {
retUrlCode = URLEncoder.encode(productName, "utf8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return retUrlCode;
}
/**
* 从列表list中找到与productName相似度最高的ProductInfo
*
* @param productName
* @param list
* @return 相似度最高的productName
*/
public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
ProductInfo productInfo = null;
/**
* 找到list中所有的productName与字符串productName的相似度,保存在lens数组中
*/
double lens[] = new double[list.size()];
for (int i = 0; i < list.size() - 1; i++) {
lens[i] = sim(productName, list.get(i).getProductName());
}
/**
* 遍历出最大的相似度maxLen
*/
double maxLen = 0.0;
for (int i = 0; i < lens.length; i++) {
if (maxLen < lens[i]) {
maxLen = lens[i];
}
}
/**
* 遍历出最大的相似度的索引maxLenIndex
*/
int maxLenIndex = 0;
for (int i = 0; i < lens.length; i++) {
if (maxLen == lens[i]) {
maxLenIndex = i;
}
}
productInfo = list.get(maxLenIndex);
return productInfo;
}
/**
* 求三个数中最小的一个
* @param one
* @param two
* @param three
* @return
*/
public int min(int one, int two, int three) {
int min = one;
if(two < min) {
min = two;
}
if(three < min) {
min = three;
}
return min;
}
/**
* 计算矢量距离
* Levenshtein Distance(LD)
* @param str1
* @param str2
* @return
*/
public int ld(String str1, String str2) {
int d[][]; //矩阵
int n = str1.length();
int m = str2.length();
int i; //遍历str1的
int j; //遍历str2的
char ch1; //str1的
char ch2; //str2的
int temp; //记录相同字符,在某个矩阵位置值的增量,不是0就是1
if(n == 0) {
return m;
}
if(m == 0) {
return n;
}
d = new int[n+1][m+1];
for(i=0; i<=n; i++) { //初始化第一列
d[i][0] = i;
}
for(j=0; j<=m; j++) { //初始化第一行
d[0][j] = j;
}
for(i=1; i<=n; i++) { //遍历str1
ch1 = str1.charAt(i-1);
//去匹配str2
for(j=1; j<=m; j++) {
ch2 = str2.charAt(j-1);
if(ch1 == ch2) {
temp = 0;
} else {
temp = 1;
}
//左边+1,上边+1, 左上角+temp取最小
d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp);
}
}
return d[n][m];
}
/**
* 计算相似度
* @param str1
* @param str2
* @return
*/
public double sim(String str1, String str2) {
int ld = ld(str1, str2);
return 1 - (double) ld / Math.max(str1.length(), str2.length());
}
/**
* 毫秒转换成hhmmss
* @param ms 毫秒
* @return hh:mm:ss
*/
public String msToss(long ms) {
SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
String ss = formatter.format(ms);
return ss;
}
/**
* 禁止htmlunit日志输出
*/
public void offLog(){
LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
"org.apache.commons.logging.impl.NoOpLog");
}
/**
* 获取淘宝数据
* @param url
* @return
* @throws Exception
*/
public String getXmlByHtmlunit(String url) throws Exception {
offLog();
String ret = "";
WebClient webClient = new WebClient(BrowserVersion.CHROME);
// 1 启动JS
webClient.getOptions().setJavaScriptEnabled(true);
// 2 禁用Css,可避免自动二次请求CSS进行渲染
webClient.getOptions().setCssEnabled(false);
// 3 启动客户端重定向
webClient.getOptions().setRedirectEnabled(true);
// 4 JS运行错误时,是否抛出异常
webClient.getOptions().setThrowExceptionOnScriptError(false);
// 5AJAX support
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
// 6 设置超时
webClient.getOptions().setTimeout(Constants.TIMEOUT);
WebRequest webRequest = new WebRequest(new URL(url));
webRequest.setHttpMethod(HttpMethod.GET);
HtmlPage page = webClient.getPage(webRequest);
webClient.waitForBackgroundJavaScript(10000);
ret = page.asXml();
webClient.close();
return ret;
}
/**
* 通过Phantomjs得到html页面
* @param url
* @return
*/
public String getHtmlByPhantomjs(String url) {
StringBuilder html = new StringBuilder();
try {
Runtime rt = Runtime.getRuntime();
Process p = rt.exec(Constants.PHANTOMJSPATH + Constants.SCRIPT + url);
InputStream is = p.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String tmp = "";
while ((tmp = br.readLine()) != null) {
html.append(tmp);
}
} catch (IOException e) {
e.printStackTrace();
}
return html.toString();
}
}
package com.iteye.injavawetrust.phantomjs;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
/**
 * Command-line entry point: crawls JD and Taobao for a keyword and, for each
 * JD product, prints the Taobao product with the most similar name next to it.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * Builds the first-page search URLs for both sites and runs the comparison.
     *
     * @param productName search keyword as typed by the user
     * @return one map per JD product with the keys {@code "JD"} and {@code "TB"}
     */
    public List<Map<String, ProductInfo>> getProductList(String productName) {
        // Percent-encode the keyword so reserved characters (&, #, spaces) and
        // non-ASCII text cannot corrupt the query string. The Taobao crawler
        // encodes its later pages the same way.
        String keyword = pcu.getUrlCode(productName);
        String jdUrl = Constants.JDURL + keyword + Constants.JDENC;
        String tbUrl = Constants.TBURL + keyword;
        return getProductFromUrls(jdUrl, tbUrl, productName);
    }

    /**
     * Crawls both sites and pairs every JD product with its closest Taobao match.
     *
     * @param jdUrl       first-page JD search URL
     * @param tbUrl       first-page Taobao search URL
     * @param productName raw search keyword, used by the crawlers for later pages
     * @return one map per JD product with the keys {@code "JD"} and {@code "TB"};
     *         empty when Taobao returned no products to compare against
     */
    public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) {
        List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String, ProductInfo>>();
        List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
        List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList();
        if (tbProductList.isEmpty()) {
            // Without candidates there is nothing to match; looking one up
            // would fail for every JD product.
            return retListMap;
        }
        for (ProductInfo jdProduct : jdProductList) {
            ProductInfo tbProduct = pcu.getSimilarity(jdProduct.getProductName(), tbProductList);
            if (tbProduct == null) {
                continue;
            }
            Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
            map.put("JD", jdProduct);
            map.put("TB", tbProduct);
            retListMap.add(map);
        }
        return retListMap;
    }

    /**
     * Reads a keyword from stdin, runs the comparison and prints each matched
     * pair followed by the elapsed time.
     */
    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner scanner = new Scanner(System.in);
        String productName = scanner.next();
        scanner.close();
        System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");
        try {
            long starTime = System.currentTimeMillis();
            List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> map : list) {
                ProductInfo jd = map.get("JD");
                ProductInfo tb = map.get("TB");
                System.out.println("[" + jd.getProductName() + "] [" + tb.getProductName() + "]");
                System.out.println("[" + jd.getProductPrice() + "] [" + tb.getProductPrice() + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("用时 [" + pcu.msToss(endTime - starTime) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }
}
运行结果:
输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
。。。。。。。。。。。。
TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44
。。。。。。。。。。。。。。。。。
[马可9002铅笔 马克三角铅 笔易握正姿木杆 安全无毒2H HB 2B HB HB] [马可9001铅笔 三角形杆橡皮头 学生写字铅笔 HB 2B 满28元包邮]
[12.00] [8.96]
-----------------------------------------------------------
用时 [00:01:35]