java jsoup 网络爬虫 学习例子(六)京东和当当商品比价
package com.iteye.injavawetrust.jdvsdd;
import java.util.List;
/**
 * A source of scraped products for a single online store.
 *
 * @author InJavaWeTrust
 */
public interface ProductList {

    /**
     * Crawls the store's product listing.
     *
     * @return the products that were scraped
     */
    List<ProductInfo> getProductList();
}
package com.iteye.injavawetrust.jdvsdd;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes JD search result pages for a product keyword and collects the
 * name and price of every listed item.
 *
 * @author InJavaWeTrust
 */
public class JDProductList implements ProductList {

    /** Number of result pages requested per crawl. */
    private static final int PAGE_COUNT = 10;

    /** Timeout handed to jsoup for each page request, in milliseconds. */
    private static final int TIMEOUT_MS = 5000;

    private static final PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /** URL used for the first result page. */
    private final String jdUrl;

    /** Search keyword, used to build the URLs of pages 2 and up. */
    private final String productName;

    /**
     * @param jdUrl       URL of the first result page
     * @param productName search keyword used to build the following page URLs
     */
    public JDProductList(String jdUrl, String productName) {
        this.jdUrl = jdUrl;
        this.productName = productName;
    }

    /**
     * Fetches {@value #PAGE_COUNT} result pages and extracts one
     * {@link ProductInfo} (name and price) per listed item.
     * <p>
     * A failure while fetching or parsing one page is reported on standard
     * output and the crawl continues with the next page; items collected so
     * far are kept.
     *
     * @return the scraped products, possibly empty, never {@code null}
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();
        String url = "";
        for (int page = 1; page <= PAGE_COUNT; page++) {
            try {
                System.out.println("JD Product 第[" + page + "]页");
                url = pageUrl(page);
                System.out.println(url);
                Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
                for (Element ul : document.select("ul[class=gl-warp clearfix]")) {
                    for (Element li : ul.select("li[data-sku]")) {
                        Element wrap = li.select("div[class=gl-i-wrap]").first();
                        if (wrap == null) {
                            // first() returns null when nothing matched. Skip only this
                            // item instead of letting a NullPointerException abort the
                            // rest of the page.
                            continue;
                        }
                        // Item name comes from the link's title attribute.
                        String itemName = wrap.select("div[class=p-name p-name-type-2]>a").attr("title");
                        // Item price comes from the data-price attribute.
                        String itemPrice = wrap.select(".p-price>strong").attr("data-price");

                        ProductInfo productInfo = new ProductInfo();
                        productInfo.setProductName(itemName);
                        productInfo.setProductPrice(itemPrice);
                        jdProductList.add(productInfo);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failing page and move on to the next one.
                System.out.println("Get JD product has error [" + url + "]");
                System.out.println(e.getMessage());
            }
        }
        return jdProductList;
    }

    /** Builds the URL of the given 1-based result page; page 1 is the URL passed to the constructor. */
    private String pageUrl(int page) {
        if (page == 1) {
            return jdUrl;
        }
        return Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + page;
    }

    /** Demo entry point: crawls a fixed keyword and prints every name/price pair. */
    public static void main(String[] args) {
        try {
            String productName = "书包";
            String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
            List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();
            System.out.println(list.size());
            for (ProductInfo pi : list) {
                System.out.println(pi.getProductName() + " " + pi.getProductPrice());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package com.iteye.injavawetrust.jdvsdd;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes Dangdang search result pages for a product keyword and collects
 * the name and price of every listed item.
 *
 * @author InJavaWeTrust
 */
public class DDProductList implements ProductList {

    private String ddUrl;
    private String productName;

    /**
     * @param ddUrl       URL of the first result page
     * @param productName search keyword used to build the following page URLs
     */
    public DDProductList(String ddUrl, String productName) {
        this.ddUrl = ddUrl;
        this.productName = productName;
    }

    /**
     * Fetches ten result pages and turns every {@code li} of the result list
     * into a {@link ProductInfo} carrying name and price.
     * <p>
     * A failure on one page is reported on standard output and the crawl
     * goes on with the next page.
     *
     * @return the scraped products, possibly empty
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> results = new ArrayList<ProductInfo>();
        String pageUrl = "";
        for (int pageNo = 1; pageNo <= 10; pageNo++) {
            try {
                System.out.println("DD Product 第[" + pageNo + "]页");
                // Page 1 uses the URL given to the constructor; later pages are rebuilt from the keyword.
                pageUrl = (pageNo == 1)
                        ? ddUrl
                        : Constants.DDURL + productName + Constants.ACT + Constants.DDPAGE + pageNo;
                System.out.println(pageUrl);

                Document page = Jsoup.connect(pageUrl).timeout(5000).get();
                for (Element listing : page.select("ul[class=bigimg cloth_shoplist]")) {
                    for (Element item : listing.select("li")) {
                        // Price text with the yen/yuan sign stripped.
                        String price = item.select("p[class=price]>span").html().replace("¥", "");
                        String name = item.select("p[class=name]>a").attr("title");

                        ProductInfo info = new ProductInfo();
                        info.setProductName(name);
                        info.setProductPrice(price);
                        results.add(info);
                    }
                }
            } catch (Exception e) {
                System.out.println("Get DD product has error [" + pageUrl + "]");
                System.out.println(e.getMessage());
            }
        }
        return results;
    }

    /** Demo entry point: crawls a fixed keyword and prints every name/price pair. */
    public static void main(String[] args) {
        String keyword = "学生铅笔";
        List<ProductInfo> found = new DDProductList(Constants.DDURL + keyword, keyword).getProductList();
        System.out.println(found.size());
        for (ProductInfo each : found) {
            System.out.println(each.getProductName() + " " + each.getProductPrice());
        }
    }
}
package com.iteye.injavawetrust.jdvsdd;
import java.io.Serializable;
import java.util.Date;
/**
 * Plain data holder for one scraped product.
 * <p>
 * All properties are optional and default to {@code null}. The crawl date
 * is defensively copied on the way in and out, because
 * {@link java.util.Date} is mutable and would otherwise let callers change
 * this object's state behind its back.
 *
 * @author InJavaWeTrust
 */
public class ProductInfo implements Serializable {

    private static final long serialVersionUID = 8179244535272774089L;

    /** Product ID. */
    private String productid;

    /** Product name. */
    private String productName;

    /** Product price, kept as text. */
    private String productPrice;

    /** Number of sales transactions per month. */
    private String tradeNum;

    /** Product URL. */
    private String productUrl;

    /** Name of the shop offering the product. */
    private String shopName;

    /** Name of the e-commerce platform. */
    private String ecName;

    /** Date the product was crawled and stored. */
    private Date date;

    public String getProductid() {
        return productid;
    }

    public void setProductid(String productid) {
        this.productid = productid;
    }

    public String getProductName() {
        return productName;
    }

    public void setProductName(String productName) {
        this.productName = productName;
    }

    public String getProductPrice() {
        return productPrice;
    }

    public void setProductPrice(String productPrice) {
        this.productPrice = productPrice;
    }

    public String getTradeNum() {
        return tradeNum;
    }

    public void setTradeNum(String tradeNum) {
        this.tradeNum = tradeNum;
    }

    public String getProductUrl() {
        return productUrl;
    }

    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    public String getShopName() {
        return shopName;
    }

    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    public String getEcName() {
        return ecName;
    }

    public void setEcName(String ecName) {
        this.ecName = ecName;
    }

    /**
     * @return a copy of the crawl date, or {@code null} if none was set
     */
    public Date getDate() {
        return date == null ? null : new Date(date.getTime());
    }

    /**
     * @param date the crawl date; copied, so later changes to the argument
     *             do not affect this object. May be {@code null}.
     */
    public void setDate(Date date) {
        this.date = date == null ? null : new Date(date.getTime());
    }
}
package com.iteye.injavawetrust.jdvsdd;
/**
 * URL fragments used to build JD and Dangdang search requests.
 * <p>
 * All values are compile-time constants; the class cannot be instantiated.
 *
 * @author InJavaWeTrust
 */
public final class Constants {

    /** JD search URL prefix; the keyword is appended directly after it. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** JD query parameter declaring the keyword encoding. */
    public static final String JDENC = "&enc=utf-8";

    /** JD paging parameter; the page number is appended after it. */
    public static final String JDPAGE = "&page=";

    /** Dangdang search URL prefix; the keyword is appended directly after it. */
    public static final String DDURL = "http://search.dangdang.com/?key=";

    /** Dangdang {@code act} query parameter. */
    public static final String ACT = "&act=input";

    /** Dangdang paging parameter; the page number is appended after it. */
    public static final String DDPAGE = "&page_index=";

    private Constants() {
        // constants holder, not meant to be instantiated
    }
}
package com.iteye.injavawetrust.jdvsdd;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;
/**
 * Singleton helper with string-encoding, string-similarity and time
 * formatting utilities used by the price comparison.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckUtil {

    private static final PriceCheckUtil instance = new PriceCheckUtil();

    private PriceCheckUtil() {
    }

    /**
     * @return the single shared instance
     */
    public static PriceCheckUtil getInstance() {
        return instance;
    }

    /**
     * Encodes the product name as UTF-8 bytes and decodes those bytes as GBK.
     * <p>
     * NOTE(review): for non-ASCII input the returned text generally differs
     * from the input, because UTF-8 bytes are being read with a different
     * charset. Confirm that this is what the search endpoint expects.
     *
     * @param productName product name, must not be {@code null}
     * @return the re-decoded string, or an empty string if one of the
     *         charsets is not supported by the running JVM
     */
    public String getGbk(String productName) {
        String retGbk = "";
        try {
            retGbk = new String(productName.getBytes("UTF-8"), "GBK");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retGbk;
    }

    /**
     * Finds the entry of {@code list} whose product name is most similar to
     * {@code productName}, as measured by {@link #sim(String, String)}.
     * <p>
     * Every entry, including the last one, is scored. When several entries
     * share the highest score, the one latest in the list wins.
     *
     * @param productName name to compare against, must not be {@code null}
     * @param list        candidates; their product names must not be {@code null}
     * @return the most similar candidate, or {@code null} if {@code list} is
     *         {@code null} or empty
     */
    public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
        if (list == null || list.isEmpty()) {
            return null;
        }
        int bestIndex = 0;
        double bestScore = 0.0;
        for (int i = 0; i < list.size(); i++) {
            double score = sim(productName, list.get(i).getProductName());
            // ">=" so that, on ties, the later entry is kept.
            if (score >= bestScore) {
                bestScore = score;
                bestIndex = i;
            }
        }
        return list.get(bestIndex);
    }

    /**
     * Returns the smallest of three numbers.
     *
     * @param one   first value
     * @param two   second value
     * @param three third value
     * @return the minimum of the three
     */
    public int min(int one, int two, int three) {
        return Math.min(one, Math.min(two, three));
    }

    /**
     * Computes the Levenshtein distance (LD) between two strings: the
     * minimum number of single-character insertions, deletions and
     * substitutions needed to turn one into the other.
     *
     * @param str1 first string, must not be {@code null}
     * @param str2 second string, must not be {@code null}
     * @return the edit distance; the length of the other string if one is empty
     */
    public int ld(String str1, String str2) {
        int n = str1.length();
        int m = str2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // d[i][j] = distance between the first i chars of str1 and the first j chars of str2
        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) { // first column: delete i characters
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) { // first row: insert j characters
            d[0][j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char ch1 = str1.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                char ch2 = str2.charAt(j - 1);
                // substitution costs nothing when the characters already match
                int cost = (ch1 == ch2) ? 0 : 1;
                // cell above + 1, cell to the left + 1, diagonal + cost: take the cheapest
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
            }
        }
        return d[n][m];
    }

    /**
     * Computes a similarity score as
     * {@code 1 - ld(str1, str2) / max(length(str1), length(str2))}.
     *
     * @param str1 first string, must not be {@code null}
     * @param str2 second string, must not be {@code null}
     * @return a value between 0.0 (nothing in common) and 1.0 (identical);
     *         two empty strings are identical and yield 1.0
     */
    public double sim(String str1, String str2) {
        int longest = Math.max(str1.length(), str2.length());
        if (longest == 0) {
            // both empty: avoid 0/0, which would produce NaN
            return 1.0;
        }
        return 1 - (double) ld(str1, str2) / longest;
    }

    /**
     * Formats a millisecond count as {@code HH:mm:ss}.
     * <p>
     * The value is formatted as a time of day in GMT, so the hour field is
     * in the range 00-23 and durations of 24 hours or more wrap around.
     *
     * @param ms milliseconds
     * @return the formatted text, e.g. {@code 00:00:10} for 10000
     */
    public String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
        String ss = formatter.format(ms);
        return ss;
    }
}
package com.iteye.injavawetrust.jdvsdd;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
/**
 * Command-line entry point: reads a product keyword, crawls JD and
 * Dangdang for it and prints, for every JD item, the Dangdang item with
 * the most similar name next to it.
 *
 * @author InJavaWeTrust
 */
public class PriceCheckMain {

    private static final PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * Builds the first-page search URLs for both stores and runs the comparison.
     *
     * @param productName search keyword
     * @return one map per JD item, see {@link #getProductFromUrls(String, String, String)}
     */
    public List<Map<String, ProductInfo>> getProductList(String productName) {
        String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;
        String ddUrl = Constants.DDURL + productName;
        return getProductFromUrls(jdUrl, ddUrl, productName);
    }

    /**
     * Crawls both stores and pairs every JD item with the Dangdang item
     * whose name is most similar.
     *
     * @param jdUrl       first-page JD search URL
     * @param ddUrl       first-page Dangdang search URL
     * @param productName search keyword
     * @return one map per JD item; key {@code "JD"} holds the JD item and
     *         key {@code "DD"} the best Dangdang match, or {@code null}
     *         when the Dangdang crawl returned no items
     */
    public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String ddUrl, String productName) {
        List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String, ProductInfo>>();
        List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
        List<ProductInfo> ddProductList = new DDProductList(ddUrl, productName).getProductList();
        for (ProductInfo jdProduct : jdProductList) {
            Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
            map.put("JD", jdProduct);
            // With nothing to compare against there is no match. Do not ask for one,
            // so that a failed Dangdang crawl does not abort the whole run.
            ProductInfo ddProduct = ddProductList.isEmpty()
                    ? null
                    : pcu.getSimilarity(jdProduct.getProductName(), ddProductList);
            map.put("DD", ddProduct);
            retListMap.add(map);
        }
        return retListMap;
    }

    /**
     * Reads one keyword token from standard input, runs the comparison and
     * prints each JD/Dangdang pair followed by the elapsed time.
     */
    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner scanner = new Scanner(System.in);
        String productName = scanner.next();
        scanner.close();
        System.out.println("京东和当当[" + productName + "]商品比价开始。。。。。。");
        try {
            long startTime = System.currentTimeMillis();
            List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> map : list) {
                ProductInfo jd = map.get("JD");
                ProductInfo dd = map.get("DD");
                String jdName = jd.getProductName();
                String jdPrice = jd.getProductPrice();
                // No Dangdang match available: print a placeholder instead of failing.
                String ddName = (dd == null) ? "N/A" : dd.getProductName();
                String ddPrice = (dd == null) ? "N/A" : dd.getProductPrice();
                System.out.println("[" + jdName + "] [" + ddName + "]");
                System.out.println("[" + jdPrice + "] [" + ddPrice + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("用时 [" + pcu.msToss(endTime - startTime) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }
}
运行结果:
输入商品名称:
铅笔
京东和当当[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2
..............................
DD Product 第[1]页
http://search.dangdang.com/?key=铅笔
DD Product 第[2]页
http://search.dangdang.com/?key=铅笔&act=input&page_index=2
DD Product 第[3]页
...................................
[得力(deli) S908 木世界系列六角笔杆原木HB铅笔/素描绘图学生铅笔 50支/桶] [ 【开学必备文具】正品 得力文具S907/s908原木HB/2B铅笔绘图素描儿童学生铅笔50支装 ]
[18.00] [16.80]
-----------------------------------------------------------
[得力(deli) 7084 安全考试专用填涂答题卡2B木质铅笔/学生铅笔 12支/盒] [ 得力文具(deli) 0641 削笔器 削笔刀 卷笔刀 削笔机学习用品手摇转笔刀笔刨文具 ]
[10.00] [13.50]
-----------------------------------------------------------
[得力(deli)7083 安全石墨铅芯素描 绘图HB铅笔/学生铅笔 12支/盒] [ 六一儿童节礼物!三菱 绘图HB/2B铅笔12支装 ]
[9.00] [35.00]
-----------------------------------------------------------
[辉柏嘉(Faber-castell)114468 水溶性彩色铅笔 水溶彩铅 48色套装(赠毛笔+笔刨)] [ 德国Faber-castell辉柏嘉三角杆学生铅笔 儿童铅笔 HB 2H 2B 12支 ]
[109.00] [12.00]
................................................
[中华6725桶装彩色铅笔36色24色18色12色原木三角杆彩色铅笔 美术绘画涂鸦涂色彩铅 12色] [ 晨光彩色铅笔36色 24色绘图涂鸦 桶装绘画彩铅彩笔 儿童绘画 ]
[8.80] [12.00]
-----------------------------------------------------------
用时 [00:00:10]