使用jsoup抓取jd的数据
一、逐步分析
(1)获取所有三级分类
/**
 * Collects the links of all valid third-level categories from the JD "all categories" page.
 *
 * @param url link of the page that lists every category, e.g. https://www.jd.com/allSort.aspx
 * @return every category link whose href starts with {@code //list.jd.com/list.html?cat},
 *         each prefixed with {@code http:}
 * @throws IOException if the page cannot be fetched
 */
public List<String> getLevel3(String url) throws IOException {
    // Descendant selector: tag names separated by spaces; matches every <a> such as
    // <a href="//e.jd.com/ebook.html" target="_blank">电子书</a>
    Elements anchors = Jsoup.connect(url).get().select("div dl dd a");
    List<String> level3List = new ArrayList<String>();
    for (int index = 0; index < anchors.size(); index++) {
        Element anchor = anchors.get(index);
        String href = anchor.attr("href");
        // Debug line: running index, link text, then the raw href.
        logger.debug(index + anchor.text() + "==" + href);
        // Keep only real category links such as //list.jd.com/list.html?cat=9987,653,655
        if (!href.startsWith("//list.jd.com/list.html?cat")) {
            continue;
        }
        level3List.add("http:" + href);
    }
    return level3List;
}
@Test
public void getLevel3_test() throws IOException {
    // Crawl the category index and log how many valid third-level categories were found.
    List<String> level3Links = getLevel3("https://www.jd.com/allSort.aspx");
    logger.debug(level3Links.size());
}
表明jd共有1285个分类,我的程序抓到的符合我规定的三级分类有1183个(抓到了绝大部分)
(2)获取三级分类下的页数
/**
 * Reads the number of list pages of one third-level category.
 *
 * <p>This is a best-effort lookup: any failure (network error, missing page counter,
 * non-numeric counter text) is logged and reported as {@code 0} pages so that a single
 * bad category does not abort a long crawl.
 *
 * @param url link of a third-level category, e.g. https://list.jd.com/list.html?cat=9987,653,655
 * @return the page count shown in the page header, or {@code 0} if it could not be determined
 * @throws IOException declared for backward compatibility; failures are currently logged
 *         and mapped to {@code 0}
 */
public int getPage(String url) throws IOException {
    try {
        Document doc = Jsoup.connect(url).get();
        String text = doc.select("#J_topPage .fp-text i").text().trim();
        return Integer.parseInt(text);
    } catch (Exception e) {
        // Deliberately broad: keep the crawl going, but record which category was skipped and why.
        logger.warn("getPage failed, treating as 0 pages: " + url, e);
        return 0;
    }
}
@Test
public void getPage_test() throws IOException {
    // Log the page count of one sample third-level category.
    int pageCount = getPage("https://list.jd.com/list.html?cat=9987,653,655");
    logger.debug(pageCount);
}
表明该三级分类下有160页商品
(3)获取所有“商品列表页面”的链接
/**
 * Builds the links of every "product list page" for the given categories.
 *
 * @param level3List links of all valid third-level categories
 * @return one link per list page, formed as {@code <category link>&page=<n>} for
 *         n = 1..page count of that category
 * @throws IOException declared by {@link #getPage(String)}
 */
public List<String> getPageUrlList(List<String> level3List) throws IOException {
    List<String> pageUrlList = new ArrayList<String>();
    for (String categoryUrl : level3List) {
        // How many list pages this category has.
        int pageCount = getPage(categoryUrl);
        int page = 1;
        while (page <= pageCount) {
            // e.g. https://list.jd.com/list.html?cat=9987,653,655&page=2
            String listPageUrl = categoryUrl + "&page=" + page;
            logger.debug(listPageUrl);
            pageUrlList.add(listPageUrl);
            page++;
        }
    }
    return pageUrlList;
}
@Test
public void getPageUrlList_test() throws IOException {
    // Categories first, then every list page of every category.
    List<String> categoryLinks = getLevel3("https://www.jd.com/allSort.aspx");
    List<String> listPageLinks = getPageUrlList(categoryLinks);
    for (int i = 0; i < listPageLinks.size(); i++) {
        logger.debug(listPageLinks.get(i));
    }
    // Total number of list pages found.
    logger.debug(listPageLinks.size());
}
表明jd有19万3千2百多页的商品
(4)从某个“商品列表页面”中获得这一页所有商品的链接
/**
 * Extracts the product links found on one "product list page".
 *
 * <p>Best-effort: if the page cannot be fetched or parsed, the failure is logged together
 * with its cause and the links collected so far (possibly none) are returned.
 *
 * @param url link of a product list page
 * @return the product links on that page, each prefixed with {@code http:}; never {@code null}
 * @throws IOException declared for backward compatibility; failures are logged, not thrown
 */
public List<String> getItemURLListByPage(String url) throws IOException {
    List<String> itemUrlList = new ArrayList<String>();
    try {
        // Two chained selects: the first narrows to the product wrappers, the second picks
        // the image link inside each ".j-sku-item" entry.
        Elements eles = Jsoup.connect(url).get()
                .select(".gl-item .gl-i-wrap")
                .select(".j-sku-item div.p-img a");
        for (Element ele : eles) {
            String href = ele.attr("href");
            if (href.isEmpty()) {
                // An anchor without href would yield the useless link "http:"; skip it.
                continue;
            }
            itemUrlList.add("http:" + href);
        }
    } catch (Exception e) {
        // Use the class logger (as the rest of the crawler does) and keep the cause.
        logger.error("error:" + url, e);
    }
    return itemUrlList;
}
@Test
public void getItemURLListByPage_test() throws IOException {
    List<String> productLinks =
            getItemURLListByPage("https://list.jd.com/list.html?cat=9987,653,655&page=2");
    // Log every product link found on this list page.
    for (int i = 0; i < productLinks.size(); i++) {
        logger.debug(productLinks.get(i));
    }
    // Log how many products the page contains.
    logger.debug(productLinks.size());
}
表明该页有60个商品
(5)拿到jd所有商品的链接组成的集合
/**
 * Collects the links of all products reachable from the category index page.
 *
 * @param url link of the page listing all third-level categories: https://www.jd.com/allSort.aspx
 * @return the product links gathered from every list page of every category
 * @throws IOException if the category index cannot be fetched
 */
public List<String> getAllItemUrl(String url) throws IOException {
    // Category links -> list-page links for all categories.
    List<String> listPageUrls = getPageUrlList(getLevel3(url));
    // Accumulates the product links of every list page.
    List<String> allItemUrlList = new ArrayList<String>();
    for (String listPageUrl : listPageUrls) {
        allItemUrlList.addAll(getItemURLListByPage(listPageUrl));
    }
    return allItemUrlList;
}
@Test
public void getAllItemUrl_test() throws IOException {
    long startedAt = System.currentTimeMillis();
    List<String> productLinks = getAllItemUrl("https://www.jd.com/allSort.aspx");
    for (int i = 0; i < productLinks.size(); i++) {
        logger.info(productLinks.get(i));
    }
    // Total number of product links collected.
    logger.info(productLinks.size());
    long finishedAt = System.currentTimeMillis();
    // Milliseconds -> whole seconds -> whole minutes.
    long elapsedSeconds = (finishedAt - startedAt) / 1000;
    long time = elapsedSeconds / 60;
    logger.info("共用时" + time + "分钟");
}
表明jd商品总数达到1千1百5十5万2千6百8十6个,2017年时是800多万
共用时78162秒大约5个多小时
表明jd商品总数达到1千1百5十3万7千7百8十2个,2017年时是800多万
共用时78162秒大约5个多小时
(6)获取商品id
/**
 * Extracts the product id from a product page link.
 *
 * <p>The id is the last path segment without its {@code .html} suffix, so the scheme
 * ({@code http}, {@code https} or protocol-relative {@code //}) and any query string or
 * fragment do not matter. A string that is already a bare id is returned unchanged.
 *
 * @param url link of a product page, e.g. http://item.jd.com/6055054.html
 * @return the product id taken from the link, e.g. {@code 6055054}
 */
public String getItemId(String url) {
    String id = url.trim();
    // Cut off "?query" and "#fragment" so they cannot end up inside the id.
    int queryStart = id.indexOf('?');
    if (queryStart >= 0) {
        id = id.substring(0, queryStart);
    }
    int fragmentStart = id.indexOf('#');
    if (fragmentStart >= 0) {
        id = id.substring(0, fragmentStart);
    }
    // Keep only the last path segment ("6055054.html").
    id = id.substring(id.lastIndexOf('/') + 1);
    String suffix = ".html";
    if (id.endsWith(suffix)) {
        id = id.substring(0, id.length() - suffix.length());
    }
    return id;
}
@Test
public void getItemId_test() throws IOException {
    // Log the id extracted from a sample product link.
    String extractedId = getItemId("http://item.jd.com/6055054.html");
    logger.debug(extractedId);
}
(7)获取title
/**
 * Reads the product title from a parsed product page.
 *
 * @param doc parsed document of a product page
 * @return the combined text of the elements matching {@code .sku-name}
 */
public String getTitle(Document doc) {
    // Class selector for the title container; text() yields its plain text.
    Elements titleNodes = doc.select(".sku-name");
    return titleNodes.text();
}
@Test
public void getTitle_test() throws IOException {
    // Fetch a sample product page and log its title.
    Document productPage = Jsoup.connect("https://item.jd.com/6055054.html").get();
    String title = getTitle(productPage);
    logger.debug(title);
}
(8)获取卖点(返回json数据)
jd卖点要单独获取
/**
 * Fetches the selling point (advertising slogan) of a product from the separate ad endpoint.
 *
 * <p>Best-effort: if the request fails, the response is not valid JSON, or the expected
 * {@code [0].ad} field is absent, the problem is logged and {@code null} is returned.
 *
 * @param id product id
 * @return the selling point text, or {@code null} if it could not be obtained
 * @throws IOException declared for backward compatibility; failures are logged, not thrown
 */
public String getSellPoint(String id) throws IOException {
    String sellPointUrl = "http://ad.3.cn/ads/mgets?skuids=AD_" + id;
    try {
        // The endpoint does not serve HTML, so jsoup must be told to accept the content type.
        String sellPointJson = Jsoup.connect(sellPointUrl).ignoreContentType(true).get().text();
        JsonNode root = Mapper.readTree(sellPointJson);
        // Explicit null checks instead of relying on a caught NullPointerException.
        JsonNode first = (root == null) ? null : root.get(0);
        JsonNode ad = (first == null) ? null : first.get("ad");
        if (ad == null) {
            logger.warn("no selling point in response for item " + id);
            return null;
        }
        return ad.asText();
    } catch (Exception e) {
        logger.warn("failed to fetch selling point for item " + id, e);
        return null;
    }
}
@Test
public void getSellPoint_test() throws IOException {
    // Log the selling point of a sample product id.
    String sellPoint = getSellPoint("6055054");
    logger.debug(sellPoint);
}
(9)获取商品价格(返回json)
京东的价格是通过单独的请求获取的,这个链接如何获得可以自己深入研究:http://p.3.cn/prices/mgets?skuIds=J_6055054
/**
 * Fetches the price of a product from the separate price endpoint.
 *
 * <p>The endpoint answers with JSON such as
 * {@code [{"op":"1299.00","m":"99999.00","id":"J_5663902","p":"1299.00"}]}; the {@code p}
 * field is parsed as an exact decimal and converted to hundredths (cents), so the
 * fractional part is preserved (e.g. {@code "12.90"} becomes {@code 1290}).
 *
 * @param id product id
 * @return the price multiplied by 100, as a whole number
 * @throws IOException if the request fails, or the response has no parsable {@code p} field
 */
public long getPrice(String id) throws IOException {
    String url = "http://p.3.cn/prices/mgets?skuIds=J_" + id;
    String priceJson = Jsoup.connect(url).ignoreContentType(true).get().body().text();
    JsonNode root = Mapper.readTree(priceJson);
    JsonNode first = (root == null) ? null : root.get(0);
    JsonNode priceNode = (first == null) ? null : first.get("p");
    if (priceNode == null) {
        throw new IOException("no price field \"p\" in response for item " + id + ": " + priceJson);
    }
    String priceText = priceNode.asText().trim();
    try {
        // Exact decimal arithmetic: asLong() on the textual value would drop the cents.
        return new java.math.BigDecimal(priceText)
                .movePointRight(2)
                .setScale(0, java.math.RoundingMode.HALF_UP)
                .longValue();
    } catch (NumberFormatException e) {
        throw new IOException("unparsable price \"" + priceText + "\" for item " + id, e);
    }
}
@Test
public void getPrice_test() throws IOException {
    // Log the price returned for a sample product id.
    long price = getPrice("6055054");
    logger.debug(price);
}
(10)获取图片
/**
 * Collects the image links of a product from its parsed page.
 *
 * @param doc parsed document of a product page
 * @return the {@code src} of every image matching {@code ul.lh li img}, each prefixed with
 *         {@code http:} and joined with commas; an empty string if there are none
 */
public String getImage(Document doc) {
    StringBuilder joined = new StringBuilder();
    for (Element thumbnail : doc.select("ul.lh li img")) {
        // Separator goes between entries, so no trailing comma needs trimming afterwards.
        if (joined.length() > 0) {
            joined.append(",");
        }
        joined.append("http:").append(thumbnail.attr("src"));
    }
    return joined.toString();
}
@Test
public void getImage_test() throws IOException {
    // Fetch a sample product page and log its comma-separated image links.
    Document productPage = Jsoup.connect("https://item.jd.com/6055054.html").get();
    String imageLinks = getImage(productPage);
    logger.debug(imageLinks);
}
2018-06-18 19:24:11,788 DEBUG [com.jt.jsoup4JD.jsoup4JD] - http://img14.360buyimg.com/n5/s54x54_jfs/t15094/122/1086149603/353795/da2168a0/5a4341c4N1c27b681.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t15349/187/807188784/167239/6f0444f5/5a3b77caNd7bbb2f2.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t13954/61/2247307424/200508/263aac74/5a3b77ccN56764546.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t13930/361/2297070615/86453/e952a663/5a3b77cbN6a69711e.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t15067/293/863509976/112178/db7554f6/5a3b77cbN5a2cb830.jpg
(11)获取商品详情(返回jsonp数据)
京东的商品详情是通过单独的请求获取的
/**
 * Fetches the product description from the separate description endpoint.
 *
 * <p>The endpoint answers with JSONP, e.g. {@code showdesc({...})}. The wrapper is removed
 * by taking everything between the first {@code (} and the last {@code )}, so a trailing
 * semicolon, trailing whitespace or a different callback name does not break parsing.
 * Best-effort: any failure is logged and reported as {@code null}.
 *
 * @param id product id
 * @return the value of the {@code content} field, or {@code null} if it could not be obtained
 */
public String getItemDes(String id) {
    String itemDesUrl = "http://d.3.cn/desc/" + id;
    try {
        // JSONP is not HTML: read the raw response body via execute().body().
        String itemDesJsonp = Jsoup.connect(itemDesUrl).ignoreContentType(true).execute().body();
        String itemDesJson = itemDesJsonp.trim();
        // Only unwrap when the payload is not already plain JSON; the description itself
        // may legitimately contain parentheses.
        if (!itemDesJson.startsWith("{") && !itemDesJson.startsWith("[")) {
            int open = itemDesJson.indexOf('(');
            int close = itemDesJson.lastIndexOf(')');
            if (open >= 0 && close > open) {
                itemDesJson = itemDesJson.substring(open + 1, close);
            }
        }
        JsonNode root = Mapper.readTree(itemDesJson);
        JsonNode content = (root == null) ? null : root.get("content");
        if (content == null) {
            logger.warn("no \"content\" field in description response for item " + id);
            return null;
        }
        return content.asText();
    } catch (Exception e) {
        logger.warn("failed to fetch description for item " + id, e);
        return null;
    }
}
@Test
public void getItemDes_test() throws IOException {
    // Log the description returned for a sample product id.
    String description = getItemDes("6055054");
    logger.debug(description);
}
![](https://i-blog.csdnimg.cn/blog_migrate/c3cbc605ce8c10db5c9763d9d3369d6f.png)
(12)从商品页面拿到商品信息,落地:存入数据库
package com.jt.jsoup4JD.pojo;
import java.util.Arrays;
/**
 * Plain data holder for one crawled product.
 *
 * <p>Image links are stored as a single comma-separated string in {@code image};
 * {@link #getImages()} derives an array view from it.
 */
public class Item {
    private long id;
    private String title;
    private String sellPoint;
    private long price;
    private Integer num;
    private String barcode;
    /** Comma-separated image links. */
    private String image;
    /** Image links set explicitly through {@link #setImages(String[])}. */
    private String[] images;
    private long cid;
    private Integer status;
    private String itemDesc;

    public String getItemDesc() {
        return itemDesc;
    }

    public void setItemDesc(String itemDesc) {
        this.itemDesc = itemDesc;
    }

    /**
     * Returns the image links as an array.
     *
     * <p>When {@code image} is set, it is split on commas (unchanged behavior). Otherwise the
     * array given to {@link #setImages(String[])} is returned, or an empty array if neither
     * was set, instead of failing with a {@link NullPointerException}.
     *
     * @return the image links; never {@code null}
     */
    public String[] getImages() {
        if (image != null) {
            return image.split(",");
        }
        return (images != null) ? images : new String[0];
    }

    public void setImages(String[] images) {
        this.images = images;
    }

    public long getId() {
        return id;
    }

    public void setId(long id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSellPoint() {
        return sellPoint;
    }

    public void setSellPoint(String sellPoint) {
        this.sellPoint = sellPoint;
    }

    public long getPrice() {
        return price;
    }

    public void setPrice(long price) {
        this.price = price;
    }

    public Integer getNum() {
        return num;
    }

    public void setNum(Integer num) {
        this.num = num;
    }

    public String getBarcode() {
        return barcode;
    }

    public void setBarcode(String barcode) {
        this.barcode = barcode;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public long getCid() {
        return cid;
    }

    public void setCid(long cid) {
        this.cid = cid;
    }

    public Integer getStatus() {
        return status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }

    @Override
    public String toString() {
        return "Item [id=" + id + ", title=" + title + ", sellPoint=" + sellPoint + ", price=" + price + ", num=" + num
                + ", barcode=" + barcode + ", image=" + image + ", images=" + Arrays.toString(images) + ", cid=" + cid
                + ", status=" + status + ", itemDesc=" + itemDesc + "]";
    }
}
/**
 * Builds an {@code Item} from a product page, ready to be persisted.
 *
 * @param url link of a product page
 * @return an item populated with id, title, selling point, price, images and description
 * @throws IOException if the product page or the price cannot be fetched
 */
public Item getItem(String url) throws IOException {
    Document productPage = Jsoup.connect(url).get();
    String id = getItemId(url);

    Item item = new Item();
    // Id comes from the link itself.
    item.setId(Long.parseLong(id));
    // Title is read from the product page.
    item.setTitle(getTitle(productPage));
    // Selling point is served by a separate endpoint.
    item.setSellPoint(getSellPoint(id));
    // Price is served by a separate endpoint.
    item.setPrice(getPrice(id));
    // Image links are read from the product page.
    item.setImage(getImage(productPage));
    // Description is served by a separate endpoint.
    item.setItemDesc(getItemDes(id));
    return item;
}
@Test
// Builds an Item from a sample product page and prints it.
public void getItem_test() throws IOException {
    Item crawled = getItem("http://item.jd.com/6055054.html");
    String rendered = crawled.toString();
    System.out.println(rendered);
}