jsoup

使用jsoup抓取jd的数据

一、逐步分析

(1)获取所有三级分类


    
        /**
	 * 获取jd所有的三级分类
	 * @param url jd有全部分类的链接:https://www.jd.com/allSort.aspx
	 * @return 返回所有有效的三级分类链接
	 * @throws IOException
	 */
	public List<String> getLevel3(String url) throws IOException{
		List<String> level3List = new ArrayList<String>();
		Document doc = Jsoup.connect(url).get();
		Elements eles = doc.select("div dl dd a");
		//直接使用html标签,多个用空格隔开,返回值eles是所有符合的a标签:
		//<a href="//e.jd.com/ebook.html" target="_blank">电子书</a> .....
		@SuppressWarnings("unused")
		int i=0;
		for (Element ele : eles) {
			String catUrl = ele.attr("href");
			logger.debug(i+++ele.text()+"=="+catUrl);
			//处理有效连接https://list.jd.com/list.html?cat=9987,653,655
			if(catUrl.startsWith("//list.jd.com/list.html?cat")){
				level3List.add("http:"+catUrl);
			}
		}
		return level3List;
	}
	@Test
	public void getLevel3_test() throws IOException{
		// Logs how many third-level category links the all-categories page yields.
		int categoryCount = getLevel3("https://www.jd.com/allSort.aspx").size();
		logger.debug(categoryCount);
	}


表明jd共有1285个分类,我的程序抓到的符合我规定的三级分类有1183个(抓到了绝大部分)

(2)获取三级分类下的页数


/**
	 * Reads how many result pages a third-level category has.
	 * <p>
	 * This is a best-effort lookup: any failure (fetching the page, a missing
	 * element, text that is not an integer) is reported on standard output and
	 * results in 0, so the caller simply generates no pages for that category.
	 * The declared {@code IOException} is therefore never actually thrown.
	 *
	 * @param url link of one third-level category,
	 *            e.g. https://list.jd.com/list.html?cat=9987,653,655
	 * @return the page count, or 0 when it could not be determined
	 * @throws IOException kept in the signature for existing callers
	 */
	public int getPage(String url) throws IOException{
		try {
			Document doc = Jsoup.connect(url).get();
			String text = doc.select("#J_topPage .fp-text i").text();
			return Integer.parseInt(text);
		} catch (Exception e) {
			// Previously swallowed silently, which made dropped categories invisible.
			// Report in the same style getItemURLListByPage uses for its failures.
			System.out.println("error:"+url+" ("+e+")");
			return 0;
		}
	}
	@Test
	public void getPage_test() throws IOException{
		// Logs the page count of one sample third-level category.
		int pageCount = getPage("https://list.jd.com/list.html?cat=9987,653,655");
		logger.debug(pageCount);
	}


表明该三级分类下有160页商品

(3)获取所有“商品列表页面”的链接


/**
	 * Builds the links of every product-list page.
	 *
	 * @param level3List links of the valid third-level categories
	 * @return for each category, one link per page number from 1 up to the value
	 *         returned by getPage, formed as category link + "&page=" + number
	 * @throws IOException declared by getPage
	 */
	public List<String> getPageUrlList(List<String> level3List) throws IOException{
		List<String> listPageLinks = new ArrayList<String>();
		for (String categoryLink : level3List) {
			// How many pages this category has.
			int lastPage = getPage(categoryLink);
			int current = 1;
			while (current <= lastPage) {
				// e.g. https://list.jd.com/list.html?cat=9987,653,655&page=2
				String link = categoryLink + "&page=" + current;
				logger.debug(link);
				listPageLinks.add(link);
				current++;
			}
		}
		return listPageLinks;
	}
	@Test
	public void getPageUrlList_test() throws IOException{
		// Logs every product-list page link, then the total number of links.
		List<String> categories = getLevel3("https://www.jd.com/allSort.aspx");
		List<String> listPages = getPageUrlList(categories);
		for (int i = 0; i < listPages.size(); i++) {
			logger.debug(listPages.get(i));
		}
		logger.debug(listPages.size());
	}


表明jd有19万3千2百多页的商品

(4)从某个“商品列表页面”中获得这一页所有商品的链接


/**
	 * Extracts the product links found on one product-list page.
	 * <p>
	 * Any failure is reported on standard output and the links gathered so far
	 * (possibly none) are returned.
	 *
	 * @param url link of one product-list page
	 * @return the href of every matched anchor, each prefixed with "http:"
	 * @throws IOException kept in the signature; failures are caught inside
	 */
	public List<String> getItemURLListByPage(String url) throws IOException{
		List<String> links = new ArrayList<String>();
		try {
			// Original author's note: when a class attribute lists several
			// space-separated classes, they are matched through two chained
			// select calls, as done here.
			Elements wrappers = Jsoup.connect(url).get().select(".gl-item .gl-i-wrap");
			Elements anchors = wrappers.select(".j-sku-item div.p-img a");
			for (Element anchor : anchors) {
				links.add("http:" + anchor.attr("href"));
			}
		} catch (Exception e) {
			System.out.println("error:"+url);
		}
		return links;
	}
	@Test
	public void getItemURLListByPage_test() throws IOException{
		List<String> links = getItemURLListByPage("https://list.jd.com/list.html?cat=9987,653,655&page=2");
		// Log each product link found on the page ...
		for (int i = 0; i < links.size(); i++) {
			logger.debug(links.get(i));
		}
		// ... then how many products the page holds.
		logger.debug(links.size());
	}


表明该页有60个商品

(5)拿到jd所有商品的链接组成的集合

/**
	 * Gathers the links of all products reachable from the category page.
	 *
	 * @param url page listing all third-level categories: https://www.jd.com/allSort.aspx
	 * @return the product links of every product-list page, concatenated in page order
	 * @throws IOException declared by the helper methods it calls
	 */
	public List<String> getAllItemUrl(String url) throws IOException{
		// Category links -> product-list page links -> product links.
		List<String> categories = getLevel3(url);
		List<String> listPages = getPageUrlList(categories);
		List<String> everyItem = new ArrayList<String>();
		for (String listPage : listPages) {
			// Merge the links of this page into the overall collection.
			everyItem.addAll(getItemURLListByPage(listPage));
		}
		return everyItem;
	}
	@Test
	public void getAllItemUrl_test() throws IOException{
		long startedAt = System.currentTimeMillis();
		List<String> items = getAllItemUrl("https://www.jd.com/allSort.aspx");
		for (int i = 0; i < items.size(); i++) {
			logger.info(items.get(i));
		}
		// Total number of product links collected.
		logger.info(items.size());
		// Elapsed wall-clock time, truncated to whole seconds and then whole minutes.
		long elapsedMillis = System.currentTimeMillis() - startedAt;
		long minutes = (elapsedMillis / 1000) / 60;
		logger.info("共用时"+minutes+"分钟");
	}

表明jd商品总数达到1千1百5十5万2千6百8十6个(11,552,686),2017年时是800多万

共用时78162秒大约5个多小时


表明jd商品总数达到1千1百5十3万7千7百8十2个(11,537,782),2017年时是800多万

共用时78162秒大约5个多小时

(6)获取商品id

	/**
	 * Extracts the product id from a product-page link.
	 * <p>
	 * The id is the last path segment without its ".html" suffix, so the result
	 * does not depend on the scheme ("http:", "https:" or protocol-relative "//"),
	 * on the host, or on a trailing query string or fragment. For example
	 * "http://item.jd.com/6055054.html" and "https://item.jd.com/6055054.html?x=1"
	 * both give "6055054".
	 *
	 * @param url link of a product page
	 * @return the product id taken from the link
	 */
	public String getItemId(String url){
		String id = url;
		// Cut off a query string and/or fragment, if present.
		int query = id.indexOf('?');
		if (query >= 0) {
			id = id.substring(0, query);
		}
		int fragment = id.indexOf('#');
		if (fragment >= 0) {
			id = id.substring(0, fragment);
		}
		// Keep only what follows the last '/' (the whole string when there is none).
		id = id.substring(id.lastIndexOf('/') + 1);
		if (id.endsWith(".html")) {
			id = id.substring(0, id.length() - ".html".length());
		}
		return id;
	}
	@Test
	public void getItemId_test() throws IOException{
		// Logs the id parsed from a sample product link.
		String parsedId = getItemId("http://item.jd.com/6055054.html");
		logger.debug(parsedId);
	}


(7)获取title


/**
	 * Reads the product title from a parsed product page.
	 *
	 * @param doc parsed document of one product page
	 * @return the text of the elements matching the ".sku-name" selector
	 */
	public String getTitle(Document doc){
		// Select by class name, then take the text content.
		String title = doc.select(".sku-name").text();
		return title;
	}
	@Test
	public void getTitle_test() throws IOException{
		// Fetches a sample product page and logs the title extracted from it.
		Document productPage = Jsoup.connect("https://item.jd.com/6055054.html").get();
		String title = getTitle(productPage);
		logger.debug(title);
	}

(8)获取卖点(返回json数据)


jd卖点要单独获取


/**
	 * Fetches the selling point of a product from a separate JSON endpoint.
	 *
	 * @param id product id
	 * @return the "ad" value of the first element of the JSON reply,
	 *         or null when anything goes wrong
	 * @throws IOException kept in the signature; failures are caught inside
	 */
	public String getSellPoint(String id) throws IOException{
		try {
			// The reply is not HTML, so content-type checking is turned off.
			String json = Jsoup.connect("http://ad.3.cn/ads/mgets?skuids=AD_"+id)
					.ignoreContentType(true)
					.get()
					.text();
			JsonNode firstEntry = Mapper.readTree(json).get(0);
			return firstEntry.get("ad").asText();
		} catch (Exception e) {
			return null;
		}
	}
	@Test
	public void getSellPoint_test() throws IOException{
		// Logs the selling point returned for a sample product id.
		String sellPoint = getSellPoint("6055054");
		logger.debug(sellPoint);
	}


(9)获取商品价格(返回json)


京东的价格是通过单独的请求获取的,这个链接是如何找到的可以自行深入研究:http://p.3.cn/prices/mgets?skuIds=J_6055054


/**
	 * Fetches the price of a product and returns it multiplied by 100
	 * (i.e. in hundredths of the unit used by the endpoint).
	 * <p>
	 * The price arrives as text such as "1299.00". It is converted through
	 * {@code BigDecimal} so that the fractional part survives: the previous
	 * {@code asLong()*100} dropped the decimals before scaling, turning
	 * "1299.50" into 129900 instead of 129950 (assuming {@code Mapper} is a
	 * Jackson object mapper, whose {@code asLong()} coerces text to a whole number).
	 *
	 * @param id product id
	 * @return the "p" value of the first JSON element times 100, rounded half-up;
	 *         0 when that value is not a number
	 * @throws IOException propagated from fetching or parsing the reply
	 */
	public long getPrice(String id) throws IOException{
		String url = "http://p.3.cn/prices/mgets?skuIds=J_"+id;
		// Sample reply: [{"op":"1299.00","m":"99999.00","id":"J_5663902","p":"1299.00"}]
		String priceJson = Jsoup.connect(url).ignoreContentType(true).get().body().text();
		JsonNode jsonNode = Mapper.readTree(priceJson);
		String priceText = jsonNode.get(0).get("p").asText();
		try {
			return new java.math.BigDecimal(priceText.trim())
					.movePointRight(2)
					.setScale(0, java.math.RoundingMode.HALF_UP)
					.longValue();
		} catch (NumberFormatException e) {
			// asLong() yields 0 for text it cannot convert; keep that result for callers.
			return 0L;
		}
	}
	@Test
	public void getPrice_test() throws IOException{
		// Logs the price returned for a sample product id.
		long price = getPrice("6055054");
		logger.debug(price);
	}

(10)获取图片

/**
	 * Collects the image links of a product page.
	 *
	 * @param doc parsed document of one product page
	 * @return the src of every element matching "ul.lh li img", each prefixed with
	 *         "http:" and joined with commas; an empty string when nothing matches
	 */
	public String getImage(Document doc){
		StringBuilder joined = new StringBuilder();
		for (Element thumbnail : doc.select("ul.lh li img")) {
			// Separator goes before every entry except the first.
			if (joined.length() > 0) {
				joined.append(',');
			}
			joined.append("http:").append(thumbnail.attr("src"));
		}
		return joined.toString();
	}
	@Test
	public void getImage_test() throws IOException{
		// Fetches a sample product page and logs its comma-joined image links.
		Document productPage = Jsoup.connect("https://item.jd.com/6055054.html").get();
		String imageLinks = getImage(productPage);
		logger.debug(imageLinks);
	}
2018-06-18 19:24:11,788 DEBUG [com.jt.jsoup4JD.jsoup4JD] - http://img14.360buyimg.com/n5/s54x54_jfs/t15094/122/1086149603/353795/da2168a0/5a4341c4N1c27b681.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t15349/187/807188784/167239/6f0444f5/5a3b77caNd7bbb2f2.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t13954/61/2247307424/200508/263aac74/5a3b77ccN56764546.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t13930/361/2297070615/86453/e952a663/5a3b77cbN6a69711e.jpg,http://img14.360buyimg.com/n5/s54x54_jfs/t15067/293/863509976/112178/db7554f6/5a3b77cbN5a2cb830.jpg

(11)获取商品详情(返回jsonp数据)


京东的商品详情是通过单独的请求获取的


/**
	 * Fetches the product description from a separate JSONP endpoint.
	 *
	 * @param id product id
	 * @return the "content" value of the unwrapped JSON, or null when anything goes wrong
	 */
	public String getItemDes(String id){
		try {
			// A JSONP reply is read as the raw response body via execute().body().
			String jsonp = Jsoup.connect("http://d.3.cn/desc/"+id).ignoreContentType(true).execute().body();
			// Remove the "showdesc(" wrapper, then drop the final character
			// (presumably the closing parenthesis of the callback).
			String unwrapped = jsonp.replace("showdesc(", "");
			String json = unwrapped.substring(0, unwrapped.length()-1);
			return Mapper.readTree(json).get("content").asText();
		} catch (Exception e) {
			return null;
		}
	}
	@Test
	public void getItemDes_test() throws IOException{
		// Logs the description returned for a sample product id.
		String description = getItemDes("6055054");
		logger.debug(description);
	}


(12)从商品页面拿到商品信息,落地:存入数据库

package com.jt.jsoup4JD.pojo;

import java.util.Arrays;

/**
 * Plain data holder for one scraped product: simple fields with getters and setters.
 */
public class Item{
	private long id;
	private String title;
	private String sellPoint;
	private long price;
	private Integer num;
	private String barcode;
	/** Image links as one comma-separated string; {@link #getImages()} splits it. */
	private String image;
	/** Value stored by {@link #setImages(String[])}; printed by {@link #toString()}. */
	private String[] images;
	private long cid;
	private Integer status;
	private String itemDesc;

	public String getItemDesc() {
		return itemDesc;
	}
	public void setItemDesc(String itemDesc) {
		this.itemDesc = itemDesc;
	}

	/**
	 * Returns the image links obtained by splitting {@code image} on commas.
	 * <p>
	 * When {@code image} is null or empty an empty array is returned; previously
	 * a null {@code image} caused a NullPointerException and an empty one produced
	 * an array holding a single empty string.
	 * <p>
	 * NOTE(review): the array passed to {@link #setImages(String[])} is not what
	 * this getter returns; it always derives its result from {@code image}.
	 *
	 * @return the individual image links, never null
	 */
	public String[] getImages() {
		if (image == null || image.isEmpty()) {
			return new String[0];
		}
		return image.split(",");
	}
	public void setImages(String[] images) {
		this.images = images;
	}
	public long getId() {
		return id;
	}
	public void setId(long id) {
		this.id = id;
	}
	public String getTitle() {
		return title;
	}
	public void setTitle(String title) {
		this.title = title;
	}
	public String getSellPoint() {
		return sellPoint;
	}
	public void setSellPoint(String sellPoint) {
		this.sellPoint = sellPoint;
	}
	public long getPrice() {
		return price;
	}
	public void setPrice(long price) {
		this.price = price;
	}
	public Integer getNum() {
		return num;
	}
	public void setNum(Integer num) {
		this.num = num;
	}
	public String getBarcode() {
		return barcode;
	}
	public void setBarcode(String barcode) {
		this.barcode = barcode;
	}
	public String getImage() {
		return image;
	}
	public void setImage(String image) {
		this.image = image;
	}
	public long getCid() {
		return cid;
	}
	public void setCid(long cid) {
		this.cid = cid;
	}
	public Integer getStatus() {
		return status;
	}
	public void setStatus(Integer status) {
		this.status = status;
	}

	/** Lists every field; {@code images} shows the stored array, not {@link #getImages()}. */
	@Override
	public String toString() {
		return "Item [id=" + id + ", title=" + title + ", sellPoint=" + sellPoint + ", price=" + price + ", num=" + num
				+ ", barcode=" + barcode + ", image=" + image + ", images=" + Arrays.toString(images) + ", cid=" + cid
				+ ", status=" + status + ", itemDesc=" + itemDesc + "]";
	}
}

/**
	 * Scrapes one product page into an {@code Item}.
	 * <p>
	 * Only the id, title, selling point, price, image links and description are
	 * filled in. Nothing is persisted by this method; it just returns the object.
	 *
	 * @param url link of one product page
	 * @return the populated item
	 * @throws IOException propagated from fetching the page or from the helpers
	 */
	public Item getItem(String url) throws IOException{
		Document doc = Jsoup.connect(url).get();
		Item item = new Item();
		// The id comes from the link itself and drives the separate lookups below.
		String id = getItemId(url);
		item.setId(Long.parseLong(id));
		item.setTitle(getTitle(doc));
		item.setSellPoint(getSellPoint(id));
		item.setPrice(getPrice(id));
		item.setImage(getImage(doc));
		item.setItemDesc(getItemDes(id));
		return item;
	}
	@Test // Scrapes a sample product page and prints the resulting item.
	public void getItem_test() throws IOException{
		Item scraped = getItem("http://item.jd.com/6055054.html");
		String printed = scraped.toString();
		System.out.println(printed);
	}


  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值