jingdong

前言

京东是电商平台中最容易获取数据的平台之一,基本没有什么反爬措施。

本文章只是实现了 京东 关键词和单个商品信息获取的功能。

如果要稳定的话,还需修改代码。

本文只做学习用,请知悉。如有侵权,请联系作者。

import ast
import json
import re
from urllib import parse

import requests
from lxml import etree

proxies = ''

class JingDong(object):
    """Minimal jd.com scraper.

    ``input_type == 1``: ``input_str`` is a product page URL; fetch that product.
    ``input_type == 2``: ``input_str`` is a search keyword; crawl search pages.
    Any other ``input_type`` is silently ignored.

    NOTE(review): all work happens as a side effect of ``__init__`` (results are
    printed, not returned) — kept as-is for backward compatibility.
    """

    def __init__(self, input_str, input_type):
        if input_type == 1:
            print('单个商品链接爬取')
            self.get_goods(input_str)
        elif input_type == 2:
            print('关键词爬取,目前限制最大1000')
            self.get_keywords(input_str)
        else:
            pass

    def get_keywords(self, keywords):
        """Crawl up to 30 search-result pages for ``keywords``.

        Prints the product links found on each page and, at the end, the total
        and deduplicated link counts. Network failures on one page are logged
        and skipped so the crawl continues (best-effort, as in the original).
        """
        words = parse.quote(keywords)
        page = 1
        s = 0          # JD search offset parameter; advances 30 per page
        total = 0
        all_links = []
        while page <= 30:
            url = 'https://search.jd.com/s_new.php?keyword={}&psort=3&suggest=4.def.0.V04--38s0&wq={}&psort=3&page={}&s={}&click=0'.format(words, words, str(page), str(s))
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Host': 'search.jd.com'
            }
            try:
                key_resp = requests.get(url, headers=headers, proxies=proxies, verify=False)
                resp_html = etree.HTML(key_resp.text)
                goods_list = resp_html.xpath('*//div[@id="J_goodsList"]//li')
                print(len(goods_list))
                total += len(goods_list)
                page_links = []
                for item in goods_list:
                    href = item.xpath('.//a/@href')[0]
                    page_links.append(href)
                    all_links.append(href)
                print(page_links)
            except Exception as exc:
                # Best-effort: a failed page must not abort the whole crawl,
                # but surface the reason instead of swallowing it silently.
                print('page {} failed: {}'.format(page, exc))
            page += 1
            s += 30
        print('total goods num....', total)
        unique_links = set(all_links)
        print('去重后goods num.....', len(unique_links))
        # for j in unique_links:
        #     j = 'https:' + j
        #     self.get_goods(j)

    def get_goods(self, url):
        """Fetch one product page and print a dict with name, shop, spec text,
        detail/carousel images, per-SKU properties + prices, and comments.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        resp = requests.get(url, headers=headers, proxies=proxies, verify=False)
        selector = etree.HTML(resp.text)
        # Canonical product link and the numeric product id embedded in it.
        commodity_url = selector.xpath('//link[@rel="canonical"]/@href')[0]
        goods_id = re.findall(r"(\d+)", commodity_url)[0]
        goods_name = selector.xpath('*//div[@class="item ellipsis"]/@title')[0]
        shop_name = selector.xpath('*//div[@class="name"]/a/text()')[0]
        shop_link = selector.xpath('*//div[@class="name"]/a/@href')[0]
        # 1. Textual spec list, each entry whitespace-stripped, comma-joined.
        detail_list = selector.xpath('//ul[@class="parameter2 p-parameter-list"]/li/text()')
        detail_content = "".join("".join(item.split()) + "," for item in detail_list)

        # 2. Detail images live behind a separate description endpoint whose
        #    protocol-relative URL is embedded in an inline JS blob.
        detail_link = re.findall(r"desc: '(.*?)',", resp.text)[0]
        detail_img_list = self.get_detail_link(detail_link, commodity_url)
        # Carousel thumbnails: swap /n5/ (thumb) for /n0/ (full-size).
        picture_list = ['https:' + pic.replace('/n5/', '/n0/')
                        for pic in selector.xpath('//ul[@class="lh"]/li/img/@src')]

        # Per-SKU variants (colour/size). The page embeds them as a JS array.
        properties = []
        color_size = re.findall(r"colorSize: \[(.*?)\]", resp.text)
        if color_size:
            # literal_eval (not eval): scraped page content must never be able
            # to execute code; it can only be parsed as a Python literal.
            sku_variants = ast.literal_eval("[" + color_size[0] + "]")
            # Batch price lookup for every SKU in one request.
            price_url = 'https://p.3.cn/prices/mgets?skuIds='
            for variant in sku_variants:
                price_url += str(variant['skuId']) + ','
            resp_price = requests.get(price_url, headers=headers, verify=False)
            json_price = json.loads(resp_price.text)
            # Attach each returned price to its matching variant.
            for price_item in json_price:
                p_skuid = re.findall(r'\d+', price_item['id'])[0]
                for variant in sku_variants:
                    if str(variant['skuId']) == p_skuid:
                        variant['price'] = price_item['p']
            for variant in sku_variants:
                pro_dict = {
                    'skuId': str(variant['skuId']),
                    'pro': variant,
                    'price': variant.get('price', ''),
                }
                properties.append(pro_dict)
            # Per-SKU swatch images from the attribute chooser.
            attr_divs = selector.xpath('//div[@id="choose-attr-1"]/div[@class="dd"]/div')
            for div in attr_divs:
                sku_id = div.xpath('./@data-sku')[0]
                sku_img = div.xpath('./a/img/@src')[0]
                for prop in properties:
                    if sku_id == prop['skuId']:
                        prop['image'] = sku_img
        else:
            # Single-SKU product: only one price to look up.
            price_url = 'https://p.3.cn/prices/mgets?skuIds=' + goods_id
            resp_price = requests.get(price_url, headers=headers, verify=False)
            json_price = json.loads(resp_price.text)
            price = json_price[0]['p']
            properties.append({
                'skuId': goods_id,
                'price': price,
                'image': '',
                'pro': [],
            })

        comments_list = self.get_comments(goods_id, commodity_url)
        goods = {
            'goods_name': goods_name,
            'shop_name': shop_name,
            'shop_link': shop_link,
            'detail_content': detail_content,
            'detail_img_list': detail_img_list,
            'round_pictures': picture_list,
            'property': properties,
            'comments': comments_list,
        }
        print(goods)

    def get_detail_link(self, url, referer_url):
        """Fetch the description endpoint (protocol-relative ``url``) and
        return the lazy-loaded detail image URLs embedded in its HTML payload.
        """
        # e.g. https://cd.jd.com/description/channel?skuId=...&callback=showdesc
        real_url = "https:" + url
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'Referer': 'https:' + referer_url,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'cd.jd.com',
        }
        resp = requests.get(real_url, headers=headers, proxies=proxies, verify=False)
        json_data = json.loads(resp.text)
        content = json_data['content']
        return re.findall(r'data-lazyload="(.*?)"', content)

    def get_comments(self, id, referer_url):
        """Fetch the first page (10 entries) of comments for product ``id``
        and return them as a list of ``{id, content, images}`` dicts.
        """
        page = 0
        comment_url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={}&score=3&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(id, str(page))
        com_headers = {
            'Referer': 'https:' + referer_url,
            'Host': 'club.jd.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        com_resp = requests.get(comment_url, headers=com_headers, proxies=proxies, verify=False)
        # The endpoint returns JSONP: fetchJSON_comment98({...}); strip the wrapper.
        comment_str = re.findall(r'fetchJSON_comment98\((.*)\);', com_resp.text)[0]
        comment_json = json.loads(comment_str)
        print(len(comment_json['comments']))
        comment_list = []
        for entry in comment_json['comments']:
            comment_list.append({
                'id': entry['id'],
                'content': entry['content'],
                'images': [img['imgUrl'] for img in entry['images']],
            })
        return comment_list


# Guard the demo call so importing this module does not trigger network I/O.
if __name__ == "__main__":
    JingDong('https://item.jd.com/69422164809.html', 1)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值