# Scrape per-SKU product information from the Midea store on JD.com (by comment count).

import csv
import json
import random
import re
import time

import requests
from lxml import etree


def parse_detail(goods_id):
    """Fetch one JD item page and append its data as a row to the result CSV.

    The row is ``[title, goods_id, price, price_all, comment_num]``. When the
    page embeds a ``colorSize`` table that contains this SKU, that entry's
    values (everything except ``skuId``) are appended in reverse order.

    Relies on the module-level ``f_csv`` writer and ``result_csv`` file
    object, which the ``__main__`` section creates before crawling starts.

    Args:
        goods_id: SKU id used in the item URL and in the price/comment lookups.
    """
    print(f'goods_id: {goods_id}')
    # Random pause between item requests.
    time.sleep(random.uniform(1, 3))
    url = f'https://item.jd.com/{goods_id}.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    html_tree = etree.HTML(response.text)
    # Defensive: skip the XPath lookup if parsing produced no root element.
    if html_tree is None:
        title = ''
    else:
        title = ''.join(html_tree.xpath('//div[@class="sku-name"]/text()')).strip()
    price = get_price(goods_id)
    price_all = get_price_all(goods_id)
    comment_num = get_comment_num(goods_id)
    one_data = [title, goods_id, price, price_all, comment_num]

    # The colorSize table is inline JavaScript in the page; it may be missing
    # or malformed, in which case the row is written without attributes.
    color_size_list = []
    matches = re.findall('colorSize:(.*?),        warestatus', response.text)
    if matches:
        try:
            color_size_list = json.loads(matches[0])
        except ValueError:
            color_size_list = []
    if color_size_list:
        color_size_dict = {str(i.get('skuId')): i for i in color_size_list}
        # Keys are stringified above, so look up with a string as well.
        color_size = color_size_dict.get(str(goods_id))
        if color_size is not None:
            value_list = [value for key, value in color_size.items() if key != 'skuId']
            value_list.reverse()
            one_data += value_list
    print(one_data)
    f_csv.writerow(one_data)
    # Flush after every row so partial results survive an interrupted run.
    result_csv.flush()


def get_comment_num(goods_id):
    """Return the comment-count string JD reports for a SKU.

    Queries the ``productPageComments.action`` endpoint and reads
    ``productCommentSummary.commentCountStr`` from the JSON reply.

    Args:
        goods_id: SKU id, sent as the ``productId`` query parameter.

    Returns:
        The ``commentCountStr`` value, or ``0`` when the reply is not valid
        JSON or does not have the expected structure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    params = (
        ('productId', str(goods_id)),
        ('score', '0'),
        ('sortType', '5'),
        ('page', '0'),
        ('pageSize', '10'),
        ('isShadowSku', '0'),
        ('fold', '1'),
    )
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(
        'https://club.jd.com/comment/productPageComments.action',
        headers=headers,
        params=params,
        timeout=10,
    )
    try:
        return response.json()['productCommentSummary']['commentCountStr']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        # Catching only these lets KeyboardInterrupt/SystemExit propagate.
        return 0


def _fetch_price_info(goods_id):
    """Request the price record for one SKU and return it as a dict.

    The endpoint replies with a JSON list; its first element is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(
        f'https://p.3.cn/prices/mgets?skuIds={goods_id}',
        headers=headers,
        timeout=10,
    )
    return response.json()[0]


def get_price(goods_id):
    """Return the ``p`` field of the SKU's price record as a float.

    NOTE(review): ``p`` is presumably the current selling price - confirm
    against the endpoint's behavior.
    """
    return float(_fetch_price_info(goods_id)['p'])


def get_price_all(goods_id):
    """Return the ``op`` field of the SKU's price record as a float.

    NOTE(review): ``op`` is presumably the original/list price - confirm
    against the endpoint's behavior.
    """
    return float(_fetch_price_info(goods_id)['op'])


def spider():
    """Crawl the shop's product-list module and process every SKU found.

    For each list page the module HTML is requested (retrying while no SKU
    ids come back, up to a fixed number of attempts), the numeric id is
    taken from each ``sid`` attribute, and ``parse_detail`` is called once
    per distinct id.
    """
    # NOTE(review): the cookie below is a hard-coded browser session captured
    # at authoring time; consider loading it from configuration instead.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'shshshfpa=d03ee0fc-3606-cd7a-c5e9-86f178064d93-1608791650; shshshfpb=oMji7R53J6TmYGrW4Krr0xg%3D%3D; __jdu=16087916292931993003218; pinId=Z5XlXXgHuFoX88t6pp47POfvBnT67rLt343XDId8yeg; unick=%E5%93%88%E8%A1%8C%E5%93%88%E5%B0%94%E6%BB%A8%E7%A7%91%E6%8A%80%E5%8A%9E%E5%85%AC; TrackID=1QC5TqNfz3VjZaUtU3xANy8aVdbp65E2sur2uj8c51oU6H-sou--J6Lx8Q-3SbyYDJ48y4cy6ISpRecoDvG609_Sy-KHWsHlXI8OGomk7mQqoQNZy85X8w4c8wLtq105R; areaId=1; user-key=896e2d98-0d88-4a92-b3b3-59e8d352d656; ipLoc-djd=1-2901-55548-0; unpl=V2_ZzNtbUBWExx8ChEGfhFYBWJUEwhLVEIWcwBHB3scDARuAhdeclRCFnUUR1NnGFwUZAMZXEdcQB1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXQxhCxJaQV9AHXcITlx5GF0HZAMbbXJQcyVFDEJXcxFcNWYzE20AAx8QdQlAUHNUXAJmChRVQlBAHXYARFRzEV4EZgERXUtnQiV2; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_30a883fc49514f0d8202790b14a08042|1622552673200; __jda=122270672.16087916292931993003218.1608791629.1622464495.1622552673.38; __jdc=122270672; shshshfp=1523c777125a6e3072ec238127d98f2b; __jdb=122270672.4.16087916292931993003218|38.1622552673; JSESSIONID=EBFCCB2576B024064BDD1ABF9C0FC0F5.s1; 3AB9D23F7A4B3C9B=6QGO3GYIOXC6KMODS2E3OHSWQETFW5GZZYOAWQCN2QNAFOL67WV4EZL6LDMQ3D4D5WPAOT6EHQNMTWRYSFA2U4TUJA; shshshsID=9b37bb97ca9f98d0429a1b9edfa3c20e_5_1622552834495',
        'referer': 'https://mall.jd.com/',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Upper bound on retries per page so an always-empty reply cannot hang
    # the crawl forever.
    max_attempts = 10
    # SKU ids already handed to parse_detail; a set gives O(1) membership.
    crawled_goods_ids = set()
    for page in range(1, 2):
        print(f'page: {page}')
        goods_id_list = []
        for _ in range(max_attempts):
            time.sleep(random.uniform(1, 3))
            response = requests.get(f'https://module-jshop.jd.com/module/getModuleHtml.html?appId=996152&orderBy=0&pageNo={page}&direction=1&shopId=776475&categoryId=0&pageSize=60&venderId=780241&maxPrice=0&pagePrototypeId=17&minPrice=0&pageInstanceId=102098631&moduleInstanceId=102783750&prototypeId=34172&templateId=792077&layoutInstanceId=102783750&origin=0&callback=jshop_module_render_callback&_=1622616795609', headers=headers, timeout=10)
            html_tree = etree.HTML(response.text)
            # Defensive: treat a reply with no parsable root as "no goods".
            if html_tree is not None:
                goods_id_list = html_tree.xpath('//ul/li/div/div[2]/div/ul/li/@sid')
            if goods_id_list:
                break
        else:
            print(f'page {page}: no goods found after {max_attempts} attempts, skipping')
            continue
        for raw_sid in goods_id_list:
            # The sid attribute may carry extra text; keep the first digit run.
            match = re.search(r'\d+', raw_sid)
            if match is None:
                continue
            goods_id = match.group()
            # Compare the id string itself so repeated SKUs are really skipped.
            if goods_id in crawled_goods_ids:
                continue
            crawled_goods_ids.add(goods_id)
            parse_detail(goods_id)


if __name__ == '__main__':
    # Results go to a CSV named after the start time. ``result_csv`` and
    # ``f_csv`` are module-level names that parse_detail() writes through.
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    # The context manager closes the file even if the crawl raises midway.
    # utf-8-sig writes a BOM at the start of the file.
    with open(f'{timestamp}.csv', 'w', encoding='utf-8-sig', newline='') as result_csv:
        f_csv = csv.writer(result_csv)
        # Header for the five fixed columns; rows may carry extra attribute
        # columns appended by parse_detail().
        f_csv.writerow(['标题','sku', '价格', '总价', '评论数'])
        spider()
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值