import csv
import json
import random
import re
import time
import requests
from lxml import etree
def _extract_sku_attributes(page_text, goods_id):
    """Return the extra attribute values for *goods_id* from a product page.

    The page source is searched for the embedded ``colorSize: [...]`` JSON
    array.  The entry whose ``skuId`` equals *goods_id* is selected, its
    ``skuId`` key is dropped, and the remaining values are returned in
    reverse order.

    Args:
        page_text: Raw HTML/JS text of the product detail page.
        goods_id: SKU identifier; compared as a string.

    Returns:
        A list of attribute values, or an empty list when the page carries
        no ``colorSize`` data, the data is not valid JSON, or no entry
        matches *goods_id*.
    """
    matches = re.findall('colorSize:(.*?), warestatus', page_text)
    if not matches:
        return []
    try:
        color_size_list = json.loads(matches[0])
    except ValueError:
        # The captured text was not valid JSON; treat it as "no attributes".
        return []
    if not color_size_list:
        return []
    color_size_dict = {str(item['skuId']): item for item in color_size_list}
    color_size = color_size_dict.get(str(goods_id))
    if color_size is None:
        return []
    # Build a new list instead of deleting 'skuId' from the parsed entry.
    values = [value for key, value in color_size.items() if key != 'skuId']
    values.reverse()
    return values


def parse_detail(goods_id):
    """Scrape one product detail page and append its row to the CSV output.

    Sleeps a random 1-3 seconds, downloads
    ``https://item.jd.com/<goods_id>.html``, reads the title from the
    ``sku-name`` div, looks up price, "price_all" and comment count through
    the sibling helper functions, and appends any ``colorSize`` attribute
    values found in the page.  The row is printed, written with the
    module-level ``f_csv`` writer, and ``result_csv`` is flushed so each
    product is persisted immediately.

    Args:
        goods_id: SKU identifier of the product to scrape.

    Raises:
        requests.exceptions.RequestException: If the page request fails or
            exceeds the 10 second timeout.
    """
    print(f'goods_id: {goods_id}')
    # Random pause between products to throttle the request rate.
    time.sleep(random.uniform(1, 3))
    url = f'https://item.jd.com/{goods_id}.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    html_tree = etree.HTML(response.text)
    title = ''.join(html_tree.xpath('//div[@class="sku-name"]/text()')).strip()
    price = get_price(goods_id)
    price_all = get_price_all(goods_id)
    comment_num = get_comment_num(goods_id)
    one_data = [title, goods_id, price, price_all, comment_num]
    one_data += _extract_sku_attributes(response.text, goods_id)
    print(one_data)
    f_csv.writerow(one_data)
    result_csv.flush()
def get_comment_num(goods_id):
    """Return the comment-count string for a product, or 0 if unavailable.

    Queries ``club.jd.com/comment/productPageComments.action`` for
    *goods_id* and reads ``productCommentSummary.commentCountStr`` from the
    JSON response.

    Args:
        goods_id: SKU identifier; sent as the ``productId`` query parameter.

    Returns:
        The ``commentCountStr`` value from the response.  When the body is
        not JSON or lacks the expected keys, the integer ``0`` is returned
        instead (note the differing type).

    Raises:
        requests.exceptions.RequestException: If the request itself fails or
            exceeds the 10 second timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    params = (
        ('productId', str(goods_id)),
        ('score', '0'),
        ('sortType', '5'),
        ('page', '0'),
        ('pageSize', '10'),
        ('isShadowSku', '0'),
        ('fold', '1'),
    )
    response = requests.get(
        'https://club.jd.com/comment/productPageComments.action',
        headers=headers,
        params=params,
        timeout=10,
    )
    try:
        return response.json()['productCommentSummary']['commentCountStr']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: expected key missing;
        # TypeError: payload is not a mapping.  A bare ``except`` would also
        # have swallowed KeyboardInterrupt and SystemExit.
        return 0
def _fetch_price_field(goods_id, field):
    """Fetch one numeric field from the price endpoint for *goods_id*.

    Requests ``https://p.3.cn/prices/mgets?skuIds=<goods_id>`` and converts
    the value stored under *field* in the first element of the JSON array
    to ``float``.

    Args:
        goods_id: SKU identifier interpolated into the query string.
        field: Key to read from the first price record (e.g. ``'p'``).

    Returns:
        The field value as a float.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds the 10 second timeout.
        ValueError, KeyError, IndexError, TypeError: If the response is not
            the expected JSON array of price records.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'Accept': '*/*',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    response = requests.get(
        f'https://p.3.cn/prices/mgets?skuIds={goods_id}',
        headers=headers,
        timeout=10,
    )
    return float(response.json()[0][field])


def get_price(goods_id):
    """Return the ``'p'`` field of the price record for *goods_id* as a float.

    NOTE(review): ``'p'`` is presumably the current selling price; confirm
    against the endpoint's behavior.
    """
    return _fetch_price_field(goods_id, 'p')


def get_price_all(goods_id):
    """Return the ``'op'`` field of the price record for *goods_id* as a float.

    NOTE(review): ``'op'`` is presumably the original/list price; confirm
    against the endpoint's behavior.
    """
    return _fetch_price_field(goods_id, 'op')
def spider():
    """Crawl the shop's product listing and scrape every product once.

    For each listing page, the jshop module endpoint is requested (after a
    random 1-3 second pause) until the response yields at least one ``sid``
    attribute.  The first run of digits in each ``sid`` is taken as the SKU
    id; ids not seen before are passed to ``parse_detail``.

    Raises:
        requests.exceptions.RequestException: If a listing request fails or
            exceeds the 10 second timeout.
    """
    # NOTE(review): the cookie below embeds session identifiers captured from
    # a browser; consider loading it from configuration rather than source.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'shshshfpa=d03ee0fc-3606-cd7a-c5e9-86f178064d93-1608791650; shshshfpb=oMji7R53J6TmYGrW4Krr0xg%3D%3D; __jdu=16087916292931993003218; pinId=Z5XlXXgHuFoX88t6pp47POfvBnT67rLt343XDId8yeg; unick=%E5%93%88%E8%A1%8C%E5%93%88%E5%B0%94%E6%BB%A8%E7%A7%91%E6%8A%80%E5%8A%9E%E5%85%AC; TrackID=1QC5TqNfz3VjZaUtU3xANy8aVdbp65E2sur2uj8c51oU6H-sou--J6Lx8Q-3SbyYDJ48y4cy6ISpRecoDvG609_Sy-KHWsHlXI8OGomk7mQqoQNZy85X8w4c8wLtq105R; areaId=1; user-key=896e2d98-0d88-4a92-b3b3-59e8d352d656; ipLoc-djd=1-2901-55548-0; unpl=V2_ZzNtbUBWExx8ChEGfhFYBWJUEwhLVEIWcwBHB3scDARuAhdeclRCFnUUR1NnGFwUZAMZXEdcQB1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXQxhCxJaQV9AHXcITlx5GF0HZAMbbXJQcyVFDEJXcxFcNWYzE20AAx8QdQlAUHNUXAJmChRVQlBAHXYARFRzEV4EZgERXUtnQiV2; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_30a883fc49514f0d8202790b14a08042|1622552673200; __jda=122270672.16087916292931993003218.1608791629.1622464495.1622552673.38; __jdc=122270672; shshshfp=1523c777125a6e3072ec238127d98f2b; __jdb=122270672.4.16087916292931993003218|38.1622552673; JSESSIONID=EBFCCB2576B024064BDD1ABF9C0FC0F5.s1; 3AB9D23F7A4B3C9B=6QGO3GYIOXC6KMODS2E3OHSWQETFW5GZZYOAWQCN2QNAFOL67WV4EZL6LDMQ3D4D5WPAOT6EHQNMTWRYSFA2U4TUJA; shshshsID=9b37bb97ca9f98d0429a1b9edfa3c20e_5_1622552834495',
        'referer': 'https://mall.jd.com/',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # A set gives O(1) membership tests for the "already scraped" check.
    crawled_goods_ids = set()
    for page in range(1, 2):
        print(f'page: {page}')
        # The page number is interpolated so widening the range above really
        # walks through the listing instead of re-fetching page 1.
        list_url = f'https://module-jshop.jd.com/module/getModuleHtml.html?appId=996152&orderBy=0&pageNo={page}&direction=1&shopId=776475&categoryId=0&pageSize=60&venderId=780241&maxPrice=0&pagePrototypeId=17&minPrice=0&pageInstanceId=102098631&moduleInstanceId=102783750&prototypeId=34172&templateId=792077&layoutInstanceId=102783750&origin=0&callback=jshop_module_render_callback&_=1622616795609'
        # NOTE(review): this retries without limit until the listing returns
        # at least one product; a page that is genuinely empty never exits.
        while True:
            time.sleep(random.uniform(1, 3))
            response = requests.get(list_url, headers=headers, timeout=10)
            html_tree = etree.HTML(response.text)
            goods_id_list = html_tree.xpath('//ul/li/div/div[2]/div/ul/li/@sid')
            if goods_id_list:
                break
        for raw_sid in goods_id_list:
            match = re.search(r'\d+', raw_sid)
            if match is None:
                # No digits in this sid attribute, so there is no SKU id.
                continue
            goods_id = match.group()
            # Compare the extracted id itself; comparing the findall() list
            # against stored strings never matched, so nothing was deduped.
            if goods_id in crawled_goods_ids:
                continue
            crawled_goods_ids.add(goods_id)
            parse_detail(goods_id)
if __name__ == '__main__':
    # Script entry point: create a timestamped CSV, write the header row and
    # run the crawl.  ``result_csv`` and ``f_csv`` stay module-level names
    # because parse_detail() writes rows through them.
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    # The context manager closes the file even if spider() raises.
    # 'utf-8-sig' prefixes the file with a BOM (presumably so spreadsheet
    # tools detect the encoding of the Chinese text).
    with open(f'{timestamp}.csv', 'w', encoding='utf-8-sig', newline='') as result_csv:
        f_csv = csv.writer(result_csv)
        # Header columns: title, sku, price, total price, comment count.
        f_csv.writerow(['标题','sku', '价格', '总价', '评论数'])
        spider()
04-12
07-08
“相关推荐”对你有帮助么?
-
非常没帮助
-
没帮助
-
一般
-
有帮助
-
非常有帮助
提交