今天做了一个京东商品价格的需求,整理一下。
第一步:打开Chrome浏览器自带抓包工具,选择network选项卡
第二步:按下Ctrl+F5强制刷新页面,打开Search面板,在里面输入商品价格(例如图中输入的是1318.00),然后回车,就会列出响应内容中包含该价格的接口。
分析该接口的响应,可以看到价格位于'price'下面的'p'字段中。
第三步:分析请求方式,该接口为GET请求。
第四步:分析请求url,发现url形式为:https://item-soa.jd.com/getWareBusiness?skuId= ,skuId后面拼接的就是商品id,因此只需要获取到商品id,即可请求到价格。
第五步,商品id一般包含在商品的url中,可通过正则表达式,提取出商品id,构造商品价格的url
示例如下:
url为:'https://item.jd.com/5915827.html'
skuid = re.findall(r'/(\d+?)\.', url)[0]
re.findall返回的是列表,取第一个元素即可获取到商品ID(此例中为'5915827')。
思路为上文所述,下文附上代码,感兴趣可以看下:
import json
import re
import time
import pandas as pd
from lxml import etree
import numpy as np
import requests
import random
class JDSpider(object):
    """Look up JD.com product prices for the links listed in an Excel sheet.

    The spider reads product links from a spreadsheet, extracts the numeric
    product id from each link, queries the price endpoint for that id and
    writes the links together with a new price column to another spreadsheet.

    Rows whose price cannot be obtained (bad link, failed request, unexpected
    response) get the sentinel value ``PRICE_UNAVAILABLE`` instead of aborting
    the whole run.
    """

    # Sentinel written to the price column when no price could be obtained.
    PRICE_UNAVAILABLE = '-1.00'

    def __init__(self, input_path='../待处理文件/京东.xlsx',
                 sheet_name='京东链接',
                 output_path='../待处理文件/京东价格.xlsx'):
        """Configure the endpoint, request headers and file locations.

        Args:
            input_path: Excel workbook containing the product links.
            sheet_name: Sheet inside ``input_path`` to read.
            output_path: Excel workbook the results are written to.
        """
        # Price endpoint; ``{}`` is replaced by the product (sku) id.
        self.url_temp = 'https://item-soa.jd.com/getWareBusiness?skuId={}'
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
        }
        self.input_path = input_path
        self.sheet_name = sheet_name
        self.output_path = output_path

    @staticmethod
    def _extract_sku_id(link):
        """Return the product id embedded in ``link``, or None if there is none.

        The id is the first run of digits that directly follows a ``/`` and is
        directly followed by a ``.``, e.g. ``5915827`` in
        ``https://item.jd.com/5915827.html``.
        """
        # Spreadsheet cells may be empty (NaN) or numeric rather than text.
        if not isinstance(link, str):
            return None
        match = re.search(r'/(\d+?)\.', link)
        return match.group(1) if match else None

    def get_url_list(self):
        """Read the configured sheet and return it as a DataFrame."""
        return pd.read_excel(self.input_path, sheet_name=self.sheet_name)

    def parse(self, url, timeout=10):
        """Fetch ``url`` and return the decoded response body.

        Args:
            url: Address to request with GET.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the run.

        Returns:
            The response body as text, or None when the request fails or the
            body cannot be decoded.
        """
        # Pause before every request (presumably to avoid hammering the site).
        time.sleep(0.4)
        try:
            rest = requests.get(url, headers=self.headers, timeout=timeout)
            return rest.content.decode()
        except requests.exceptions.RequestException:
            # Covers proxy, SSL, connection and timeout failures; wait a
            # little longer before the caller moves on to the next url.
            time.sleep(3)
        except ValueError:
            # Body was not decodable text (UnicodeDecodeError is a ValueError).
            pass
        return None

    def get_price(self, str_html):
        """Extract the price from a price-endpoint response.

        Args:
            str_html: JSON text as returned by ``parse`` (may be None).

        Returns:
            The value stored under ``["price"]["p"]``, or
            ``PRICE_UNAVAILABLE`` when the input is empty, is not valid JSON,
            or does not have that structure.
        """
        if not str_html:
            return self.PRICE_UNAVAILABLE
        try:
            return json.loads(str_html)["price"]["p"]
        except (ValueError, KeyError, TypeError):
            # ValueError: invalid JSON; KeyError/TypeError: unexpected shape.
            return self.PRICE_UNAVAILABLE

    def save(self, df, price_list):
        """Add ``price_list`` as the price column of ``df`` and write it out.

        A ValueError (for example a length mismatch between ``df`` and
        ``price_list``) is reported instead of being raised.
        """
        try:
            df['价格'] = price_list
            df.to_excel(self.output_path, index=False)
        except ValueError as exc:
            # Report the failure rather than losing the results silently.
            print('Failed to save prices: {}'.format(exc))

    def run(self):
        """Run the full pipeline: read links, fetch prices, save the result."""
        df = self.get_url_list()
        price_list = []
        for link in df['链接']:
            sku_id = self._extract_sku_id(link)
            if sku_id is None:
                # No product id in this row, so there is nothing to request.
                price = self.PRICE_UNAVAILABLE
            else:
                rest = self.parse(self.url_temp.format(sku_id))
                price = self.get_price(rest)
            print(price)
            price_list.append(price)
        self.save(df, price_list)
if __name__ == '__main__':
    # Script entry point: build a spider and run the whole pipeline.
    JDSpider().run()