# 提前准备参数 (prepare the request parameters in advance)
import re
import requests
import random
import time
import pandas as pd
# Module-level accumulator: get_item() appends one
# [user_name, rate_date, item_type, rate_content] row per scraped review and
# builds the output DataFrame from this list.
df = []
# HTTP request headers sent with every page request.
# Per the original author's note, these values are copied from the browser's
# "list_detail" network request (the note refers to a screenshot that is not
# part of this file).
headers = {
# NOTE(review): the cookie is an empty string here - presumably a logged-in
# session cookie has to be pasted in before running; confirm.
'cookie': '',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62',
# NOTE(review): this value contains a literal space ("...ADAP26 uLU46qu...")
# which may be a copy/paste artifact, and its id= (539339147058) differs from
# the itemId that get_html() sends (556732726926) - confirm both.
'referer': 'https://detail.tmall.com/item.htm?id=539339147058&price=99&sourceType=item&sourceType=item&suid=e9ca3123-b90d-4ed5-b31f-08f696654755&ut_sk=1.XVltFfIJHZADAP26 uLU46qu_21646297_1646213190055.Copy.ShareGlobalNavigation_1&un=bf83687cd45b2be19a2d98b7feb65231&share_crt_v=1&un_site=0&spm=a2159r.13376460.0.0&tbSocialPopKey=shareItem&sp_tk=Q0c0ZTI0cnRhZHM=&cpp=1&shareurl=true&short_name=h.fMQsL31&bxsign=scdLssu8BjsPSMY422ksv9WiDQBjg3Ih_OKP8X8cJgTl5W3pRkm74cPZeKXptUmWhTSFuRQNlnWc7SUD1w2I3VdhjkFDjEAGAre3x1CMDXKBsh4MyL5hqQLpo6LhUFDB22U&sm=3d25d1?tk=CG4e24rtads&app=chrome&skuId=4571541867149',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}
# Endpoint that get_item() passes to get_html() for every page.
url = 'https://rate.tmall.com/list_detail_rate.htm'
def get_html(url, header, page, timeout=10):
    """Fetch one page of review data and return the raw response text.

    Builds the query parameters (item/SPU/seller ids, ordering flags, the
    requested page and a timestamp-derived ``callback`` / ``_ksTS`` pair) and
    issues a single GET request.

    Args:
        url: Endpoint to request.
        header: Mapping of HTTP headers handed to ``requests.get``.
        page: Value sent as the ``currentPage`` query parameter.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without it a stalled connection would block the crawl forever.

    Returns:
        The response body as ``str`` when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (this
            includes timeouts).
    """
    # Format with a fixed 7 fractional digits. ``str(time.time())`` drops
    # trailing zeros, so slicing past the third fractional digit could yield
    # an empty string and ``int("")`` would raise ValueError.
    seconds, fraction = f"{time.time():.7f}".split(".")
    millis, remainder = fraction[:3], fraction[3:]

    # Per the original author's note, these parameters mirror the ones seen
    # in the browser's request URL.
    params = {
        'itemId': 556732726926,
        'spuId': 866309580,
        'sellerId': 725677994,
        'order': 3,
        'currentPage': page,
        'append': 0,
        'content': 1,
        # Callback suffix is the sub-millisecond digits plus one.
        "callback": 'jsonp' + str(int(remainder) + 1),
        # "<epoch milliseconds>_<sub-millisecond digits>".
        "_ksTS": seconds + millis + "_" + remainder,
    }

    response = requests.get(url, headers=header, params=params, timeout=timeout)
    if response.status_code != 200:
        # Explicit "no data" result for non-200 responses; callers check for None.
        return None
    return response.text
# Patterns for the four scraped fields, in output column order:
# user_name, rate_date, item_type, rate_content.
_REVIEW_FIELD_PATTERNS = (
    re.compile('"displayUserNick":"(.*?)"'),
    re.compile('"rateDate":"(.*?)"'),
    re.compile('"auctionSku":"(.*?)"'),
    re.compile('"rateContent":"(.*?)"'),
)


def _parse_reviews(text):
    """Return ``[user_name, rate_date, item_type, rate_content]`` rows found in *text*."""
    columns = [pattern.findall(text) for pattern in _REVIEW_FIELD_PATTERNS]
    counts = [len(column) for column in columns]
    if len(set(counts)) > 1:
        # The fields are matched independently, so unequal counts mean the
        # rows of this page cannot be paired up reliably.
        print(f"warning: field counts differ on this page {counts}; "
              "rows are truncated to the shortest field and may be misaligned")
    # zip() stops at the shortest column, so a missing field can no longer
    # raise IndexError or shift the rows of later pages.
    return [list(row) for row in zip(*columns)]


def get_item(num):
    """Scrape review pages and write the collected rows to ``taobao_items.csv``.

    Pages ``1 .. num - 1`` are requested through ``get_html`` using the
    module-level ``url`` and ``headers``. A page that fails (exception or a
    ``None`` result) is reported and skipped; the crawl continues with the
    next page.

    The parsed rows are appended to the module-level ``df`` list, and the CSV
    is built from everything ``df`` holds, so rows from earlier calls in the
    same process are written again.

    Args:
        num: Exclusive upper bound of the page range; ``num = 5`` requests
            pages 1 to 4.
            NOTE(review): if ``num`` is meant to be the page count, the range
            should end at ``num + 1`` - confirm before changing.
    """
    rows = []
    for page in range(1, num):
        try:
            text = get_html(url, headers, page)
        except Exception as exc:
            # Best-effort crawl: report the cause and move on. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt / SystemExit through
            # so the script can still be stopped with Ctrl-C.
            print("Nothing you catch")
            print(f"page {page} failed: {exc!r}")
            continue
        if text is None:
            # get_html returns None for non-200 responses.
            print("Nothing you catch")
            continue

        rows.extend(_parse_reviews(text))
        print(f"第{page}页爬取完毕")
        # Random pause between successful requests.
        time.sleep(random.randint(3, 9))

    df.extend(rows)
    df1 = pd.DataFrame(df, columns=['user_name', 'rate_date', 'item_type', 'rate_content'])
    df1.to_csv('taobao_items.csv', index=False, encoding='utf-8')
if __name__ == "__main__":
    # `num` bounds the crawl: get_item() iterates range(1, num), so a value
    # of 5 requests pages 1 through 4.
    num = 5
    get_item(num)