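# The code is simple; it was written quickly because a friend needed it.
# It only targets Tmall; other sites have not been tested.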
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name : Tianmao.py
date : 19-1-23
Author : Hebel
-------------------------------------------------
Description:
note:
-------------------------------------------------
"""
import requests
import re
import json
from urllib import parse
# Captures the JSON payload wrapped by the JSONP callback, e.g. ``jsonp128({...})``.
_JSONP_PAYLOAD_RE = re.compile(r'jsonp\d+\((.*)\)')
# Page size used by the page-count arithmetic below.
_COMMENTS_PER_PAGE = 20
# Value returned (instead of a list) when no comment could be collected.
_COMMENTS_FAILURE_MESSAGE = "获取评论失败"


def _parse_rate_detail(html):
    """Return the ``rateDetail`` dict embedded in a JSONP body, or None if absent."""
    match = _JSONP_PAYLOAD_RE.search(html)
    if match is None:
        return None
    payload = json.loads(match.group(1))
    rate_detail = payload.get("rateDetail") if isinstance(payload, dict) else None
    return rate_detail if isinstance(rate_detail, dict) else None


def collect_tianmao_goods_comments(goods_url ,cookies, fied_page_number=0):
    """
    Collect the buyer comments of a Tmall product.

    The product id (``id``) and seller id (``user_id``) are read from the
    query string of ``goods_url``; when ``user_id`` is missing, the Tmall
    supermarket seller id ``725677994`` is used.  The first page is fetched
    to learn the total number of comments, then the remaining pages are
    fetched one by one.

    :param goods_url: product detail URL; must carry an ``id`` query parameter
    :param cookies: dict of cookies of a logged-in session; must contain ``x5sec``
    :param fied_page_number: maximum number of pages to fetch.  A value
        ``<= 0`` (the default) fetches every page; a value larger than the
        number of available pages is clamped to it.
    :return: list of the ``rateList`` entries of all fetched pages, or the
        string ``"获取评论失败"`` when nothing could be collected
    :raises AssertionError: ``cookies`` is not a dict, or ``goods_url`` has no ``id``
    :raises ValueError: ``cookies`` has no ``x5sec`` entry, or a response
        carries a JSONP payload that is not valid JSON
    :raises IOError: the first request does not answer with HTTP 200
    """
    # Raised explicitly (not via ``assert``) so the checks survive ``python -O``
    # while keeping the exception type existing callers may rely on.
    if not isinstance(cookies, dict):
        raise AssertionError("cookies must be a dict")
    if cookies.get("x5sec", None) is None:
        raise ValueError("无效的cookies,缺失 x5sec 值")

    query = parse.parse_qs(parse.urlparse(goods_url).query)
    item_id = query.get("id", [None])[0]  # product id
    seller_id = query.get("user_id", ["725677994"])[0]  # seller id; default is Tmall supermarket
    if not (item_id and seller_id):
        raise AssertionError("goods_url must contain an 'id' query parameter")

    # NOTE: spuId is hard-coded here exactly as in the original request URL.
    url = "https://rate.tmall.com/list_detail_rate.htm?itemId={id}&spuId=385196899&sellerId={user_id}&order=3&currentPage={page_number}"
    start_url = url.format(id=item_id, user_id=seller_id, page_number=1)
    headers = {'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
               'accept-encoding': "gzip, deflate, br",
               'accept-language': "zh-CN,zh;q=0.9",
               'user-agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36"}
    print(start_url)
    resp = requests.get(url=start_url, headers=headers, cookies=cookies, timeout=6)
    if resp.status_code != 200:
        raise IOError("请求失败")

    first_detail = _parse_rate_detail(resp.content.decode("utf8"))
    if first_detail is None:
        # No usable JSONP payload (e.g. an anti-bot or login page was served).
        return _COMMENTS_FAILURE_MESSAGE

    paginator = first_detail.get("paginator") or {}
    items = paginator.get("items") or 0
    print("当前商品评论总数:{count}".format(count=items))
    page_number_all = -(-items // _COMMENTS_PER_PAGE)  # ceiling division
    print("当前总页数",page_number_all)
    if fied_page_number <= 0:
        page_number = page_number_all
    else:
        page_number = min(fied_page_number, page_number_all)

    comments_all_list = []
    for pn in range(1, page_number + 1):
        if pn == 1:
            # Page 1 was already downloaded above; do not request it twice.
            rate_detail = first_detail
        else:
            new_url = url.format(id=item_id, user_id=seller_id, page_number=pn)
            resp = requests.get(url=new_url, headers=headers, cookies=cookies, timeout=6)
            if resp.status_code != 200:
                continue  # best effort: skip pages that fail to load
            rate_detail = _parse_rate_detail(resp.content.decode("utf8"))
            if rate_detail is None:
                continue
        comments_all_list.extend(rate_detail.get("rateList") or [])

    if comments_all_list:
        return comments_all_list
    return _COMMENTS_FAILURE_MESSAGE
if __name__ == '__main__':
    # Product detail URL; its query string supplies the product id and seller id.
    goods_url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.13.17635faanAojEG&id=19903071933&skuId=3629482623721&areaId=440100&user_id=1601196901&cat_id=50076895&is_b=1&rn=59a6f41da133ea26d10f936220f732db"
    # Comments can only be fetched with the cookie of a logged-in account.
    # NOTE(review): this is a hard-coded session token; replace it with your own.
    cookies = {'x5sec': '7b22726174656d616e616765723b32223a226361383530303865386661396433336465363930353433363061313130643265434a4c2f6f4f4946454d65337063542f394e37725a426f4d4d6a55324d446b794d544d774d447378227d'}
    # Upper bound on the number of comment pages to fetch.
    fied_page_number = 7
    comments_data = collect_tianmao_goods_comments(goods_url=goods_url,cookies=cookies, fied_page_number=fied_page_number)
    if isinstance(comments_data, list):
        # Success: report how many comment records were collected.
        print(len(comments_data))
    elif comments_data is not None:
        # The collector reports failure with a message string; show the
        # message itself rather than its length.
        print(comments_data)