# python抓取淘宝商品评论最新思路 (Python: a recent approach to scraping Taobao product reviews)
import csv
import json
import random
import re
import time

import jsonpath
import pandas as pd
import requests
import xlrd
import xlwt
from lxml import etree
def loads_jsonp(jsonp):
    """Parse a JSONP response body into the JSON value it wraps.

    The payload is the span from the first ``{`` to the last ``}`` in
    *jsonp*; any text around it (callback name, parentheses, trailing
    semicolon) is ignored.

    :param jsonp: JSONP text such as ``callback({...})``.
    :return: the decoded JSON value.
    :raises ValueError: if *jsonp* is not a string, contains no ``{...}``
        span, or that span is not valid JSON.
    """
    try:
        return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1))
    except (AttributeError, TypeError, ValueError) as err:
        # AttributeError: no match (re.match returned None);
        # TypeError: input is not a str;
        # ValueError: json.JSONDecodeError from an invalid payload.
        raise ValueError('Invalid Input') from err
def get_content(id_list, sellerid_list):
    """Download review pages for each item and append them to per-item CSV files.

    For every (item id, seller id) pair, ``currentPage`` 0 and 1 are requested
    from ``https://rate.tmall.com/list_detail_rate.htm``. Each review found in
    the response is written as an ``[auctionSku, rateDate, rateContent]`` row
    to ``<item id>.csv`` in the current working directory. The file is opened
    in append mode and a header row is written each time it is opened.

    Request and parse failures are reported on stdout and the page is
    skipped; they do not abort the run.

    :param id_list: item ids; each is sent as ``itemId`` and names the output
        CSV file.
    :param sellerid_list: seller ids, parallel to ``id_list``; each is sent
        as ``sellerId``.
    :return: None.
    """
    url = 'https://rate.tmall.com/list_detail_rate.htm'
    # NOTE(review): a session cookie is hard-coded below; it should be loaded
    # from configuration rather than kept in source. Left unchanged here.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'referer': 'https://detail.tmall.com/item.htm',
        'cookie': 'cna=fK/yFfEa3WUCAXGMVkIYqqv7; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E8%80%81%E9%85%92%E4%B8%8E%E5%8F%8B%E7%99%BD; enc=lgHG9OVbedGZ3Xdvmc1TJR92NcN1To9MQqms3vs1O5h8EgGWmUIGuyW3esLsoOPd6G2wOaCMfkka22f6p0er8A%3D%3D; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; uc1=cookie14=UoTbmEp8z6drTg%3D%3D; t=4057ea780bd0c7b3ce45c002c946bb86; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dByua%2Bf8RBacKtvjw%3D&id2=UU8NZ4IcDYgRKw%3D%3D&nk2=o9LSXMikt1Vokw%3D%3D; tracknick=%5Cu8001%5Cu9152%5Cu4E0E%5Cu53CB%5Cu767D; uc4=nk4=0%40oYS4e8JkzVaiSTi9AhAnC8t3qfaA&id4=0%40U22KV%2FhDp9TrrKd7h45sjW1X8R8O; lgc=%5Cu8001%5Cu9152%5Cu4E0E%5Cu53CB%5Cu767D; _tb_token_=7b3f3ee874379; cookie2=1796a9e22d10467716fabfd0fd15debb; x5sec=7b22726174656d616e616765723b32223a223035363934376336376130653533323161653332656134326132646664336436435075766a653846454f7170305a69566862713242413d3d227d; l=dBP5OKreqoQECND8BOfZKurza779qIdf1sPzaNbMiICPO01kq-ZOWZKcSiTDCnGV3s1wR3Jt3efYByTiSyznhZXRFJXn9Mp9SdTeR; isg=BImJ77_bMNsPVsxcEyN8LcgPmLXj1n0IQgB98yv_xnDbcqqEcyWP2arktJbhKhVA',
    }
    row0 = ["手机机型", "评论时间", "评论内容"]

    for i, (item_id, seller_id) in enumerate(zip(id_list, sellerid_list)):
        print('正在下载第{}部手机'.format(i))
        # Name the file after the current item. The previous code reset its
        # counter on every iteration and so always used id_list[0].
        filename = '{}.csv'.format(item_id)
        # utf-8-sig writes a BOM so Excel opens the file without mojibake.
        # The context manager closes the file even if a write fails.
        with open(filename, 'a', encoding='utf-8-sig', newline='') as fp:
            writer = csv.writer(fp)
            # Write the header row.
            writer.writerow(row0)
            for page in range(2):
                print('正在下载第{}页'.format(page))
                params = {
                    'itemId': item_id,
                    'currentPage': page,
                    'sellerId': seller_id,
                }
                # Random pause of up to 20 seconds between requests.
                time.sleep(20 * random.random())
                try:
                    # A timeout keeps one stalled connection from hanging
                    # the whole run.
                    r = requests.get(url=url, headers=headers,
                                     params=params, timeout=30)
                except requests.RequestException:
                    print('请求错误')
                    continue
                if r.status_code != 200:
                    print('请求错误')
                    continue

                try:
                    content = loads_jsonp(r.text)
                except ValueError:
                    print('解析失败')
                    continue

                auction_sku = jsonpath.jsonpath(content, '$..auctionSku')
                rate_date = jsonpath.jsonpath(content, '$..rateDate')
                rate_content = jsonpath.jsonpath(content, '$..rateContent')
                # jsonpath() returns False when a path matches nothing.
                if not (auction_sku and rate_date and rate_content):
                    print('解析失败')
                    continue
                for comment in zip(auction_sku, rate_date, rate_content):
                    writer.writerow(comment)
# Read the item ids and seller ids from the ranking workbook, then download
# the reviews for each item.
file = '淘宝手机排行表.xls'
# Open the workbook once; both original names stay bound to it.
workbook = xlrd.open_workbook(file)
data = workbook
sheet1 = data.sheet_by_index(0)  # select the sheet by index
# col_values(colx, start_rowx): values of column index 3 (item ids) and
# column index 4 (seller ids), starting at row index 85.
# NOTE(review): the start row 85 is hard-coded; confirm it is still intended.
id_list = sheet1.col_values(3, 85)
sellerid_list = sheet1.col_values(4, 85)
# Both values are lists, so format them; list + '-' raises TypeError.
print('{}-{}'.format(id_list, sellerid_list))
get_content(id_list, sellerid_list)