Python 抓取淘宝商品评论最新版

Python 抓取淘宝商品评论最新思路

import json
import  re
from lxml import etree
import pandas as pd
import time
import xlrd
import csv
import xlwt
import jsonpath


def loads_jsonp(jsonp):
    """Parse a JSONP response string into a Python object.

    The payload is taken to be the span from the first ``{`` to the last
    ``}`` in *jsonp*; whatever surrounds it (callback name, parentheses,
    trailing semicolon, whitespace) is discarded before decoding.

    :param jsonp: JSONP text such as ``callback({...})``.
    :return: the object decoded by ``json.loads`` from the extracted span.
    :raises ValueError: with message ``'Invalid Input'`` if no ``{...}`` span
        is found, the span is not valid JSON, or *jsonp* is not a string.
        The underlying error is attached as ``__cause__``.
    """
    try:
        # Lazy prefix + greedy group: start at the first "{", end at the
        # last "}". re.S lets "." cross newlines in multi-line responses.
        payload = re.match(r".*?({.*}).*", jsonp, re.S).group(1)
        return json.loads(payload)
    except (AttributeError, TypeError, ValueError) as err:
        # AttributeError: no match, so .group() was called on None.
        # TypeError: jsonp is not a str.
        # ValueError: covers json.JSONDecodeError.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        raise ValueError('Invalid Input') from err


def get_content(id_list,sellerid_list):
    """Download review pages for each item and append them to per-item CSV files.

    For every (item id, seller id) pair, two pages (``currentPage`` 0 and 1)
    are requested from ``list_detail_rate.htm``. Each JSONP response is
    decoded with ``loads_jsonp`` and the ``auctionSku``, ``rateDate`` and
    ``rateContent`` values found in it are written as rows to
    ``<item id>.csv``. The file is opened in append mode and the header row
    is written each time the function processes that item.

    A failed request, a non-200 status, or an unparsable response skips that
    page; the function carries on with the next page/item.

    :param id_list: item ids, used for the ``itemId`` query parameter and the
        output file name.
    :param sellerid_list: seller ids, paired positionally with *id_list* and
        used for the ``sellerId`` query parameter.
    :return: None. Output is written to CSV files and progress is printed.
    """
    # Function-scope imports: this function uses both modules, but neither is
    # imported at the top of the file, so without these lines every request
    # attempt fails with NameError.
    import random
    import requests

    # Loop-invariant request settings.
    url = 'https://rate.tmall.com/list_detail_rate.htm'
    # NOTE(review): the cookie below is a hard-coded browser session value; it
    # looks account-specific and will presumably expire - consider loading it
    # from configuration instead of keeping it in source.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'referer': 'https://detail.tmall.com/item.htm',
        'cookie': 'cna=fK/yFfEa3WUCAXGMVkIYqqv7; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E8%80%81%E9%85%92%E4%B8%8E%E5%8F%8B%E7%99%BD; enc=lgHG9OVbedGZ3Xdvmc1TJR92NcN1To9MQqms3vs1O5h8EgGWmUIGuyW3esLsoOPd6G2wOaCMfkka22f6p0er8A%3D%3D; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; uc1=cookie14=UoTbmEp8z6drTg%3D%3D; t=4057ea780bd0c7b3ce45c002c946bb86; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dByua%2Bf8RBacKtvjw%3D&id2=UU8NZ4IcDYgRKw%3D%3D&nk2=o9LSXMikt1Vokw%3D%3D; tracknick=%5Cu8001%5Cu9152%5Cu4E0E%5Cu53CB%5Cu767D; uc4=nk4=0%40oYS4e8JkzVaiSTi9AhAnC8t3qfaA&id4=0%40U22KV%2FhDp9TrrKd7h45sjW1X8R8O; lgc=%5Cu8001%5Cu9152%5Cu4E0E%5Cu53CB%5Cu767D; _tb_token_=7b3f3ee874379; cookie2=1796a9e22d10467716fabfd0fd15debb; x5sec=7b22726174656d616e616765723b32223a223035363934376336376130653533323161653332656134326132646664336436435075766a653846454f7170305a69566862713242413d3d227d; l=dBP5OKreqoQECND8BOfZKurza779qIdf1sPzaNbMiICPO01kq-ZOWZKcSiTDCnGV3s1wR3Jt3efYByTiSyznhZXRFJXn9Mp9SdTeR; isg=BImJ77_bMNsPVsxcEyN8LcgPmLXj1n0IQgB98yv_xnDbcqqEcyWP2arktJbhKhVA',
    }
    row0 = ["手机机型","评论时间","评论内容"]

    for i, (item_id, seller_id) in enumerate(zip(id_list, sellerid_list)):
        print('正在下载第{}部手机'.format(i))
        # One file per item, named after the item currently being processed.
        # (Previously a counter was reset to 0 on every iteration, so every
        # item ended up in the file named after id_list[0].)
        filename = '{}.csv'.format(item_id)
        # utf-8-sig so that Excel opens the file without garbling the text;
        # the with-block guarantees the file is closed.
        with open(filename, 'a', encoding='utf-8-sig', newline='') as fp:
            writer = csv.writer(fp)
            # Write the header row first.
            writer.writerow(row0)
            for page in range(2):
                print('正在下载第{}页'.format(page))
                params = {'itemId': item_id,
                          'currentPage': page,
                          'sellerId': seller_id, }
                # Random delay of up to 20 seconds between requests. Kept
                # outside the try so Ctrl-C during the wait is not swallowed.
                time.sleep(20 * random.random())
                try:
                    r = requests.get(url=url, headers=headers, params=params,
                                     timeout=30)
                except requests.RequestException:
                    print('请求错误')
                    continue
                if r.status_code != 200:
                    continue
                try:
                    content = loads_jsonp(r.text)
                except ValueError:
                    print('解析失败')
                    continue
                auctionSku = jsonpath.jsonpath(content, '$..auctionSku')
                rate_Content = jsonpath.jsonpath(content, '$..rateContent')
                rate_Date = jsonpath.jsonpath(content, '$..rateDate')
                # jsonpath yields a falsy value instead of a list when a path
                # matches nothing; treat that as a parse failure for the page.
                if not (auctionSku and rate_Content and rate_Date):
                    print('解析失败')
                    continue
                # Column order matches row0: model, review date, review text.
                writer.writerows(zip(auctionSku, rate_Date, rate_Content))


# Spreadsheet that lists the items to scrape.
file = '淘宝手机排行表.xls'
# Open the workbook once; `data` is kept as a second name for the same
# object so both module-level names remain available.
workbook = xlrd.open_workbook(file)
data = workbook
sheet1 = data.sheet_by_index(0)  # get the sheet by index
# print(sheet1.name,sheet1.nrows,sheet1.ncols)
# col_values(colx, start_rowx): read column index 3 (item ids) and column
# index 4 (seller ids), starting at row index 85.
id_list = sheet1.col_values(3, 85)
sellerid_list = sheet1.col_values(4, 85)
# Format the two lists for display; adding a list to a str raises TypeError,
# which previously stopped the script before any download started.
print('{}-{}'.format(id_list, sellerid_list))
get_content(id_list,sellerid_list)


  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

不要香菜哈

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值