# Crawl one category of Taobao products and their reviews (爬取淘宝一类商品及其评论)

# -*- coding: utf-8 -*-
import json
import random
import re
import time

import requests
import scrapy

class PplSpider(scrapy.Spider):
    """Crawl Taobao search results for the keyword "游戏" (games) and,
    for each product found, fetch its reviews from the Tmall rate API.

    Flow: ``start_requests`` issues the request for search page 0;
    ``parse`` extracts products from the ``g_page_config`` JSON embedded
    in the search page, pulls each product's reviews synchronously via
    ``requests``, then schedules the next search page.
    """

    name = 'ppl'
    # allowed_domains = ['www.com']
    start_urls = ['https://www.taobao.com/']

    # Taobao paginates by item offset: the `s` query parameter is
    # `page_index * 44`, not a page number.
    ITEMS_PER_PAGE = 44
    SEARCH_URL = 'https://s.taobao.com/search?q=%E6%B8%B8%E6%88%8F&s='

    # Fill in your own logged-in Taobao cookie string here.
    COOKIE_STRING = '填上你的cookie'

    # Desktop User-Agent pool. The original code referenced an undefined
    # global `User_Agents` (NameError at runtime); the pool lives here now.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 "
        "SLBrowser/6.0.1.8131",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ]

    @staticmethod
    def _parse_cookie_string(raw):
        """Convert a raw ``k=v; k2=v2`` cookie header string to a dict.

        Uses ``split('=', 1)`` so values containing '=' survive intact
        (the original ``split('=')[1]`` truncated them), strips
        whitespace around keys, and skips malformed fragments instead of
        raising IndexError.
        """
        cookies = {}
        for pair in raw.split(';'):
            if '=' not in pair:
                continue
            key, value = pair.split('=', 1)
            cookies[key.strip()] = value
        return cookies

    def _search_request(self, page):
        """Build the search-results Request for 0-based page index *page*."""
        offset = page * self.ITEMS_PER_PAGE
        headers = {
            'User-Agent': random.choice(self.USER_AGENTS),
            'referer': 'https://www.taobao.com/',
        }
        return scrapy.Request(
            self.SEARCH_URL + str(offset),
            callback=self.parse,
            headers=headers,
            cookies=self._parse_cookie_string(self.COOKIE_STRING),
            # Pass the page *index* so parse can increment it. The
            # original stored a dict in meta and later did `page += 1`
            # on that dict (TypeError), then multiplied the already
            # multiplied offset by 44 again.
            meta={'page': page},
        )

    def start_requests(self):
        yield self._search_request(0)

    def parse(self, response):
        """Extract products from one search page, crawl their reviews,
        then schedule the next search page."""
        page = response.meta['page']
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" + str(page))

        match = re.findall(r'g_page_config = ({.*?});', response.text)
        if not match:
            # Blocked or redirected to a login page: no embedded JSON.
            # The original indexed [0] unconditionally and crashed here.
            self.logger.warning('g_page_config not found on page %s', page)
            return
        data = json.loads(match[0])
        goods_items = data['mods']['itemlist']['data']['auctions']

        for goods_item in goods_items:
            goods = {
                'title': goods_item['raw_title'],
                'price': goods_item['view_price'],
                'location': goods_item['item_loc'],
                # Exact sales figures require a separate API call.
                'sales': goods_item['view_sales'],
            }
            print(goods)
            item_id = goods_item["nid"]
            user_id = goods_item["user_id"]
            print(item_id)
            print(user_id)
            # Throttle between products to avoid the anti-bot wall
            # (replaces the original back-to-back double sleep).
            time.sleep(random.randint(6, 7))
            self._fetch_reviews(item_id, user_id)

        # Schedule the next search page.
        yield self._search_request(page + 1)

    def _fetch_reviews(self, item_id, seller_id):
        """Fetch review pages for one product from rate.tmall.com.

        Returns the list of non-empty review texts. Stops once the page
        index passes the ``lastPage`` value reported by the API (or
        after 99 pages as a hard cap), matching the original loop's
        effective range of pages 1..lastPage.
        """
        url = "https://rate.tmall.com/list_detail_rate.htm"
        headers = {
            "cookie": "",
            "referer": "https://detail.tmall.com/item.htm",
            'User-Agent': random.choice(self.USER_AGENTS),
        }
        contents = []
        last_page = None
        for page_no in range(1, 100):
            if last_page is not None and page_no > last_page:
                break
            params = {
                "itemId": item_id,       # product id
                "sellerId": seller_id,
                "currentPage": str(page_no),
            }
            text = requests.get(url, params, headers=headers, timeout=30).text
            print(text)
            found = re.findall(r'({.*?}})', text)
            if not found:
                # Blocked or empty response — stop instead of crashing
                # on IndexError as the original did.
                break
            rate_detail = json.loads(found[0])["rateDetail"]
            last_page = rate_detail['paginator']['lastPage']
            for each_comment in rate_detail["rateList"]:
                if each_comment['rateContent']:  # skip empty reviews
                    contents.append(each_comment['rateContent'])
        return contents


# NOTE(review): removed CSDN blog-page boilerplate (like/favorite/red-packet
# UI text) that was accidentally pasted below the code — it was not Python
# and made the file unparseable.