import scrapy
import json
import re
import requests
import re
import time
import random
class PplSpider(scrapy.Spider):
    """Crawl Taobao search results for the query '游戏' (games).

    For every listing on a results page the spider yields a summary dict
    (title/price/location/sales), then fetches the listing's Tmall review
    pages (list_detail_rate.htm) and yields each non-empty review text,
    and finally follows the next search-results page.
    """

    name = 'ppl'
    start_urls = ['https://www.taobao.com/']

    # Taobao paginates with the `s` query parameter in steps of 44 items.
    PAGE_SIZE = 44

    # Pool of User-Agent strings to rotate through (extend as needed).
    # Fixes the original NameError: `User_Agents` was never defined.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.8131",
    ]

    # Paste your logged-in Taobao cookie string here ("k1=v1; k2=v2; ...").
    RAW_COOKIES = '填上你的cookie'

    def _cookie_dict(self):
        """Parse the raw 'k=v; k=v' cookie string into a dict.

        Splits on the FIRST '=' only — cookie values routinely contain
        '=' themselves — and strips the space that follows each ';'.
        Raises IndexError if RAW_COOKIES is still the untouched
        placeholder (no '=' present): fill it in before running.
        """
        return {
            part.split("=", 1)[0].strip(): part.split("=", 1)[1]
            for part in self.RAW_COOKIES.split(";")
        }

    def _search_request(self, offset):
        """Build the scrapy.Request for the results page at `s=offset`."""
        url = 'https://s.taobao.com/search?q=%E6%B8%B8%E6%88%8F&s=' + str(offset)
        headers = {
            'User-Agent': random.choice(self.USER_AGENTS),
            # Original had a stray leading space in the referer value.
            'referer': 'https://www.taobao.com/',
        }
        return scrapy.Request(
            url,
            callback=self.parse,
            headers=headers,
            cookies=self._cookie_dict(),
            meta={'page': {'page': offset}},
        )

    def start_requests(self):
        """Kick off crawling at the first results page (offset 0)."""
        yield self._search_request(0)

    def parse(self, response):
        """Parse one search-results page and schedule the next one.

        Yields: a dict per listing, a dict per non-empty review text,
        and a Request for the next results page.
        """
        offset = response.meta['page']['page']  # current `s` offset (int)
        self.logger.info("parsing results page at offset %s", offset)

        # The listings are embedded in the HTML as a JS assignment:
        #   g_page_config = {...};
        match = re.findall(r'g_page_config = ({.*?});', response.text)
        if not match:
            # Blocked / captcha / layout change — stop instead of IndexError.
            self.logger.warning("g_page_config not found at offset %s", offset)
            return
        goods_items = json.loads(match[0])['mods']['itemlist']['data']['auctions']

        rate_url = "https://rate.tmall.com/list_detail_rate.htm"
        review_headers = {
            "cookie": "",
            "referer": "https://detail.tmall.com/item.htm",
            'User-Agent': random.choice(self.USER_AGENTS),
        }

        for goods_item in goods_items:
            # Listing summary — the original built this dict and printed
            # it but never yielded it, so the data was discarded.
            yield {
                'title': goods_item['raw_title'],
                'price': goods_item['view_price'],
                'location': goods_item['item_loc'],
                'sales': goods_item['view_sales'],
            }

            item_id = goods_item["nid"]
            seller_id = goods_item["user_id"]

            # lastPage is unknown until the first review response and MUST
            # be reset per listing — the original initialized it once
            # outside this loop, so one item's lastPage leaked into the
            # next item's pagination.
            last_page = None
            for page_no in range(1, 100):
                # Same stop rule as the original (`i-1 == last_page`):
                # fetch pages 1..lastPage inclusive.
                if last_page is not None and page_no > last_page:
                    break
                # Throttle so the rate endpoint does not ban us (the
                # original's extra fixed 10 s sleep was redundant).
                time.sleep(random.randint(6, 7))
                params = {
                    "itemId": item_id,
                    "sellerId": seller_id,
                    "currentPage": str(page_no),
                }
                body = requests.get(
                    rate_url, params=params, headers=review_headers, timeout=30
                ).text
                # The endpoint returns JSONP-ish text; pull the JSON object.
                blob = re.findall(r'({.*?}})', body)
                if not blob:
                    break  # blocked or empty response — give up on this item
                detail = json.loads(blob[0])["rateDetail"]
                last_page = detail['paginator']['lastPage']
                for each_comment in detail["rateList"]:
                    if each_comment['rateContent']:
                        yield {'comment_content': each_comment['rateContent']}

        # Next results page. The original did `page += 1` on the meta DICT
        # (TypeError), and even the intended (offset+1)*44 is wrong after
        # the first page — the `s` offset simply advances by 44.
        yield self._search_request(offset + self.PAGE_SIZE)