# -*- coding: utf-8 -*-
import json
import random
import re
import scrapy
from scrapy import Request
from xpc.items import ComposerItem
# Alphabet used to build random session ids (lowercase letters and digits).
# Deliberately NOT named `str`: that would shadow the builtin for the whole
# module and for anything that star-imports it.
SESSION_ID_CHARS = 'qazwsxedcrfvtgbyhnujmikolp1234567890'

# Cookies sent with listing-page requests. The spider mutates this dict in
# place to rotate PHPSESSID.
# NOTE(review): the Authorization value is a hard-coded credential; consider
# loading it from the project settings or the environment instead.
cookies = dict(Authorization='F8FB7C7E1E8354A671E83548091E835A4181E83528E62CE43391')


def gen_sessionid():
    """Return a random 26-character session id drawn from SESSION_ID_CHARS."""
    return ''.join(random.choices(SESSION_ID_CHARS, k=26))
class DiscoverySpider(scrapy.Spider):
    """Crawl the xinpianchang.com channel listing and scrape creator profiles.

    Listing pages lead to article pages, article pages lead to creator pages,
    and each creator page yields one ``ComposerItem``.
    """

    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
    # Number of listing pages parsed since PHPSESSID was last rotated.
    page_count = 0
    # Captures the URL inside an inline ``background-image:url(...)`` style.
    _banner_pattern = re.compile(r'background-image:url\((.+?)\)')

    def parse(self, response):
        """Parse a listing page.

        Yields one request per article found on the page (handled by
        ``parse_post``) and one request per pagination link (handled by
        ``parse`` again, sent with the shared module-level cookies).
        """
        self.page_count += 1
        if self.page_count >= 100:
            # Swap in a fresh random PHPSESSID every 100 listing pages.
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        pid_list = response.xpath('//@data-articleid').extract()
        url = "https://www.xinpianchang.com/a%s?from=ArticleList"
        for pid in pid_list:
            yield response.follow(url % pid, self.parse_post)

        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page in pages:
            yield response.follow(page, self.parse, cookies=cookies)

    def parse_post(self, response):
        """Parse an article page and follow every creator referenced on it.

        Each yielded request carries the creator id in ``meta['cid']`` and
        sets ``dont_merge_cookies`` so creator pages are fetched without the
        session cookies.
        """
        # Disabled post/video scraping, kept for reference:
        # post = {}
        # # get() is the same as extract_first() here
        # post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # categorys = response.xpath('//span[contains(@class,"cate")]//text()').extract()
        # post['category'] = ''.join([category.strip() for category in categorys])
        # post['created_at'] = response.xpath('//span[contains(@class,"update-time")]/i//text()').get()
        # post['play_counts'] = response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').get()
        # vid, = re.findall('vid: \"(\w+)\",', response.text)
        # video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
        # request = Request(video_url % vid, callback=self.parse_video)
        # request.meta['post'] = post
        # yield request
        creator_list = response.xpath('//@data-userid').extract()
        creator_url = 'https://www.xinpianchang.com/u%s'
        for creator in creator_list:
            request = response.follow(creator_url % creator, self.parse_composer)
            request.meta['cid'] = creator
            request.meta['dont_merge_cookies'] = True
            yield request

    # Disabled video-API callback, kept for reference:
    # def parse_video(self, response):
    #     # post = response.meta['post']
    #     result = json.loads(response.text)
    #     if 'resource' in result['data']:
    #         post['video'] = result['data']['resource']['default']['url']
    #     else:
    #         d = result['data']['third']['data']
    #         post['video'] = d.get('iframe_url', d.get('swf', ''))
    #     yield post

    def parse_composer(self, response):
        """Parse a creator page into a ``ComposerItem``.

        Reads the creator id from ``response.meta['cid']``. A page without a
        creator name is logged and skipped. A page without a recognisable
        banner style still yields an item, with an empty ``banner``.
        """
        cid = int(response.meta['cid'])

        name = response.xpath('//p[contains(@class,"creator-name")]/text()').get()
        if name is None:
            # Previously this raised AttributeError on None.replace(); the
            # outcome (no item) is the same, minus the traceback.
            self.logger.warning(
                'No creator name found on %s; skipping cid=%s', response.url, cid)
            return

        # The style attribute may be absent, or may not contain exactly one
        # background-image declaration; take the first match, if any.
        style = response.xpath('//div[@class="banner-wrap"]/@style').get()
        match = self._banner_pattern.search(style) if style else None
        banner = match.group(1) if match else ''

        composer = ComposerItem()
        composer['cid'] = cid
        composer['banner'] = banner.replace('\t', '').replace(' ', '')
        # NOTE(review): this strips every space, including ones inside the
        # name itself; kept as-is to preserve existing output.
        composer['name'] = name.replace('\t', '').replace(' ', '')
        yield composer