Python crawler: paginated scraping — Python crawler 9 (Scrapy example: pagination, crawling user pages, saving to MySQL and Redis)
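This installment is a full Scrapy spider for xinpianchang.com: it walks the "sort by likes" channel listing page by page, follows each article it finds, then follows every creator linked from the article and yields a `ComposerItem` per user page. The commented-out blocks are the post/video extraction from the earlier steps, left in place for reference.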

```python
# -*- coding: utf-8 -*-
import json
import random
import re

import scrapy
from scrapy import Request

from xpc.items import ComposerItem

# Character pool for forging PHPSESSID values
# (renamed from `str`, which shadowed the builtin)
SESSION_CHARS = 'qazwsxedcrfvtgbyhnujmikolp1234567890'

cookies = dict(Authorization='F8FB7C7E1E8354A671E83548091E835A4181E83528E62CE43391')


def gen_sessionid():
    # A PHPSESSID is just 26 random characters from the pool above
    return ''.join(random.choices(SESSION_CHARS, k=26))


class DiscoverySpider(scrapy.Spider):
    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
    page_count = 0

    def parse(self, response):
        # Rotate the session id every 100 list pages to avoid getting blocked
        self.page_count += 1
        if self.page_count >= 100:
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        # Every article card carries its id in a data-articleid attribute
        pid_list = response.xpath('//@data-articleid').extract()
        url = 'https://www.xinpianchang.com/a%s?from=ArticleList'
        for pid in pid_list:
            yield response.follow(url % pid, self.parse_post)

        # Follow the pagination links with the (possibly rotated) cookies
        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page in pages:
            yield response.follow(page, self.parse, cookies=cookies)

    def parse_post(self, response):
        # Post/video extraction from the earlier steps, kept for reference:
        # post = {}
        # # get() is equivalent to extract_first()
        # post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # categories = response.xpath('//span[contains(@class,"cate")]//text()').extract()
        # post['category'] = ''.join([category.strip() for category in categories])
        # post['created_at'] = response.xpath('//span[contains(@class,"update-time")]/i//text()').get()
        # post['play_counts'] = response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').get()
        # vid, = re.findall(r'vid: \"(\w+)\",', response.text)
        # video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
        # request = Request(video_url % vid, callback=self.parse_video)
        # request.meta['post'] = post
        # yield request

        # Follow every creator (user) linked from the article page
        creator_list = response.xpath('//@data-userid').extract()
        creator_url = 'https://www.xinpianchang.com/u%s'
        for creator in creator_list:
            request = response.follow(creator_url % creator, self.parse_composer)
            request.meta['cid'] = creator
            # Keep the session cookies off the user-page requests
            request.meta['dont_merge_cookies'] = True
            yield request

    # def parse_video(self, response):
    #     post = response.meta['post']
    #     result = json.loads(response.text)
    #     if 'resource' in result['data']:
    #         post['video'] = result['data']['resource']['default']['url']
    #     else:
    #         # Videos hosted on third-party platforms expose an iframe/swf URL
    #         d = result['data']['third']['data']
    #         post['video'] = d.get('iframe_url', d.get('swf', ''))
    #     yield post

    def parse_composer(self, response):
        # The banner image URL is embedded in an inline style attribute
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        composer = ComposerItem()
        composer['cid'] = int(response.meta['cid'])
        banner, = re.findall(r'background-image:url\((.+?)\)', banner)
        composer['banner'] = banner.replace('\t', '').replace(' ', '')
        name = response.xpath('//p[contains(@class,"creator-name")]/text()').get()
        composer['name'] = name.replace('\t', '').replace(' ', '')
        yield composer
```
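The spider imports `ComposerItem` from `xpc/items.py`, which this post doesn't show. Inferred from the three fields the spider populates, a minimal sketch would look like this (field names are taken from the spider; everything else is an assumption):

```python
# xpc/items.py -- minimal sketch, fields inferred from the spider above
import scrapy


class ComposerItem(scrapy.Item):
    cid = scrapy.Field()     # creator id, parsed from the data-userid attribute
    banner = scrapy.Field()  # banner image URL pulled from the inline style
    name = scrapy.Field()    # creator display name
```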
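The title also promises saving to MySQL, but the pipeline isn't included in the code above. A minimal sketch using `pymysql` might look like the following; the `composer` table, its columns, and the connection settings are all assumptions, with `cid` as the primary key so `REPLACE INTO` can deduplicate:

```python
# xpc/pipelines.py -- hypothetical MySQL pipeline; table/column names and
# connection settings are placeholders, not from the original post.
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='secret', database='xpc',
                                    charset='utf8mb4')
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        # REPLACE INTO overwrites an existing row with the same primary key,
        # so re-crawled creators don't produce duplicate rows
        self.cur.execute(
            'REPLACE INTO composer (cid, banner, name) VALUES (%s, %s, %s)',
            (item['cid'], item['banner'], item['name']))
        self.conn.commit()
        return item
```

A pipeline like this would be enabled in `settings.py` via `ITEM_PIPELINES = {'xpc.pipelines.MysqlPipeline': 300}`.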
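For the Redis part of the title, one common use in this kind of crawl is deduplicating creator ids so each user page is fetched only once. A sketch with `redis-py` follows; the key name and connection settings are assumptions:

```python
# Hypothetical Redis-based dedup helper; 'xpc:seen_cids' is a made-up key.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)


def is_new_creator(cid):
    # SADD returns 1 only the first time a member is added to the set,
    # so this doubles as an atomic "check and mark as seen"
    return r.sadd('xpc:seen_cids', cid) == 1
```

Inside `parse_post`, each creator request could then be yielded only `if is_new_creator(creator):`; since the set lives in Redis, the dedup state also persists across spider restarts.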
