【学习笔记】scrapy示例

爬取视频和评论

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json


def strip(s):
    """Return ``s`` without surrounding whitespace, or ``''`` for falsy input.

    Falsy input (``None``, an empty string, ...) never reaches ``.strip()``,
    so callers can pass the result of a selector lookup that found nothing.
    """
    return s.strip() if s else ''


class DiscoverySpider(scrapy.Spider):
    """Crawl one xinpianchang.com listing page for posts, videos and comments.

    Request flow:

    1. ``parse`` reads the article ids from the listing page in
       ``start_urls`` and requests every article page.
    2. ``parse_post`` scrapes the article metadata, then requests the video
       API (completed by ``parse_video``) and the first page of comments
       (handled by ``parse_comment``, which follows its own pagination).

    Two kinds of plain ``dict`` items are yielded: posts (from
    ``parse_video``) and comments (from ``parse_comment``).
    """

    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1']

    # Raw string so that ``\w`` reaches the regex engine untouched instead of
    # being an invalid string escape; compiled once for all article pages.
    _VID_PATTERN = re.compile(r'var vid = "(\w+)";')

    def parse(self, response):
        """Yield one article-page request per entry of the listing page.

        The article id travels to ``parse_post`` in ``request.meta['pid']``.
        """
        url = 'https://www.xinpianchang.com/a%s?from=ArticleList'
        pid_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        for pid in pid_list:
            yield response.follow(url % pid, self.parse_post, meta={'pid': pid})

    def parse_post(self, response):
        """Scrape an article page, then request its video data and comments.

        The partially filled post dict is handed to ``parse_video`` through
        ``request.meta['post']``; the post item is only yielded there. When
        no video id can be found in the page, a warning is logged and the
        video request is skipped, but the comment request is still issued.
        """
        pid = response.meta['pid']
        cates = [c.strip() for c in response.xpath('//span[contains(@class,"cate")]//text()').extract()]
        post = {
            'pid': pid,
            'title': response.xpath('//div[@class="title-wrap"]/h3/text()').get(),
            # Drop the '-' separators and whitespace-only text nodes.
            'category': [cate for cate in cates if cate != '-' and cate != ''],
            'update_time': response.xpath('//span[contains(@class,"update-time")]/i//text()').get(),
            'play_counts': response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').get(),
            'like_counts': response.xpath('//span[contains(@class,"like-counts")]/@data-counts').get(),
            'description': strip(response.xpath('//p[contains(@class,"desc")]/text()').get()),
        }

        # The video id is embedded in an inline script of the article page.
        # Use the first occurrence rather than requiring exactly one, so a
        # page without (or with several) matches cannot abort the callback.
        vid_match = self._VID_PATTERN.search(response.text)
        if vid_match:
            video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web&appKey=61a2f329348b3bf77'
            yield Request(video_url % vid_match.group(1), callback=self.parse_video, meta={'post': post})
        else:
            self.logger.warning('No video id found on %s; post %s is not emitted', response.url, pid)

        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
        yield Request(comment_url % pid, callback=self.parse_comment, meta={'pid': pid})

    def parse_video(self, response):
        """Add the video and cover URLs from the video API and yield the post.

        The response body is parsed as JSON; a ``KeyError`` is raised if the
        expected ``data.resource.default.url`` / ``data.video.cover`` paths
        are missing.
        """
        post = response.meta['post']
        result = json.loads(response.text)
        post['video'] = result['data']['resource']['default']['url']
        post['cover'] = result['data']['video']['cover']
        yield post

    def parse_comment(self, response):
        """Yield one comment dict per entry and follow the next comment page.

        NOTE(review): the ``pid`` stored in the request meta by
        ``parse_post`` is not read here, so comment items carry no reference
        to the post they belong to.
        """
        data = json.loads(response.text)['data']
        for item in data['list']:
            yield {
                'uname': item['userInfo']['username'],
                'content': item['content'],
            }

        next_page = data['next_page_url']
        if next_page:
            # response.follow resolves relative paths against the current
            # URL and passes absolute URLs through, so no host prefix (and
            # no accidental '//' for root-relative paths) is needed.
            yield response.follow(next_page, self.parse_comment)


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值