【学习笔记】scrapy示例

最新推荐文章于 2024-05-30 19:15:28 发布

Hung武

最新推荐文章于 2024-05-30 19:15:28 发布

阅读量139

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/qq_31478771/article/details/107420958

版权

爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

爬取视频和评论

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json


def strip(s):
    """Return ``s`` without surrounding whitespace, or ``''`` if ``s`` is falsy.

    Selector lookups return ``None`` when nothing matches; this helper lets
    callers normalise such values to an empty string instead of failing on
    ``None.strip()``.
    """
    return s.strip() if s else ''


class DiscoverySpider(scrapy.Spider):
    """Spider that collects posts, their video resources and their comments.

    Callback chain as implemented below:

    * ``parse``         -- listing page; schedules one article request per post id.
    * ``parse_post``    -- article page; builds the post dict, then schedules the
      video API request and the first comment API page.
    * ``parse_video``   -- video API JSON; completes and yields the post dict.
    * ``parse_comment`` -- comment API JSON; yields one dict per comment and
      follows the next page while one is advertised.
    """

    name = 'discovery'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1']

    def parse(self, response):
        """Yield one article request for every post id found on the listing page."""
        pid_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        url = 'https://www.xinpianchang.com/a%s?from=ArticleList'
        for pid in pid_list:
            # The post id is carried in meta so parse_post does not have to
            # recover it from the URL.
            yield response.follow(url % pid, self.parse_post, meta={'pid': pid})

    def parse_post(self, response):
        """Extract post fields from an article page and schedule follow-up requests.

        Yields a request for the video API (carrying the partially filled post
        dict in ``meta['post']``) and a request for the first page of comments.
        The post itself is only emitted later, from ``parse_video``.
        """
        pid = response.meta['pid']
        cates = [c.strip() for c in response.xpath('//span[contains(@class,"cate")]//text()').extract()]
        post = {
            'pid': pid,
            'title': response.xpath('//div[@class="title-wrap"]/h3/text()').get(),
            # Drop the '-' separators and the whitespace-only text nodes.
            'category': [cate for cate in cates if cate not in ('-', '')],
            'update_time': response.xpath('//span[contains(@class,"update-time")]/i//text()').get(),
            'play_counts': response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').get(),
            'like_counts': response.xpath('//span[contains(@class,"like-counts")]/@data-counts').get(),
            'description': strip(response.xpath('//p[contains(@class,"desc")]/text()').get()),
        }

        # Raw string: "\w" in a normal string literal is an invalid escape
        # sequence. re.search (instead of unpacking re.findall) avoids a
        # ValueError when the page has no match or more than one.
        vid_match = re.search(r'var vid = "(\w+)";', response.text)
        if vid_match:
            video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web&appKey=61a2f329348b3bf77'
            yield Request(video_url % vid_match.group(1), callback=self.parse_video, meta={'post': post})
        else:
            # Without a video id the video API cannot be queried; keep going so
            # the comments of this post are still crawled.
            self.logger.warning('No video id found on %s; skipping video request', response.url)

        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
        # NOTE: meta['pid'] is attached here but parse_comment does not read it.
        yield Request(comment_url % pid, callback=self.parse_comment, meta={'pid': pid})

    def parse_video(self, response):
        """Add the video URL and cover from the video API JSON, then yield the post."""
        post = response.meta['post']
        result = json.loads(response.text)
        post['video'] = result['data']['resource']['default']['url']
        post['cover'] = result['data']['video']['cover']
        yield post

    def parse_comment(self, response):
        """Yield one dict per comment in the JSON payload and follow pagination."""
        result = json.loads(response.text)
        for item in result['data']['list']:
            yield {
                'uname': item['userInfo']['username'],
                'content': item['content'],
            }

        next_page = result['data']['next_page_url']
        if next_page:
            # NOTE(review): assumes next_page_url is a path without a leading
            # slash relative to the API host -- confirm against a live response.
            next_page_url = 'https://app.xinpianchang.com/%s' % next_page
            yield response.follow(next_page_url, self.parse_comment)

Hung武

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【学习笔记】scrapy示例

爬取视频和评论# -*- coding: utf-8 -*-import scrapyimport refrom scrapy import Requestimport jsondef strip(s): if s: return s.strip() return ''class DiscoverySpider(scrapy.Spider): name = 'discovery' allowed_domains = ['xinpianch
复制链接

扫一扫

专栏目录