Scrapy使用笔记-豆瓣电影爬取

如果爬取少量网页的话,直接request+xpath和用scrapy倒也没啥区别,如果爬取大量页面,scrapy这个异步框架就快很多很多了

安装

pip install Scrapy

创建

# 创建项目
scrapy startproject movie
# 创建spider,可以创建多个,从多个不同数据源抓取数据
cd movie
scrapy genspider douban douban.com

目录

movie/
    scrapy.cfg   # 部署配置文件,http://scrapyd.readthedocs.org/
    movie/         # 项目的python模块,import目录下文件时从这里开始,比如from movie.items import MovieItem
        __init__.py
        items.py          # model定义文件
        middlewares.py    # 中间件
        pipelines.py      # 处理spider生成的model
        settings.py       # 项目配置
        spiders/          # 爬虫目录
            __init__.py
            douban.py # 刚创建的豆瓣spider,还可以再来个猫眼spider等等

items

import scrapy

class MovieItem(scrapy.Item):
    """One film scraped from a Douban detail page (yielded by the douban spider)."""
    id = scrapy.Field()              # Douban subject id, taken from the detail-page URL
    name = scrapy.Field()            # film title (first space-separated token of the page title)
    rating = scrapy.Field()          # Douban rating text, may be None for unreleased films
    kind = scrapy.Field()            # genres joined with '/'
    duration = scrapy.Field()        # runtime in minutes (digits only)
    showdate = scrapy.Field()        # release date, 'YYYY-MM-DD'
    language = scrapy.Field()        # from the '语言:' row of the info box, '' if absent
    country = scrapy.Field()         # from the '制片国家/地区:' row, '' if absent
    alternate_name = scrapy.Field()  # from the '又名:' row, '' if absent
    synopsis = scrapy.Field()        # stripped v:summary text
    poster = scrapy.Field()          # poster image URL
    artist = scrapy.Field()          # not populated by the spider shown here — TODO confirm usage
    trailer = scrapy.Field()         # not populated by the spider shown here — TODO confirm usage
    pic = scrapy.Field()             # not populated by the spider shown here — TODO confirm usage


class ArtistItem(scrapy.Item):
    """One cast/crew member, scraped from a film's celebrities page."""
    film_id = scrapy.Field()  # Douban subject id of the film this person belongs to
    name = scrapy.Field()     # person's name (anchor @title)
    avatar = scrapy.Field()   # headshot URL extracted from the inline background-image style
    role = scrapy.Field()     # '饰 <character>' for actors, '' otherwise

class TrailerItem(scrapy.Item):
    """One trailer, scraped from a film's trailer playback page."""
    film_id = scrapy.Field()       # Douban subject id of the film
    video = scrapy.Field()         # direct video URL (<video><source src>)
    pic = scrapy.Field()           # trailer thumbnail URL
    video_time = scrapy.Field()    # duration text shown on the trailer list page
    name = scrapy.Field()          # trailer title
    release_date = scrapy.Field()  # publish date shown on the trailer list page

class PicItem(scrapy.Item):
    """One still/photo from a film's photos page."""
    film_id = scrapy.Field()  # Douban subject id of the film
    pic = scrapy.Field()      # photo thumbnail URL

spiders

这里用即将上映影片为例,主要操作就是start_urls定义入口页面,爬取完成后进入parse,解析完后yield 下一个爬取页面的Request并设置回调函数,这个Request会进入到Scheduler等待请求,请求完后进入回调函数,类似parse,然后解析,如果一条数据爬取完成可以生成item的model了,就yield相应的item进入 Item Pipelines,可以保存了。。。具体看一下官方介绍吧,https://doc.scrapy.org/en/latest/topics/architecture.html
在这里插入图片描述

import scrapy
import re
from movie.items import MovieItem
from movie.items import ArtistItem
from movie.items import TrailerItem
from movie.items import PicItem

class DoubanSpider(scrapy.Spider):
    """Scrape upcoming films from movie.douban.com.

    Flow: the "coming soon" list page -> one detail page per film
    (yields a MovieItem) -> follow-up requests for cast & crew,
    trailers, and stills (yield ArtistItem / TrailerItem / PicItem).
    """

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/coming']

    def parse(self, response):
        """Parse the "coming soon" list page and schedule one detail request per film.

        NOTE(review): these XPaths contain /tbody, which browsers insert but
        the raw HTML may not contain — confirm against the live page source.
        """
        # The original also extracted titles and release dates here but never
        # used them (dead code removed); iterate the hrefs directly.
        url_list = response.xpath('//*[@id="content"]/div/div[1]/table/tbody/tr/td[2]/a/@href').getall()
        for url in url_list:
            yield scrapy.Request(url, callback=self.spider_douban_movie)

    @staticmethod
    def _info_field(info, pattern):
        """Return the first stripped regex capture from the #info HTML, or ''."""
        found = re.findall(pattern, info)
        return found[0].strip() if found else ''

    def spider_douban_movie(self, response):
        """Parse a film detail page: yield a MovieItem plus follow-up requests.

        Fixes vs. the original: the None-check on the title now happens
        BEFORE .split() (was an AttributeError), the needless
        encode/decode round-trip is gone, and all regex extractions are
        guarded so a missing field yields None/'' instead of IndexError.
        """
        name = response.xpath('//*[@id="content"]/h1/span[1]/text()').get()
        if not name:
            return
        # Keep only the first space-separated token (the Chinese title).
        name = name.split(' ')[0]

        rating = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').get()

        kind = '/'.join(response.xpath('//*[@id="info"]/span[@property="v:genre"]/text()').getall())

        duration = response.xpath('//*[@id="info"]/span[@property="v:runtime"]/text()').get()
        m = re.search(r'(\d+)分钟', duration) if duration else None
        duration = m.group(1) if m else None

        release_date = response.xpath('//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()').get()
        m = re.search(r'\d{4}-\d{2}-\d{2}', release_date) if release_date else None
        showdate = m.group(0) if m else None

        # The #info block is scraped as raw HTML; guard against a missing node.
        info = response.xpath('//*[@id="info"]').get() or ''
        language = self._info_field(info, '<span class="pl">语言:</span>(.*)<br>')
        country = self._info_field(info, '<span class="pl">制片国家/地区:</span> (.*)<br>')
        alternate_name = self._info_field(info, '<span class="pl">又名:</span> (.*)<br>')

        synopsis = response.xpath('//*[@id="link-report"]//span[@property="v:summary"]/text()').get()
        synopsis = synopsis.strip() if synopsis else synopsis

        poster = response.xpath('//*[@id="mainpic"]/a/img/@src').get()

        # Subject id is the last path segment of e.g. .../subject/1234567/
        film_id = response.url.split('/')[-2]

        yield MovieItem(id=film_id,
                        name=name,
                        rating=rating,
                        kind=kind,
                        duration=duration,
                        showdate=showdate,
                        language=language,
                        country=country,
                        alternate_name=alternate_name,
                        synopsis=synopsis,
                        poster=poster)

        # Cast & crew: resolve the href against the response URL (the original
        # concatenated strings, producing a '//' for relative hrefs).
        artist_url = response.xpath('//*[@id="celebrities"]/h2/span/a/@href').get()
        if artist_url:
            yield scrapy.Request(response.urljoin(artist_url),
                                 callback=self.spider_douban_movie_artist,
                                 meta={'id': film_id})

        # Trailers: the header span may be absent, guard before regexing.
        tmp = response.xpath('//*[@id="related-pic"]/h2/span').get()
        if tmp:
            m = re.search(r'<a href="(.*trailer#trailer)">.*</a>', tmp)
            if m:
                yield scrapy.Request(m.group(1),
                                     callback=self.spider_douban_movie_trailer,
                                     meta={'id': film_id})

        # Stills (剧照)
        m = re.search(r'.*/(\d*)/$', response.url)
        subject_id = m.group(1) if m else None
        if subject_id:
            yield scrapy.Request('https://movie.douban.com/subject/%s/photos?type=S' % subject_id,
                                 callback=self.spider_douban_movie_pic,
                                 meta={'id': film_id})

    def spider_douban_movie_artist(self, response):
        """Yield one ArtistItem per person on the cast & crew page.

        All attribute reads are guarded: missing @style/@title no longer
        crash re.findall with None.
        """
        film_id = response.meta.get('id')
        for identity in response.xpath('//*[@id="celebrities"]/div'):
            for at in identity.xpath('./ul/li'):
                name = at.xpath('./a/@title').get()

                style = at.xpath('./a/div/@style').get() or ''
                found = re.findall(r'background-image: url\((.*)\)', style)
                avatar = found[0] if found else None

                title = at.xpath('./div/span[2]/@title').get() or ''
                parts = re.findall(r'\((.*)\)', title)
                words = parts[0].split(' ') if parts else []
                role = '饰 %s' % words[1] if len(words) > 1 else ''

                yield ArtistItem(film_id=film_id,
                                 name=name,
                                 avatar=avatar,
                                 role=role)

    def spider_douban_movie_trailer(self, response):
        """From the trailer list page, request each trailer's playback page,
        carrying the list-page metadata along in request.meta."""
        for identity in response.xpath('//div[@class="article"]/div[1]/ul/li'):
            next_url = identity.xpath('./p[1]/a/@href').get()
            if not next_url:
                continue
            name = identity.xpath('./p[1]/a/text()').get()
            meta = {
                'id': response.meta.get('id'),
                'pic': identity.xpath('./a/img/@src').get(),
                'video_time': identity.xpath('./a/strong/em/text()').get(),
                'name': name.strip() if name else name,
                'release_date': identity.xpath('./p[@class="trail-meta"]/span/text()').get(),
            }
            yield scrapy.Request(next_url,
                                 callback=self.spider_douban_movie_trailer_show,
                                 meta=meta)

    def spider_douban_movie_trailer_show(self, response):
        """On the playback page, grab the raw video URL and yield the TrailerItem."""
        video = response.xpath('//video/source/@src').get()
        yield TrailerItem(film_id=response.meta.get('id'),
                          video=video,
                          pic=response.meta.get('pic'),
                          video_time=response.meta.get('video_time'),
                          name=response.meta.get('name'),
                          release_date=response.meta.get('release_date'))

    def spider_douban_movie_pic(self, response):
        """Yield one PicItem per still on the photos page."""
        film_id = response.meta.get('id')
        for pic in response.xpath('//*[@id="content"]/div/div[1]/ul/li/div[1]/a/img/@src').getall():
            yield PicItem(film_id=film_id, pic=pic)

运行

scrapy crawl douban

pipeline

process_item中处理每次返回的item,比如写数据库或放到缓存,close_spider结束爬取后处理数据,比如排个序啥的

class MoviePipeline(object):
    """Item pipeline stub: every item is passed through unchanged.

    Hook bodies are placeholders — open_spider/close_spider do nothing yet.
    """

    def open_spider(self, spider):
        """Called once when the spider starts; no setup needed yet."""
        pass

    def close_spider(self, spider):
        """Called once when the spider finishes; no teardown needed yet."""
        pass

    def process_item(self, item, spider):
        """Receive each yielded item and hand it on to the next pipeline."""
        return item

源码

https://github.com/xiaoyeqiannian/movie_spider

文档

https://doc.scrapy.org/en/latest/index.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值