Scraping Douban Movies with Scrapy

Having just learned the Scrapy framework, I wanted a small project to practice on. A recent Django project of mine happened to be missing data for its movie recommendation feature, so scraping Douban's movie data was the natural choice. The URL to crawl is https://movie.douban.com/top250

Command to create the project:
scrapy startproject doubanMovie

Enter the project directory and generate a spider:
scrapy genspider movie movie.douban.com
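
These two commands produce the standard Scrapy scaffolding, roughly:

doubanMovie/
    scrapy.cfg
    doubanMovie/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            movie.py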

The code for each .py file is as follows:

# settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'doubanMovie'

SPIDER_MODULES = ['doubanMovie.spiders']
NEWSPIDER_MODULE = 'doubanMovie.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Set the User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
# Obey robots.txt rules
# This is just a personal spider, so skip the robots rules for now
ROBOTSTXT_OBEY = False

# Override the default request headers:
# Enable the default request headers
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the item pipeline
ITEM_PIPELINES = {
   'doubanMovie.pipelines.DoubanmoviePipeline': 300,
}
# items.py

# -*- coding: utf-8 -*-

import scrapy

class DoubanmovieItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()  # movie title
    release_time = scrapy.Field()  # release date
    director = scrapy.Field()  # director
    length = scrapy.Field()  # runtime
    imdb_link = scrapy.Field()  # IMDb page link
    mark = scrapy.Field()  # rating
    cover_link = scrapy.Field()  # cover image URL
    summary = scrapy.Field()  # synopsis
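
Note that the spider below actually yields plain dicts rather than this Item class. If you want the typo protection an Item provides, it can be filled in inside the spider instead; a minimal sketch:

from doubanMovie.items import DoubanmovieItem

item = DoubanmovieItem()
item['name'] = name  # assigning a field not declared in the Item raises KeyError
item['mark'] = mark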

# movie.py, i.e. the spider file
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
import time

class MovieSpider(scrapy.Spider):
    name = 'movie'  # spider name
    # allowed_domains = ['movie.douban.com']
    page = 1  # pagination counter

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        url = 'https://movie.douban.com/top250'
        # To route through a proxy, pass it via meta (note the http:// scheme):
        # yield scrapy.Request(url=url, headers=self.headers, meta={'proxy': 'http://118.117.136.19:9000'})
        yield scrapy.Request(url=url, headers=self.headers)

    # Parse one page of the Top 250 list
    def parse(self, response):
        movie_ol = response.xpath('//ol[@class="grid_view"]/li/div/div[2]')

        for div in movie_ol:
            # Movie title and rating
            name = div.xpath('.//a/span[1]/text()').extract_first()
            mark = div.xpath('.//span[@class="rating_num"]/text()').extract_first()

            item = {
                'name': name,
                'mark': mark,
            }

            # URL of the movie's detail page
            detail_url = div.xpath('.//a/@href').extract_first()

            # Fetch the detail page for the remaining fields
            yield scrapy.Request(url=detail_url, callback=self.parse_info, meta={'item': item}, dont_filter=True)

        # Pagination: the Top 250 spans 10 pages of 25 movies each
        if self.page <= 10:
            data = {
                'start': (self.page - 1) * 25,
            }
            data = parse.urlencode(data)

            # Next page URL
            next_url = 'https://movie.douban.com/top250?' + data
            self.page += 1

            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
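        # Alternative to counting pages by hand: the list page exposes a
        # "next page" link that the spider could follow instead (the
        # span[@class="next"] selector is an assumption about Douban's markup):
        #
        #     next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        #     if next_href:
        #         yield response.follow(next_href, callback=self.parse)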

    # Parse the detail page behind each movie link
    def parse_info(self, response):
        item = response.meta['item']
        # Note: the positional indexes below (span[10], span[13], a[1]) depend on
        # the exact layout of the #info block and will break if Douban changes it
        director = response.xpath('//div[@class="article"]//div[@id="info"]/span[1]/span[2]/a/text()').extract_first()
        release_time = response.xpath('//div[@class="article"]//div[@id="info"]/span[10]/text()').extract_first()
        length = response.xpath('//div[@class="article"]//div[@id="info"]/span[13]/text()').extract_first()
        imdb_link = response.xpath('//div[@class="article"]//div[@id="info"]/a[1]/@href').extract_first()
        cover_link = response.xpath('//div[@class="article"]//div[@id="mainpic"]//img/@src').extract_first()
        summary = response.xpath('//div[@class="article"]//div[@id="link-report"]/span[1]/text()').extract_first()

        item['director'] = director
        item['release_time'] = release_time
        item['length'] = length
        item['imdb_link'] = imdb_link
        item['cover_link'] = cover_link
        item['summary'] = summary

        time.sleep(2)  # crude rate limiting; see the settings-based alternative below
        yield item
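
The time.sleep(2) call above blocks Scrapy's entire event loop rather than just spacing out this spider's requests. The idiomatic way to slow down is in settings.py; a minimal sketch (the exact values are a guess at what Douban tolerates):

# settings.py -- throttle requests instead of calling time.sleep()
DOWNLOAD_DELAY = 2               # about 2 seconds between requests to the same domain
RANDOMIZE_DOWNLOAD_DELAY = True  # vary the delay (0.5x-1.5x) to look less mechanical
# Or let Scrapy adapt the delay automatically:
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10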


# pipelines.py

# -*- coding: utf-8 -*-

import json

class DoubanmoviePipeline(object):
    # Open the output file when the spider starts
    def open_spider(self, spider):
        self.fp = open('movie.json', 'w', encoding='utf-8')

    # Close the file when the spider exits
    def close_spider(self, spider):
        self.fp.close()

    # The item argument is whatever movie.py's parse_info method yields;
    # this method is called once per yielded item
    def process_item(self, item, spider):
        # dict() makes this work whether the spider yields dicts or Item objects
        string = json.dumps(dict(item), ensure_ascii=False, indent=4)
        self.fp.write(string + '\n')

        return item
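
One caveat about the output: writing indented JSON objects back to back produces a file that is not itself valid JSON. If something downstream needs to parse the whole file, one object per line (JSON Lines) is easier to consume; a minimal variant of process_item:

    def process_item(self, item, spider):
        # one compact JSON object per line (JSON Lines / .jl format)
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item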

Command to run the crawl:
scrapy crawl movie
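
Incidentally, for a simple dump like this you can skip the custom pipeline entirely and let Scrapy's built-in feed exports write the file:

scrapy crawl movie -o movie.json    # one JSON array
scrapy crawl movie -o movie.jl      # one JSON object per line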

The first few test runs returned results fine, but after a few more Douban appeared to ban my IP and every request came back with a 403 error, probably because the requests were too fast. Adding headers didn't help; switching to a proxy IP finally did. If you hit a similar problem while running this, just switch to a different IP.
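
Switching IPs can be done per request via meta, as in the commented-out line in start_requests, or for every request with a small downloader middleware. A minimal sketch (the proxy address is only a placeholder for a working one):

# middlewares.py -- attach a proxy to every outgoing request
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://118.117.136.19:9000'  # placeholder proxy

# settings.py -- enable the middleware
DOWNLOADER_MIDDLEWARES = {
    'doubanMovie.middlewares.ProxyMiddleware': 543,
}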
