Scrapy project: Douban Movie Top 250 ratings and short reviews

This post shows how to use the Scrapy framework to crawl the Douban Movie Top 250 list and its short reviews, walking through the spider script, data extraction, and storage.


Scrapy is a powerful asynchronous crawling framework; learning to use it makes us far more efficient.

Create a new project:
scrapy startproject doubanmovie
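
Running this generates the usual Scrapy project skeleton; the spider script itself is added by hand under the spiders directory later:

doubanmovie/
    scrapy.cfg            # deployment configuration
    doubanmovie/
        __init__.py
        items.py          # item definitions
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py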

Spider script under the spiders directory:
In general, each parse function handles one page, and the data on each page is extracted with XPath rules.
This post uses the Selector(response).xpath() method; the .css() method can be used for selection as well.
parse extracts the Top 250 movie information, and parseL extracts the corresponding short reviews for a movie.
Note that I put the custom request headers in the spider file rather than configuring them in settings.py, which keeps them in plain view.

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from doubanmovie.items import DoubanmovieItem
from urllib.parse import urljoin


class Douban(scrapy.spiders.Spider):
    name = "douban"
    allowed_domains = ["douban.com"]
    # redis_key = 'douban:start_urls'
    start_urls = ['https://movie.douban.com/top250',]
                 #  'https://movie.douban.com/subject/1292052/comments?sort=new_score&status=P']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    selfItem = {}
    def start_requests(self):
        return [Request(url=self.start_urls[0], callback=self.parse, headers=self.headers)]

    def parse(self, response):
        item = DoubanmovieItem()
        selector = Selector(response)
        Movies = selector.xpath('//div[@class="info"]')
        for eachMovie in Movies:
            titles = eachMovie.xpath('div[@class="hd"]/a/span/text()').extract()  # the title is split over several span tags
            fullTitle = "".join(titles)  # join the pieces into a single string
            # movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
            star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
            comment = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[4]/text()').extract()[0]
            quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            movieurl = eachMovie.xpath('div[@class="hd"]/a/@href').extract()[0]
            classifications = eachMovie.xpath('div[@class="bd"]/p[@class]/text()').extract()[1]
            classification = classifications.replace(" ", "")
            # quote may be empty, so check it first
            if quote:
                quote = quote[0]
            else:
                quote = ''
            item['title'] = fullTitle
            # item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            item['comment'] = comment
            item['movieurl'] = movieurl
            item['quote'] = quote
            item['classification'] = classification
            if movieurl:
                commentaryURL = movieurl + 'comments?sort=new_score&status=P'

                # yield Request(commentaryURL, callback=self.parseL, headers=self.headers)

            yield item
        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        # page 10 is the last page and has no next-page link
        if nextLink:
            nextLink = nextLink[0]
            yield Request(urljoin(response.url, nextLink), callback=self.parse,headers=self.headers)
    def parseL(self, response):
        item = DoubanmovieItem()
        selectors = Selector(response)
        selfItem = self.selfItem
        commentarys = selectors.xpath('//div[@id="comments"]')
        for commentary in commentarys :
            movies = selectors.xpath('//div[@id="content"]/h1/text()').extract_first()
            selfItem["movies"] = movies
            commentary1 = commentary.xpath('div[1]/div[2]/p/span/text()').extract_first()
            selfItem["commentary1"] = commentary1
            commentary2 = commentary.xpath('div[2]/div[2]/p/span/text()').extract_first()
            selfItem["commentary2"] = commentary2
            commentary3 = commentary.xpath('div[3]/div[2]/p/span/text()').extract_first()
            selfItem["commentary3"] = commentary3
            commentary4 = commentary.xpath('div[4]/div[2]/p/span/text()').extract_first()
            selfItem["commentary4"] = commentary4
            commentary5 = commentary.xpath('div[5]/div[2]/p/span/text()').extract_first()
            selfItem["commentary5"] = commentary5
            commentary6 = commentary.xpath('div[6]/div[2]/p/span/text()').extract_first()
            selfItem["commentary6"] = commentary6
            commentary7 = commentary.xpath('div[7]/div[2]/p/span/text()').extract_first()
            selfItem["commentary7"] = commentary7
            commentary8 = commentary.xpath('div[8]/div[2]/p/span/text()').extract_first()
            selfItem["commentary8"] = commentary8
            commentary9 = commentary.xpath('div[9]/div[2]/p/span/text()').extract_first()
            selfItem["commentary9"] = commentary9
            commentary10 = commentary.xpath('div[10]/div[2]/p/span/text()').extract_first()
            selfItem["commentary10"] = commentary10
            item.update(self.selfItem)
            yield item
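
The request for the short-review page is left commented out in parse above. A minimal sketch of how it could be enabled, assuming the corrected parseL(self, response) signature and passing the movie title along via meta (the 'movie_title' key is my own choice):

            if movieurl:
                commentaryURL = movieurl + 'comments?sort=new_score&status=P'
                # hand the movie title to parseL so it does not have to re-read it from the page
                yield Request(commentaryURL, callback=self.parseL,
                              headers=self.headers, meta={'movie_title': fullTitle})

Inside parseL the title is then available as response.meta.get('movie_title').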

items.py
items.py defines the Item container that carries the data the spider extracts on to the item pipeline; each field is declared with scrapy.Field().

import scrapy
class DoubanmovieItem(scrapy.Item):
    title = scrapy.Field()
    movieInfo = scrapy.Field()
    star = scrapy.Field()
    classification = scrapy.Field()
    comment = scrapy.Field()       # number of ratings
    quote = scrapy.Field()
    movieurl = scrapy.Field()      # movie page URL
    commentary1 = scrapy.Field()   # short review
    commentary2 = scrapy.Field()   # short review
    commentary3 = scrapy.Field()   # short review
    commentary4 = scrapy.Field()   # short review
    commentary5 = scrapy.Field()   # short review
    commentary6 = scrapy.Field()   # short review
    commentary7 = scrapy.Field()   # short review
    commentary8 = scrapy.Field()   # short review
    commentary9 = scrapy.Field()   # short review
    commentary10 = scrapy.Field()  # short review
    movies = scrapy.Field()

main.py
We can create a main.py file in the project folder so that Scrapy can be started without typing the command each time.

from scrapy.cmdline import execute

execute("scrapy crawl douban".split())
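
If you prefer the command line, Scrapy's built-in feed export can also write the items straight to a file, for example:

scrapy crawl douban -o movies.json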

pipelines.py
The item pipeline is where the scraped items end up, and by defining a pipeline class we decide how the data gets saved. Here I write the data to an Excel file (a SQLite sketch follows the code below).
If the data volume is large, I recommend saving to a database instead, to avoid a single oversized file that is awkward to work with during analysis.

import openpyxl


class DoubanmoviePipeline(object):
    def open_spider(self, spider):
        self.wb = openpyxl.Workbook()
        self.sheet = self.wb.active
        self.sheet.append(['title', 'star', 'comment', 'quote', 'classification', 'movieurl',
                           "commentary1","commentary2","commentary3","commentary4","commentary5","commentary6",
                           "commentary7","commentary8","commentary9","commentary10"])

    def process_item(self, item, spider):
        # collect the item's values, turn them into a list, and append as a row
        data = list(item.values())
        self.sheet.append(data)
        return item

    def close_spider(self, spider):
        self.wb.save("豆瓣电影.xlsx")
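
As suggested above, a database is a better fit for larger volumes. Below is a minimal sketch of a SQLite-based pipeline; the class name, database file and table schema are my own illustrative choices, and only the movie-level fields are stored:

import sqlite3


class DoubanmovieSQLitePipeline(object):
    def open_spider(self, spider):
        self.conn = sqlite3.connect("douban.db")
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS movies "
            "(title TEXT, star TEXT, comment TEXT, quote TEXT, "
            "classification TEXT, movieurl TEXT)"
        )

    def process_item(self, item, spider):
        # the item behaves like a dict, so fields that were not set come back as None
        self.conn.execute(
            "INSERT INTO movies VALUES (?, ?, ?, ?, ?, ?)",
            (item.get('title'), item.get('star'), item.get('comment'),
             item.get('quote'), item.get('classification'), item.get('movieurl')),
        )
        return item

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()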

Of course, your IP may well get banned by the site, so be mindful of the load your crawler puts on the target. The crawl rate can be configured in settings.py (see the sketch below).
Always read the site's robots.txt before crawling.
ROBOTSTXT_OBEY = True
This is the default in a newly generated project, i.e. robots.txt is respected.
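
For reference, the relevant settings.py entries (throttling, robots.txt, and enabling the pipeline defined above) could look like the following; the delay value is only an example, not a recommendation:

# settings.py (excerpt)
ROBOTSTXT_OBEY = True               # respect robots.txt (the project default)
DOWNLOAD_DELAY = 2                  # seconds to wait between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # keep the pressure on the site low
ITEM_PIPELINES = {
    'doubanmovie.pipelines.DoubanmoviePipeline': 300,
}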
