Scrapy is a powerful asynchronous crawling framework; learning to use it can make our work much more efficient.
Create a new project:
scrapy startproject doubanmovie
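This generates Scrapy's standard project skeleton (recent Scrapy versions also add a middlewares.py):

doubanmovie/
    scrapy.cfg
    doubanmovie/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py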
The spider script under the spiders directory:
As a rule, each parse function handles one kind of page, and the data on each page is extracted with XPath rules. This article uses the Selector(response).xpath() method; the .css() method can be used for selection instead (a short CSS example follows the spider code below).
The parse function collects the Top250 movie information, and the parseL function collects the corresponding short comments.
Note that I put the custom request headers in the spider file itself rather than configuring them in the settings file; this keeps them more visible.
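If you would rather configure the User-Agent globally in the settings file instead, Scrapy's built-in USER_AGENT setting does the same job (a sketch; the value is the same string used in the spider below):

# settings.py -- applies to every request the project sends
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36')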
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from doubanmovie.items import DoubanmovieItem
from urllib.parse import urljoin

class Douban(scrapy.spiders.Spider):
    name = "douban"
    allowed_domains = ["douban.com"]
    # redis_key = 'douban:start_urls'
    start_urls = ['https://movie.douban.com/top250',]
    # 'https://movie.douban.com/subject/1292052/comments?sort=new_score&status=P']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    selfItem = {}

    def start_requests(self):
        return [Request(url=self.start_urls[0], callback=self.parse, headers=self.headers)]
    def parse(self, response):
        item = DoubanmovieItem()
        selector = Selector(response)
        Movies = selector.xpath('//div[@class="info"]')
        for eachMovie in Movies:
            title = eachMovie.xpath('div[@class="hd"]/a/span/text()').extract()  # several span tags
            fullTitle = "".join(title)  # join the pieces into one string
            # movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
            star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
            comment = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[4]/text()').extract()[0]
            quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            movieurl = eachMovie.xpath('div[@class="hd"]/a/@href').extract()[0]
            classifications = eachMovie.xpath('div[@class="bd"]/p[@class]/text()').extract()[1]
            classification = classifications.replace(" ", "")
            # quote may be missing, so check before indexing
            if quote:
                quote = quote[0]
            else:
                quote = ''
            item['title'] = fullTitle
            # item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            item['comment'] = comment
            item['movieurl'] = movieurl
            item['quote'] = quote
            item['classification'] = classification
            if movieurl is not None:
                commentaryURL = movieurl + 'comments?sort=new_score&status=P'
                # yield Request(commentaryURL, callback=self.parseL, headers=self.headers)
            yield item
        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        # page 10 is the last page and has no "next" link
        if nextLink:
            nextLink = nextLink[0]
            yield Request(urljoin(response.url, nextLink), callback=self.parse, headers=self.headers)
    def parseL(self, response):
        # callback for a comments page; Scrapy passes the Response in
        item = DoubanmovieItem()
        selector = Selector(response)
        selfItem = self.selfItem
        commentarys = selector.xpath('//div[@id="comments"]')
        for commentary in commentarys:
            selfItem["movies"] = selector.xpath('//div[@id="content"]/h1/text()').extract_first()
            # the ten short comments on a page sit in div[1] .. div[10]
            for i in range(1, 11):
                selfItem["commentary%d" % i] = commentary.xpath(
                    'div[%d]/div[2]/p/span/text()' % i).extract_first()
            item.update(selfItem)
            yield item
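As mentioned above, the same selections can also be written with .css(). A minimal sketch of the equivalents for a few of the fields in parse() (the selectors are assumed to match the same elements as the XPath version, not re-tested against the live page):

# CSS equivalents of the XPath expressions used in parse()
movies = response.css('div.info')
for eachMovie in movies:
    title = "".join(eachMovie.css('div.hd a span::text').getall())
    movieurl = eachMovie.css('div.hd a::attr(href)').get()
    quote = eachMovie.css('div.bd p.quote span::text').get(default='')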
items.py
items.py defines the data container. It carries the data collected by the spider on to the item pipeline. Each field of the Item is declared with scrapy.Field().
import scrapy

class DoubanmovieItem(scrapy.Item):
    title = scrapy.Field()
    movieInfo = scrapy.Field()
    star = scrapy.Field()
    classification = scrapy.Field()
    comment = scrapy.Field()      # number of ratings
    quote = scrapy.Field()
    movieurl = scrapy.Field()     # link to the movie page
    commentary1 = scrapy.Field()  # short comments 1-10
    commentary2 = scrapy.Field()
    commentary3 = scrapy.Field()
    commentary4 = scrapy.Field()
    commentary5 = scrapy.Field()
    commentary6 = scrapy.Field()
    commentary7 = scrapy.Field()
    commentary8 = scrapy.Field()
    commentary9 = scrapy.Field()
    commentary10 = scrapy.Field()
    movies = scrapy.Field()
main.py
We can create a main.py file in the project directory so that Scrapy can be launched without typing the command line every time.
from scrapy.cmdline import execute
execute("scrapy crawl douban".split())
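An alternative that skips scrapy.cmdline is Scrapy's CrawlerProcess API, which runs the spider in-process with the project settings (a sketch):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('douban')  # the spider name defined in the Douban class
process.start()          # blocks until the crawl finishes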
pipelines.py
The item pipeline is where the scraped data gets stored. By defining a pipeline class we decide how the data is saved; here I write it to an Excel file.
If the data volume is large, I recommend saving to a database instead, to avoid a file that grows too big for convenient analysis (see the sketch after the pipeline code).
import openpyxl

class DoubanmoviePipeline(object):
    def open_spider(self, spider):
        self.wb = openpyxl.Workbook()
        self.sheet = self.wb.active
        self.sheet.append(['title', 'star', 'comment', 'quote', 'classification', 'movieurl',
                           'commentary1', 'commentary2', 'commentary3', 'commentary4', 'commentary5',
                           'commentary6', 'commentary7', 'commentary8', 'commentary9', 'commentary10'])

    def process_item(self, item, spider):
        # take all values of the item dict; the order follows the order in which fields were set
        temp = item.values()
        # convert them to a list
        data = list(temp)
        # append the row to the Excel sheet
        self.sheet.append(data)
        return item

    def close_spider(self, spider):
        self.wb.save("豆瓣电影.xlsx")
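Remember that a pipeline only runs after it is enabled in the settings file (the generated template ships with this commented out):

ITEM_PIPELINES = {
    'doubanmovie.pipelines.DoubanmoviePipeline': 300,
}

And as suggested above, for larger volumes a database is the better target. A minimal sketch using Python's built-in sqlite3 module (the file, table, and column names are illustrative, not part of the original project):

import sqlite3

class SqlitePipeline(object):
    def open_spider(self, spider):
        self.conn = sqlite3.connect('douban.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS movies '
            '(title TEXT, star TEXT, comment TEXT, quote TEXT)')

    def process_item(self, item, spider):
        # item.get() returns None for fields that were never set
        self.conn.execute(
            'INSERT INTO movies VALUES (?, ?, ?, ?)',
            (item.get('title'), item.get('star'),
             item.get('comment'), item.get('quote')))
        return item

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()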
Of course, your IP may well get banned by the site while crawling, so be mindful of the load the crawler puts on the target site; the crawl frequency can be configured in the settings file (an example follows below). Always read the site's robots.txt before crawling.
ROBOTSTXT_OBEY = True
The default is True, i.e. the robots.txt rules are obeyed.
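The crawl frequency mentioned above is controlled by a handful of built-in settings, for example (the values here are illustrative):

DOWNLOAD_DELAY = 2                   # seconds to wait between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True          # let Scrapy adapt the delay to server load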
This article has shown how to use the Scrapy framework to crawl Douban Movie Top250 and its short comments, walking through the spider script, data extraction, and storage.