使用 scrapy 框架的 crawl 模板爬取豆瓣 Top250 时虽然报错但仍能正常抓到数据,只是翻页功能无法实现。spider.py 代码:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import time
import random
class CrawlMovieSpider(CrawlSpider):
    """Crawl Douban Top 250: follow pagination, scrape each movie detail page.

    Yields a dict per movie with keys ``title``, ``score`` and ``text``.
    """

    name = 'crawl_movie'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250?start=0&filter=']

    rules = (
        # Detail-page URLs, e.g. https://movie.douban.com/subject/1292052/
        Rule(
            LinkExtractor(allow=r'^https://movie.douban.com/subject/\d+/$'),
            callback='parse_item',
            follow=False,
        ),
        # Pagination ("next page" link). NOTE: restrict_xpaths must select
        # the <a> ELEMENT (or a container holding it), NOT its @href
        # attribute -- LinkExtractor pulls the href out itself. The original
        # '...//a/@href' form is why paging never worked.
        # follow=True keeps applying this rule on every page it reaches.
        Rule(
            LinkExtractor(restrict_xpaths='//span[@class="next"]/a'),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Parse a single movie detail page.

        :param response: scrapy Response for a /subject/<id>/ page
        :return: dict with ``title``, ``score`` and ``text``
        """
        item = {}
        # Movie title: a single node, so .get() returns one string (or None).
        title = response.xpath('//h1/span[1]/text()').get()
        # Movie score
        score = response.xpath('//strong/text()').get()
        # Movie synopsis: the text may be split across two <span> nodes,
        # so take both and join them.
        text_list = response.xpath(
            '//div[@class="indent"]/span[1]/text()'
            '|//div[@class="indent"]/span[2]/text()'
        ).extract()
        text = ''.join(text_list)
        # Strip full-width spaces (U+3000), ASCII spaces and newlines.
        text = text.replace('\u3000', '').replace(' ', '').replace('\n', '')
        item['title'] = title
        item['score'] = score
        item['text'] = text
        return item
这个错误的原因是 restrict_xpaths 应该指向元素(即 <a> 链接本身或包含链接的容器),而不是它的 @href 属性——LinkExtractor 会自己从元素里提取 href。代码中用的是 a/@href,改成 restrict_xpaths='//span[@class="next"]/a' 即可让翻页正常工作。
更详细的讲解可以参考:https://www.cnblogs.com/lei0213/p/8097515.html
如果对你有所帮助,麻烦点个赞鼓励一下,谢谢!