# Overriding Scrapy's default first-request mechanism (start_requests rewrite)
# -*- coding: utf-8 -*-
import scrapy
from ..items import MaoyanItem
class MaoyanSpider(scrapy.Spider):
    """Spider for the Maoyan Top-100 movie board.

    Overrides start_requests() so that every page URL is handed to the
    scheduler up front, instead of relying on the default start_urls
    mechanism. Each response is parsed into MaoyanItem objects carrying
    the movie name, stars, and release time.
    """

    name = 'maoyan3'
    # Restrict the crawl to this domain.
    allowed_domains = ['maoyan.com']

    # Override start_requests() and hand every URL to the scheduler.
    def start_requests(self):
        # Queue all page URLs with the scheduler at once.
        # offset takes 0, 10, 20, ..., 90 — the 10 pages of the board.
        for offset in range(0, 91, 10):
            url = 'https://maoyan.com/board/4?offset={}'.format(offset)
            # Hand the request to the scheduler.
            yield scrapy.Request(
                url=url,
                callback=self.parse_html
            )

    def parse_html(self, response):
        """Parse one board page and yield one item per movie entry."""
        # Base XPath: one <dd> node per movie on the board.
        dd_list = response.xpath('//dl[@class="board-wrapper"]/dd')
        # Iterate over each movie entry in turn.
        for dd in dd_list:
            # Create the item object.
            item = MaoyanItem()
            # Movie name.
            # Without extract_first() we would get a list of selectors;
            # we want the plain string value.
            # NOTE(review): extract_first() returns None when the XPath
            # matches nothing, which would make .strip() raise — confirm
            # the page always contains these nodes, or add a default.
            item["name"] = dd.xpath("./a/@title").extract_first().strip()
            # Movie stars (cast).
            item["star"] = dd.xpath(".//p[@class='star']/text()").extract_first().strip()
            # Release time.
            item["time"] = dd.xpath('.//p[@class="releasetime"]/text()').extract_first().strip()
            # Hand the scraped data to the item pipeline.
            # NOTE(review): a `yield item` presumably follows here — the
            # rest of this method is not visible in this chunk.