问题起源于初次接触scrapy时,好奇于spider子类当中的parse方法:在parse()当中为了对 next_url 发起 Request 请求,需要使用 yield 关键字(yield 并不是函数,而是使 parse() 成为生成器的语句)。
于是乎写了一段代码试验一下 yield 的行为:
import scrapy
from items import Work1Item
class Work1Spider(scrapy.Spider):
name = 'work1'
start_urls = [
'http://quotes.toscrape.com/',
]
def parse(self, response):
for quote in response.xpath('//div[@class="quote"]'):
item = Work1Item() #自定义的item
item['author'] = quote.xpath('.//small[@class="author"]/text()').extract_first()
item['tags'] = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
item['quote'] = quote.xpath('./span[@class="text"]/text()').extract_first()
yield item
next_page_url = response.xpath('//li[@