框架Scrapy中起始url内部实现
源码解析
位置: start_requests 定义在 scrapy.Spider 基类中, 由自定义爬虫类继承, 例如:
class KuaidailiSpider(scrapy.Spider):
源码 (Scrapy 内置 Spider.start_requests 的实现):
# Scrapy's built-in Spider.start_requests, quoted from the Scrapy source.
# Yields one Request per entry in self.start_urls. For backward
# compatibility it still honors the deprecated make_requests_from_url hook.
def start_requests(self):
    cls = self.__class__
    # If the subclass still overrides the deprecated hook, emit a
    # DeprecationWarning but keep calling the hook so old spiders work.
    if method_is_overridden(cls, Spider, 'make_requests_from_url'):
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            ),
        )
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    else:
        # Normal path: dont_filter=True lets start URLs bypass the
        # duplicate-request filter.
        for url in self.start_urls:
            yield Request(url, dont_filter=True)
自定制起始url
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

"""
How the Scrapy engine pulls start URLs from the spider:
1. It calls start_requests() and takes the return value.
2. v = iter(return value)
3. Then repeatedly:
       req1 = v.__next__()
       req2 = v.__next__()
       req3 = v.__next__()
       ...
4. Every req is handed to the scheduler.
"""


class KuaidailiSpider(scrapy.Spider):
    """Demo spider showing two ways to customize the start requests."""

    name = 'kuaidaili'
    # FIX: allowed_domains must contain bare domain names, not URLs —
    # a URL entry makes OffsiteMiddleware treat follow-up requests as
    # offsite and drop them.
    allowed_domains = ['www.kuaidaili.com']
    start_urls = ['https://www.kuaidaili.com/free/']
    cookie_dict = {}

    def start_requests(self):
        # Way 1: a generator. A POST could be issued instead with
        # yield Request(url=url, method='post').
        for url in self.start_urls:
            yield Request(url=url)

        # Way 2: return a plain list of Requests instead of yielding.
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list