#1. name = 'amazon'
Defines the spider's name; Scrapy uses this value to locate the spider,
so it is required and must be unique (In Python 2 this must be ASCII only.)
#2. allowed_domains = ['www.amazon.cn']
Defines the domains the spider is allowed to crawl. If OffsiteMiddleware is enabled
(it is by default), requests for URLs that do not belong to a domain in this list
(or one of its subdomains) are filtered out.
If the URL to crawl is https://www.example.com/1.html, add 'example.com' to the list.
#3. start_urls = ['http://www.amazon.cn/']
If no URLs are specified explicitly, the first requests are generated from this list.
#4. custom_settings
A dict of settings that override the project-level settings while this spider runs.
Because settings are loaded before the spider class is instantiated, custom_settings
must be defined as a class attribute.
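For example, a spider can tune a couple of settings just for itself (a minimal sketch; CONCURRENT_REQUESTS and DOWNLOAD_DELAY are standard Scrapy settings, chosen here purely for illustration):

import scrapy

class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    # must be a class attribute: Scrapy reads it before instantiating the spider
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,  # overrides the project-level value
        'DOWNLOAD_DELAY': 1,
    }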
#5. settings
Settings from settings.py can be read via self.settings['SETTING_NAME']; values you
defined in custom_settings take precedence.
#6. logger
A logger whose name defaults to the spider's name:
self.logger.debug('=============>%s' % self.settings['BOT_NAME'])
#7. crawler: for reference
This attribute is set inside the class method from_crawler when the spider is created.
#8. from_crawler(crawler, *args, **kwargs): for reference
You probably won’t need to override this directly because the default implementation acts as a proxy to the __init__() method, calling it with the given arguments args and named arguments kwargs.
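If you do override it, call the parent implementation first (a minimal sketch; bot_name is just an illustrative attribute, not part of the Scrapy API):

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # self.crawler and self.settings are available from this point on
        spider.bot_name = crawler.settings.get('BOT_NAME')
        return spider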
#9. start_requests()
This method generates the first Requests and must return an iterable. Scrapy calls it
exactly once, when the spider is opened.
By default it takes each url in start_urls and yields Request(url, dont_filter=True).
# For the dont_filter parameter, see the deduplication rules below.
Override this method if you want to change the initial Requests, as in this example:
import scrapy
from urllib import parse

class ImagezzSpider(scrapy.Spider):
    name = 'Imagezz'
    allowed_domains = ['image.so.com']

    def start_requests(self):
        base_url = 'http://image.so.com/zj?'
        data = {'ch': 'photography', 'listtype': 'new'}
        for page in range(1, 2):
            data['sn'] = page * 30
            params = parse.urlencode(data)
            url = base_url + params
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass
#10. parse(response)
The default callback. Every callback must return an iterable of Request and/or dicts or Item objects.
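For instance, a minimal sketch of a parse callback (the CSS selectors are placeholders):

import scrapy

class TitleSpider(scrapy.Spider):
    name = 'title_demo'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        # yield an item (a plain dict works)
        yield {'url': response.url, 'title': response.css('title::text').get()}
        # and/or follow links with further Requests
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)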
#11. log(message[, level, component]): for reference
Wrapper that sends a log message through the Spider’s logger, kept for backwards compatibility. For more information see Logging from Spiders.
#12. closed(reason)
Triggered automatically when the spider finishes, for example:
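A minimal sketch (the reason string is typically 'finished' or 'shutdown'):

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def closed(self, reason):
        self.logger.info('spider closed, reason: %s', reason)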
## Deduplication
# Method 1:
1. Add a class attribute:
    visited = set()  # class attribute
2. Check it inside the parse callback:
    def parse(self, response):
        if response.url in self.visited:
            return None
        .......
        self.visited.add(response.url)
# Method 1 improved: URLs can be long, so store the md5 hash of the URL instead
    from hashlib import md5

    def parse(self, response):
        url = md5(response.request.url.encode('utf-8')).hexdigest()
        if url in self.visited:
            return None
        .......
        self.visited.add(url)
# Method 2: Scrapy's built-in deduplication
In settings.py:
    DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # the default dedup rule; fingerprints are kept in memory
    DUPEFILTER_DEBUG = False
    JOBDIR = "path for the visited-request log, e.g.: /root/"  # the final path is /root/requests.seen; fingerprints are kept in a file
The built-in dedup rule defaults to RFPDupeFilter; we only need to pass
Request(..., dont_filter=False). dont_filter=True tells Scrapy this URL should skip deduplication.
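A minimal sketch of where dont_filter fits (the login URL is only illustrative):

import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login_demo'

    def start_requests(self):
        # dont_filter=True: this request bypasses RFPDupeFilter entirely,
        # which is also why the default start_requests() uses it for start_urls
        yield scrapy.Request('http://www.example.com/login',
                             callback=self.parse, dont_filter=True)

    def parse(self, response):
        pass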
# Method 3:
We can also write a custom dedup rule modeled on RFPDupeFilter:
from scrapy.dupefilters import RFPDupeFilter  # read its source and mirror BaseDupeFilter
# Step 1: create a custom dedup file customdupefilter.py in the project directory
'''
How Scrapy instantiates the dupefilter (simplified):
if hasattr(MyDupeFilter, 'from_settings'):
    func = getattr(MyDupeFilter, 'from_settings')
    obj = func(settings)
else:
    obj = MyDupeFilter()
'''
class MyDupeFilter(object):
    def __init__(self):
        self.visited = set()

    @classmethod
    def from_settings(cls, settings):
        '''Read the settings file'''
        return cls()

    def request_seen(self, request):
        '''Has this request been seen before? This is the method the dedup rule actually calls'''
        if request.url in self.visited:
            return True
        self.visited.add(request.url)
        return False

    def open(self):  # can return deferred
        '''Runs when the filter is opened'''
        pass

    def close(self, reason):  # can return a deferred
        '''Runs when the spider closes'''
        pass

    def log(self, request, spider):  # log that a request has been filtered
        '''Log the filtered request'''
        pass
# Step 2: in settings.py
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # the class Scrapy looks up for dedup by default
# Custom dedup rule:
DUPEFILTER_CLASS = 'AMAZON.customdupefilter.MyDupeFilter'
# Source analysis:
from scrapy.core.scheduler import Scheduler
See the enqueue_request method of Scheduler, which calls self.df.request_seen(request):
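Paraphrased from the Scrapy source (a simplified sketch, not the verbatim implementation):

def enqueue_request(self, request):
    # drop the request when it allows filtering and has been seen before
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return False
    ...  # otherwise push the request onto the scheduler's queue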
## Passing parameters
We may need to pass arguments to the spider from the command line, e.g. an initial url:
# Run from the command line
scrapy crawl myspider -a category=electronics
# Receive the external argument in __init__
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, category=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = ['http://www.example.com/categories/%s' % category]
        # ...
# Note: every received argument is a string; for structured data, decode it with something like json.loads, as in the sketch below.
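A minimal sketch (the urls argument name is illustrative):
# scrapy crawl myspider -a urls='["http://www.example.com/a", "http://www.example.com/b"]'

import json
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, urls='[]', *args, **kwargs):
        super().__init__(*args, **kwargs)
        # -a arguments always arrive as strings; decode the JSON here
        self.start_urls = json.loads(urls)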
## Downloader middleware examples
# Pick a random User-Agent for each request (the class name is illustrative)
import random

class RandomUserAgentMiddleware(object):
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)',
    ]

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
# Route each request through a random proxy (the class name is illustrative)
class RandomProxyMiddleware(object):
    PROXIES = ["124.152.32.140:53281", "119.48.179.46:9999", "121.233.207.142:9999",
               "111.177.187.119:9999", "219.145.170.23:8888"]

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        request.meta['download_timeout'] = 10  # set the timeout
        request.meta['proxy'] = 'http://' + random.choice(self.PROXIES)
        # request.meta['proxy'] = 'http://user:pwd@ip:port'  # proxy with authentication
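Neither middleware takes effect until it is registered in settings.py (the module path AMAZON.middlewares is an assumed location):

DOWNLOADER_MIDDLEWARES = {
    'AMAZON.middlewares.RandomUserAgentMiddleware': 543,
    'AMAZON.middlewares.RandomProxyMiddleware': 544,
}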