image_url = item[“image_urls”]
if image_url:
self.num + 1
yield Request(url=image_url, meta={“item”: item})
def file_path(self, request, response=None, info=None):
    """Return the relative storage path for a downloaded image.

    The result is ``desk/<image_title>.jpg``, where ``image_title`` is read
    from the item stored in ``request.meta["item"]``.

    ``request`` is normally a ``Request``. Any other value is treated as a
    bare URL coming from the deprecated ``image_key(url)`` /
    ``file_key(url)`` call style and triggers a deprecation warning.
    ``response`` and ``info`` are accepted for interface compatibility and
    are not used here.
    """
    # start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                      'please use file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() or image_key() methods have been overridden:
    # an implementation lacking the `_base` marker attribute is delegated to.
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)
    # end of deprecation warning block

    # NOTE(review): this line needs `request.meta`, so it assumes `request`
    # is a Request here; a bare URL reaching this point would fail - confirm
    # that the deprecated URL call style is never used with this pipeline.
    return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
middlewares.py
from scrapy import signals
from zol2.useragents import agents
class Zol2SpiderMiddleware(object):
    """Pass-through spider middleware for the zol2 project.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create your spiders.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response unchanged.

        Called for each response that goes through the spider
        middleware and into the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the Spider, after
        it has processed the response.

        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Do nothing and return None.

        Called when a spider or process_spider_input() method
        (from other spider middleware) raises an exception.

        Should return either None or an iterable of Response, dict
        or Item objects.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Called with the start requests of the spider, and works
        similarly to the process_spider_output() method, except
        that it doesn't have a response associated.

        Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
class Zol2DownloaderMiddleware(object):
    """Pass-through downloader middleware for the zol2 project.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the downloader middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create your spiders.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue unchanged by returning None.

        Called for each request that goes through the downloader
        middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called
        """
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged.

        Called with the response returned from the downloader.

        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing and return None.

        Called when a download handler or a process_request()
        (from other downloader middleware) raises an exception.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain
        """
        pass

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for zol2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zol2'

SPIDER_MODULES = ['zol2.spiders']
NEWSPIDER_MODULE = 'zol2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2SpiderMiddleware': 543,
#}
自我介绍一下,小编13年上海交大毕业,曾经在小公司待过,也去过华为、OPPO等大厂,18年进入阿里一直到现在。
深知大多数Python工程师,想要提升技能,往往是自己摸索成长或者是报班学习,但培训机构动辄几千的学费,着实压力不小。自己不成体系的自学低效又漫长,而且极易碰到天花板,技术停滞不前!
因此收集整理了一份《2024年Python开发全套学习资料》,初衷也很简单,就是希望能够帮助到想自学提升又不知道该从何学起的朋友,同时减轻大家的负担。
既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,基本涵盖了95%以上前端开发知识点,真正体系化!
由于文件比较大,这里只是将部分目录大纲截图出来,每个节点里面都包含大厂面经、学习笔记、源码讲义、实战项目、讲解视频,并且后续会持续更新
如果你觉得这些内容对你有帮助,可以扫码获取!!!(备注:Python)
[外链图片转存中…(img-kAHOCEfr-1713757074370)]
既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,基本涵盖了95%以上前端开发知识点,真正体系化!
由于文件比较大,这里只是将部分目录大纲截图出来,每个节点里面都包含大厂面经、学习笔记、源码讲义、实战项目、讲解视频,并且后续会持续更新
如果你觉得这些内容对你有帮助,可以扫码获取!!!(备注:Python)