# -*- coding: utf-8 -*-
# Scrapy settings for morekeywords project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys
# Put a directory on sys.path so the Django project package ("dataAnalysis")
# is importable from the crawler process.
# NOTE(review): abspath(".") resolves the *current working directory*, not the
# location of this file, so this only works when scrapy is launched from the
# expected directory — TODO confirm and consider deriving from __file__.
# Original author's notes (translated):
#   dirname(abspath("."))      -> D:\django\dataAnalysis\morekeywords  ("correct")
#   dirname(abspath(__file__)) -> D:\django\dataAnalysis\morekeywords\morekeywords
#                                 ("wrong: No module named 'dataAnalysis'")
sys.path.append(os.path.dirname(os.path.abspath(".")))
# Hard-coded alternative kept for reference:
# sys.path.append(r"D:\django\dataAnalysis\dataAnalysis")
# Tell Django which settings module to use; must be set before django.setup().
os.environ["DJANGO_SETTINGS_MODULE"] = "dataAnalysis.settings"
import django
django.setup()  # Initialise Django (first step of Scrapy/Django interop); the spider must be deployed before it can start.
# NOTE(review): the author reported that even with this setup the crawler
# sometimes failed with "ModuleNotFoundError: No module named 'dataAnalysis'" —
# the sys.path entry above is the likely culprit.
# ---------------------------------------------------------------------
# Core Scrapy project settings.
BOT_NAME = 'morekeywords'
SPIDER_MODULES = ['morekeywords.spiders']
NEWSPIDER_MODULE = 'morekeywords.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'morekeywords (+http://www.yourdomain.com)'
# Do NOT obey robots.txt rules.
ROBOTSTXT_OBEY = False
# Maximum number of concurrent (simultaneous) requests performed by the
# Scrapy downloader (default: 16).
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Seconds the downloader waits before fetching consecutive pages from the same
# site. Throttles the crawl so we don't hammer the server; fractions allowed.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default): some sites identify and profile users
# via cookies, so avoid sending any session-identifying cookie data.
COOKIES_ENABLED = False
# Encoding used for feed exports (per the Scrapy documentation).
FEED_EXPORT_ENCODING = "utf-8"
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# The telnet console can be used to inspect and control the running crawler:
# connect with `telnet <host> <port>`, then issue commands.
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
# Override the default request headers. These mimic a browser XHR/JSON request
# to amazon.com (Origin/Referer/Sec-Fetch-* headers included).
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Origin': 'https://www.amazon.com',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.amazon.com/ref=nav_logo?language=en_US',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
RANDOM_UA_TYPE = "chrome"  # User-agent family for the random-UA middleware: Chrome browser UAs.
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'morekeywords.middlewares.MorekeywordsSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'morekeywords.middlewares.MorekeywordsDownloaderMiddleware': 543,
    # Project middleware that injects a random User-Agent per request.
    # NOTE(review): "Middlware" is misspelled, but it must match the class name
    # in middlewares.py exactly — verify both sides before renaming either.
    'morekeywords.middlewares.RandomUserAgentMiddlware': 542,
    # Scrapy's built-in UA middleware must be set to None so it does not
    # override the random one above.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Custom extensions are hooked in via Scrapy signals.
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'morekeywords.pipelines.MorekeywordsPipeline': 300,
    # Persists scraped items to MySQL (see pipelines.py).
    'morekeywords.pipelines.MysqlPipeline': 301,
}
# Custom URL de-duplication filter (currently disabled).
# DUPEFILTER_CLASS = 'morekeywords.duplication.RepeatUrl'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True            # turn automatic rate limiting on
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5           # initial download delay (seconds)
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60            # maximum download delay (seconds)
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average parallel requests per server
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
"""
启用缓存
目的用于将已经发送的请求或相应缓存下来,以便以后使用
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True # 是否启用缓存策略
# HTTPCACHE_EXPIRATION_SECS = 0 # 缓存超时时间
# HTTPCACHE_DIR = 'httpcache' # 缓存保存路径
# HTTPCACHE_IGNORE_HTTP_CODES = [] # 缓存忽略的Http状态码
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' # 缓存存储的插件
记忆碎片之python爬虫scrapy框架settings.py参数
最新推荐文章于 2021-09-29 17:02:00 发布