scrapy作为强大的采集框架,有几个基本配置,大家一定要掌握。下面猫哥一一为大家介绍。
- 编辑settings.py文件
# Upper bound (seconds) of the random download delay; read by RandomDelayMiddleware.
RANDOM_DELAY = 1
# DOWNLOADER_MIDDLEWARES setting: register the custom random-delay middleware (priority 150).
DOWNLOADER_MIDDLEWARES = {
'DemoProjects.middlewares.RandomDelayMiddleware': 150,
}
- 编辑middlewares文件
# 设置随机延时
class RandomDelayMiddleware(object):
    """Downloader middleware that sleeps a random interval before each request.

    The upper bound comes from the RANDOM_DELAY setting (seconds, default 10).
    """

    def __init__(self, delay):
        # Maximum delay in seconds; each request waits uniform(0, delay).
        self.delay = delay

    @classmethod
    def from_crawler(cls, crawler):
        # BUG FIX: read crawler.settings directly. Downloader middlewares are
        # constructed before a spider is bound to the crawler, so the original
        # crawler.spider.settings raised AttributeError at startup.
        delay = crawler.settings.get("RANDOM_DELAY", 10)
        # Generalized: accept float delays as well as ints (backward compatible).
        if not isinstance(delay, (int, float)):
            raise ValueError("RANDOM_DELAY must be an int or float")
        return cls(delay)

    def process_request(self, request, spider):
        # Uniform random delay in [0, self.delay], rounded to 0.1 s for logging.
        delay = float("%.1f" % random.uniform(0, self.delay))
        logging.debug("### random delay: %s s ###" % delay)
        time.sleep(delay)
- 编辑settings.py文件
# Fallback/default User-Agent, used when the random-UA middleware does not set one.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
# Attribute name looked up on fake_useragent.UserAgent (e.g. "random", "chrome").
RANDOM_UA_TYPE = "random"
# DOWNLOADER_MIDDLEWARES setting: register the random User-Agent middleware (priority 100).
DOWNLOADER_MIDDLEWARES = {
'DemoProjects.middlewares.RandomUserAgentMiddlware': 100,
}
- 编辑middlewares文件
# pip install fake_useragent
# 导入UserAgent
from fake_useragent import UserAgent
# 随机更换user-agent方法
class RandomUserAgentMiddlware(object):
    """Downloader middleware that installs a random User-Agent on each request.

    The fake_useragent lookup strategy (e.g. "random", "chrome") comes from
    the RANDOM_UA_TYPE setting and defaults to "random".
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent(use_cache_server=False, verify_ssl=False)
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook: hand the crawler to the constructor.
        return cls(crawler)

    def process_request(self, request, spider):
        # Resolve the configured attribute on the UserAgent instance
        # (e.g. self.ua.random) and install it only if the request
        # does not already carry a User-Agent header.
        user_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', user_agent)
- 编辑settings.py文件
# DOWNLOADER_MIDDLEWARES setting: register the random-proxy middleware (priority 50).
DOWNLOADER_MIDDLEWARES = {
    # BUG FIX: this section enables the proxy middleware defined below; the
    # original re-registered RandomUserAgentMiddlware by copy-paste mistake.
    'DemoProjects.middlewares.RandomProxyMiddleware': 50,
}
- 编辑middlewares文件
# GetIP方法是自己定义的,用于返回一个IP,IP格式:https://58.218.92.167:2303
import GetIP
class RandomProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy.

    Uses the project-local GetIP helper, which draws an address from an IP
    pool (format e.g. "https://58.218.92.167:2303").
    """

    def __init__(self):
        # PERF FIX: build the IP-pool client once, not on every request
        # (the original re-created it inside process_request).
        # NOTE(review): with `import GetIP` at module level this call targets
        # the module object, which is not callable — confirm the intended
        # import is `from GetIP import GetIP`.
        self.get_ip = GetIP()

    def process_request(self, request, spider):
        # Draw a random proxy from the pool and attach it to the request.
        proxy_ip = self.get_ip.get_random_ip()
        request.meta["proxy"] = proxy_ip
完毕!
⚠️ 不懂的地方,欢迎在评论区留言~