作用
下载器中间件,实现请求的包装
User-Agent池
发少量的请求时:
settings.py:
USER_AGENT = " XXX"
DEFAULT_REQUEST_HEADERS = {"User-Agent": "xxxx"}
固定的
爬虫文件中:
yield scrapy.Request(url=,meta={},callback=,headers={},cookies={})
包装每个请求
#middlewares.py
#包装User-Agent
from fake_useragent import UserAgent
class MyBaiduMiddleware:
    """Downloader middleware that stamps a random User-Agent onto every request.

    Fix: the original built a brand-new ``UserAgent()`` inside
    ``process_request``, i.e. once per outgoing request.  Constructing a
    ``UserAgent`` is expensive (fake_useragent loads its browser-data cache,
    possibly over the network), so the provider is now created once per
    middleware instance and reused.
    """

    def __init__(self):
        # One shared provider; ``.random`` is cheap, construction is not.
        self._ua = UserAgent()

    def process_request(self, request, spider):
        """Attach a fresh random User-Agent header to the outgoing request.

        Returning None (implicitly) tells Scrapy to continue processing
        the request through the remaining middlewares.
        """
        request.headers["User-Agent"] = self._ua.random
        print("包装的User-Agent:", request.headers.get("User-Agent"))
#随机代理ip
import random
from .proxy import proxy
class MyBaiduMiddle2:
    """Downloader middleware that routes each request through a random proxy."""

    def process_request(self, request, spider):
        """Pick one entry from the proxy pool and attach it to the request.

        Scrapy reads the proxy endpoint from ``request.meta["proxy"]``.
        """
        request.meta["proxy"] = random.choice(proxy)
安装并开启下载器中间件
若当前代理IP不能用,scrapy 重试三次,均不能用,则抛出异常
中间件中 定义异常处理
def process_exception(self, request, exception, spider):
    """Retry a failed request a bounded number of times, then give up.

    The original returned the request unconditionally, which makes Scrapy
    re-schedule it forever when the current proxy is permanently dead (the
    source's own note said "scrapy会一直重试" — scrapy retries endlessly).
    A per-request counter stored in ``request.meta`` caps the retries.

    Returns:
        The request (to re-schedule it) while retries remain, otherwise
        None so Scrapy's default exception handling takes over.
    """
    max_retries = 3
    attempts = request.meta.get("_proxy_retry_times", 0)
    if attempts < max_retries:
        request.meta["_proxy_retry_times"] = attempts + 1
        # Returning the request tells Scrapy to schedule it again
        # (typically after another middleware swaps in a new proxy).
        return request
    # Retries exhausted: fall through to default error handling / errback.
    return None
#scrapy会一直重试
#随机cookies,单独的一个cookie高频率访问,也可被识别
#随机cookies
class MyBaiduMiddleware3:
    """Downloader middleware that attaches a pre-captured cookie jar to every
    request, so a single cookie is not reused at high frequency (which sites
    can fingerprint and block).
    """

    def process_request(self, request, spider):
        """Set the parsed cookie dict on the outgoing request.

        Scrapy sends ``request.cookies`` with the request when
        ``COOKIES_ENABLED = True`` is set in the project settings.
        """
        cookie_dict = self.get_cookie()
        request.cookies = cookie_dict
        print("cookies:", cookie_dict)

    def get_cookie(self):
        """Parse the raw ``k=v; k=v`` cookie-header string into a dict.

        Fix: values may themselves contain ``=`` (e.g. ``BAIDUID=...:FG=1``
        in the string below), so the original ``i.split("=")`` raised
        ``ValueError: too many values to unpack``.  Splitting with
        ``split("=", 1)`` keeps everything after the first ``=`` as the value.

        Returns:
            dict mapping cookie names to their (string) values.
        """
        cookie_str = """BAIDUID=EBD412F8194971612439F501075582CA:FG=1; PSTM=1620650226; BIDUPSID=F54624FE6C7615FCE4623BE8F79C263F; BD_UPN=12314753; __yjs_duid=1_c861f54cc0acddb0511c63bc19e3dd231620826973310; MCITY=-286:; BAIDUID_BFESS=EBD412F8194971612439F501075582CA:FG=1; Hm_lvt_aec699bb6442ba076c8981c6dc490771=1622024731,1622025237,1622040739,1622195675; COOKIE_SESSION=66449_1_9_0_30_143_0_9_9_9_5_17_0_0_5493_2_1622195190_1622256148_1622256146|9#917081_27_1622256146|9; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[FIXrT5n2Tgt]=mk3SLVN4HKm; BD_HOME=1; H_PS_PSSID=33986_34004_33773_33855_33607_34076_26350; BA_HECTOR=8l2l00818g8l8h8k4u1gb44730q"""
        cookie_dict = {}
        for pair in cookie_str.split("; "):
            # maxsplit=1: only the first '=' separates name from value.
            k, v = pair.split("=", 1)
            cookie_dict[k] = v
        return cookie_dict
注意:
在配置文件中,
#注释时,不使用cookie
#取消注释,False, 使用default_request_headers中的Cookie
#取消注释,True,使用Request()中的cookies
COOKIES_ENABLED = True
安装中间件,一个类是一个中间件
代码
提取码:5413