0. Set it in settings.py
# settings.py
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
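If only one spider needs a different User-Agent, the project-wide setting can also be overridden per spider through the custom_settings class attribute. A minimal sketch, assuming a spider named my_spider (the UA string is just an example):
# my_spider.py
import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'
    # Per-spider override of the project-wide USER_AGENT setting
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    }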
1. Set headers and cookies on the Request object
You can pass the headers and cookies parameters directly when creating a Request object:
import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'

    def start_requests(self):
        url = 'http://example.com'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        }
        cookies = {
            'name': 'value',
        }
        yield scrapy.Request(
            url,
            headers=headers,
            cookies=cookies,
            callback=self.parse
        )

    def parse(self, response):
        # Your parsing code here
        pass
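Besides a plain dict, Request also accepts cookies as a list of dicts, which lets a domain and path be attached to each cookie. A short sketch (example.com is a placeholder domain):
# Alternative: cookies as a list of dicts so domain/path can be specified
# (inside start_requests, as above)
cookies = [
    {
        'name': 'name',
        'value': 'value',
        'domain': 'example.com',
        'path': '/',
    },
]
yield scrapy.Request(url, cookies=cookies, callback=self.parse)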
2. Set default headers and cookies in settings.py
You can define default request headers in Scrapy's settings file settings.py so that every request uses them. DEFAULT_REQUEST_HEADERS is a built-in setting; note that there is no built-in COOKIES setting, so a COOKIES dict in settings.py is only a custom value that your own code has to read and apply (see the sketch after this example):
# settings.py
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Language': 'en',
}
# Custom setting (not interpreted by Scrapy itself); read it in your spider or middleware
COOKIES = {
    'name': 'value',
}
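A minimal sketch of how such a custom COOKIES value could be consumed in a spider; this is illustrative, not something Scrapy does automatically:
# my_spider.py
import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'

    def start_requests(self):
        # Read the custom COOKIES dict defined in settings.py and apply it manually
        cookies = self.settings.getdict('COOKIES')
        yield scrapy.Request('http://example.com', cookies=cookies, callback=self.parse)

    def parse(self, response):
        pass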
3. Use downloader middlewares
You can write or configure a downloader middleware to set or modify request headers and cookies dynamically. A middleware lets you apply more complex processing to every request and response (a process_response sketch follows this example):
# middlewares.py
class MyCustomDownloaderMiddleware:
    def process_request(self, request, spider):
        request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        request.cookies['name'] = 'value'
        return None
# settings.py
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
}
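The same middleware can also hook into responses via process_response, for example to spot requests that look blocked; a small sketch (the status codes chosen are just an example):
# middlewares.py
class MyCustomDownloaderMiddleware:
    def process_request(self, request, spider):
        request.headers['User-Agent'] = 'Mozilla/5.0 ...'  # as above
        return None

    def process_response(self, request, response, spider):
        # Inspect (or replace) responses; here we only log suspicious status codes
        if response.status in (403, 429):
            spider.logger.warning('Possible block: %s returned %s', request.url, response.status)
        return response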
4. Random User-Agent in a middleware
# middlewares.py
import random

class MyCustomDownloaderMiddleware:
    # Pool of User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6"
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for each request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.cookies['name'] = 'value'
        return None
# settings.py
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
}
Alternatively:
# middlewares.py
"""
Install: pip install fake_useragent
"""
from fake_useragent import UserAgent

class MyCustomDownloaderMiddleware:
    def process_request(self, request, spider):
        # Use the third-party fake_useragent library to pick a random User-Agent
        request.headers['User-Agent'] = UserAgent().random
        request.cookies['name'] = 'value'
        return None
# settings.py
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
}
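Creating a new UserAgent() on every request is relatively expensive, because fake_useragent may load its data set on instantiation; a common refinement is to build it once in __init__ and reuse it (a sketch under that assumption):
# middlewares.py
from fake_useragent import UserAgent

class MyCustomDownloaderMiddleware:
    def __init__(self):
        # Build the UserAgent object once and reuse it for every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random
        return None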
5. Set a proxy
# middlewares.py
class MyCustomDownloaderMiddleware:
    def process_request(self, request, spider):
        request.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        request.cookies['name'] = 'value'
        # Route the request through a proxy
        proxy = 'https://1.71.188.37:3128'
        request.meta['proxy'] = proxy
        return None
# settings.py
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyCustomDownloaderMiddleware': 543,
}
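The proxy does not have to be set in a middleware: request.meta['proxy'] can also be set per request in the spider, and Scrapy's built-in HttpProxyMiddleware will pick up credentials embedded in the proxy URL. A minimal sketch (the proxy address and user:password are placeholders):
# my_spider.py
import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'

    def start_requests(self):
        yield scrapy.Request(
            'http://example.com',
            # Credentials in the proxy URL are handled by the built-in HttpProxyMiddleware
            meta={'proxy': 'http://user:password@1.71.188.37:3128'},
            callback=self.parse,
        )

    def parse(self, response):
        pass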
6. Extend the built-in CookiesMiddleware
You can subclass Scrapy's built-in CookiesMiddleware to set global cookies:
# middlewares.py
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware

class CustomCookiesMiddleware(CookiesMiddleware):
    def process_request(self, request, spider):
        request.cookies = {
            'name': 'value',
        }
        return super().process_request(request, spider)
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
    'myproject.middlewares.CustomCookiesMiddleware': 700,
}
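Two related built-in settings are useful here: COOKIES_ENABLED turns Scrapy's cookie handling on or off, and COOKIES_DEBUG logs the Cookie/Set-Cookie headers of every request and response, which helps verify that the middleware above actually takes effect:
# settings.py
COOKIES_ENABLED = True   # default; set to False to disable cookie handling entirely
COOKIES_DEBUG = True     # log cookies sent and received for every request/response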
7. Turning a cookie string into a dict
cookies = "t=e78029a1905a443ea3d54e5a95beab80; r=566; Hm_lvt_f5329ae3e00629a7bb8ad78d0efb7273=1718169339; Hm_lpvt_f5329ae3e00629a7bb8ad78d0efb7273=1718169533"
cookie = {}
for e in cookies.split('; '):
    # Split on the first '=' only, in case the value itself contains '='
    k, v = e.split('=', 1)
    cookie[k] = v
print(cookie)
print("---------------------------------------------------")
# The same conversion as a dict comprehension
cookie2 = {e.split('=', 1)[0]: e.split('=', 1)[1] for e in cookies.split('; ')}
print("dict comprehension ----->", cookie2)