import re

import scrapy
from scrapy.http.cookies import CookieJar  # subclasses stdlib http.cookiejar; similar API
# Module-level jar that accumulates cookies extracted from responses.
cookie_jar = CookieJar()
# Plain name=value mapping pulled out of the jar, for attaching to later requests.
real_cookie = dict()
# Below: cookie extraction, starting from a simulated login.
class MySpider(scrapy.Spider):
    """Spider that logs in, captures the session cookies from the login
    response via a CookieJar, and re-sends them on later requests.

    Reconstructed from a whitespace-mangled snippet: the original elided
    the spider boilerplate with '....' lines and used pseudo-code for the
    login-success check — those spots are marked TODO below.
    """

    name = "my_spider"  # placeholder; original snippet elided the spider boilerplate

    # name=value pairs extracted from the CookieJar. Class-level attribute so
    # the methods below can read and update it as self.real_cookie (the
    # original referenced self.real_cookie but only ever defined a module-level
    # dict, which is unreachable through self).
    real_cookie = {}

    def login(self, response):
        """Submit the login form; check_login verifies the result."""
        # NOTE(review): login_url and the credential values were placeholders
        # ('xxx') in the original snippet — fill in real values.
        return [scrapy.FormRequest(
            url=login_url,
            formdata={'username': 'xxx', 'password': 'xxx'},
            callback=self.check_login,
        )]

    def save_cookie(self, cookie_jar):
        """Parse every cookie repr in *cookie_jar* into self.real_cookie.

        Each cookie's repr looks like "<Cookie name=value for domain/path>";
        the regex captures the "name=value" part, which is then split once
        on '=' and merged into the dict.
        """
        # Compile once, outside the loop (the original recompiled per cookie).
        pattern = re.compile(r'<Cookie (.*?) for .*?>')
        for cookie in cookie_jar:
            matches = re.findall(pattern, str(cookie))
            # Guard against no match (original indexed matches[0] unconditionally).
            if matches and '=' in matches[0]:
                # Split at the first '=' only, so values containing '=' survive.
                pairs = (m.split('=', 1) for m in matches)
                self.real_cookie.update(dict(pairs))

    def check_login(self, response):
        """After the login POST: on success, harvest the session cookies."""
        # TODO: replace with a real success test (e.g. look for a marker in
        # response.text) — the original snippet used pseudo-code here.
        if self._login_succeeded(response):
            # The login state now sits in the response's Set-Cookie headers;
            # extract_cookies() pulls it into the module-level jar.
            # (Original called an undefined name 'cookiejar' — fixed to
            # match the module-level 'cookie_jar'.)
            cookie_jar.extract_cookies(response, response.request)
            self.save_cookie(cookie_jar)
            # Later requests can carry the captured cookies explicitly.
            # NOTE(review): 'url' and the callback are placeholders from the
            # original snippet.
            yield scrapy.Request(url, callback=self.parse, cookies=self.real_cookie)

    def _login_succeeded(self, response):
        """Placeholder for the original pseudo-code condition '登录成功'."""
        raise NotImplementedError("implement the login-success check")