Logging in with cookies, using Douban's personal mail (Doumail) as an example
# -*- coding: utf-8 -*-
import scrapy
from douban_movie_rank.items import DoubanMovieRankItem
from scrapy.http import Request


class BasicSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['douban.com']

    # Customize the initial requests instead of using start_urls
    def start_requests(self):
        # Raw Cookie header copied from the browser's developer tools;
        # the trailing "\" continues the string across source lines
        cookies = """
bid=b_VAJAFawms; _vwo_uuid_v2=DC93755BE00D1BD9609C359C130817EAE|bf117511c89e3c610565aca0bc40d0b5; _ga=GA1.\
2.935052215.1531653270; ll="118339"; gr_user_id=1e70a6ee-48be-43c6-8c18-69ec7c3163b9; __yadk_uid=iukblxXRnN\
FkgMu6eBxUqjAPyYCQCJ1g; __utmv=30149280.18126; douban-profile-remind=1; douban-fav-remind=1; viewed="114828\
2_2995812"; __utmz=30149280.1547989086.21.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ct=y; __utmc=3014\
9280; push_noty_num=0; push_doumail_num=0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1548230896%2C%22https%3\
A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%22%5D; _pk_id.100001.8cb4=9cbec346701a3c1b.1531653651.16.1548\
230896.1548226698.; _pk_ses.100001.8cb4=*; ap_v=0,6.0; __utma=30149280.935052215.1531653270.1548226701.1548\
230897.32; __utmt=1; __utmb=30149280.1.10.1548230897; dbcl2="181261570:kuPE/q2bNhc
"""
        # Split the raw cookie string into a dict. Split on "=" only once,
        # because cookie values themselves often contain "="
        cookies = {
            i.split("=", 1)[0]: i.split("=", 1)[1]
            for i in cookies.strip().split("; ")
        }
        # A plain GET carrying the cookies; no form data is posted,
        # so Request is enough (the original used FormRequest)
        yield Request(
            'https://www.douban.com/doumail/',
            cookies=cookies,
            callback=self.parse
        )

    def parse(self, response):
        """
        @url https://accounts.douban.com/passport/login
        @returns items 1 16
        @returns requests 0 0
        @scrapes title publish_info score comment_number_of_people
        :param response:
        :return: items
        """
        # If the login succeeded, scrape the mail sender labels
        a = response.xpath('//span[@class="from"]/text()')
        if a:
            print('login succeeded')
        else:
            print('login failed')
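The hand-rolled dict comprehension above works for a well-formed cookie string, but the standard library can also do the parsing. A minimal sketch (not part of the original code; the helper name is mine), assuming the raw string is a single browser-copied Cookie header line:

from http.cookies import SimpleCookie

def cookies_from_header(raw_cookie):
    """Parse a browser-copied Cookie header into a dict for scrapy.Request."""
    jar = SimpleCookie()
    jar.load(raw_cookie.strip())
    # Morsel.value holds the cookie value with surrounding quotes removed
    return {name: morsel.value for name, morsel in jar.items()}

# Usage inside start_requests:
#     yield Request('https://www.douban.com/doumail/',
#                   cookies=cookies_from_header(cookies),
#                   callback=self.parse)

Note that SimpleCookie silently drops malformed segments (for example a value with an unclosed quote), so verify the resulting dict before relying on it.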
Logging in by submitting a username and password
Reference: https://blog.csdn.net/qq_32942549/article/details/79585013
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest


class PachSpider(scrapy.Spider):  # a spider must subclass scrapy.Spider
    name = 'login'  # spider name
    allowed_domains = ['edu.iqianyue.com']  # domains allowed to be crawled
    # start_urls = ['http://edu.iqianyue.com/index_user_login.html']
    # start_urls only suits pages that need no login, since it cannot set cookies etc.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'}  # browser user agent

    def start_requests(self):  # override start_requests() instead of using start_urls
        """First request the login page with the cookiejar enabled so we receive
        its cookies, and set the callback."""
        return [Request('http://edu.iqianyue.com/index_user_login.html',
                        meta={'cookiejar': 1}, callback=self.parse)]

    def parse(self, response):  # parse callback
        data = {  # login fields, matching what a packet capture shows the form posts
            'number': '',
            'passwd': '',
            'submit': ''
        }
        # Response cookies: what the server set when we first hit the login page
        Cookie1 = response.headers.getlist('Set-Cookie')
        print(Cookie1)
        print('logging in')
        """Second request: POST the form carrying the cookiejar, the browser
        user agent and the login fields, so the server authorizes the session."""
        return [FormRequest.from_response(response,
                                          url='http://edu.iqianyue.com/index_user_login',  # real POST address
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.header,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        a = response.text  # inspect the response after logging in
        # print(a)
        """After login, request a page that requires authentication (e.g. the
        user center), carrying the authorized cookiejar."""
        yield Request('http://edu.iqianyue.com/index_user_index.html',
                      meta={'cookiejar': True}, callback=self.next2)

    def next2(self, response):
        # Request cookies: what this request sent to the server
        Cookie2 = response.request.headers.getlist('Cookie')
        print(Cookie2)
        body = response.body  # page content as bytes
        unicode_body = response.text  # page content as str (body_as_unicode() is deprecated)
        a = response.xpath('/html/head/title/text()').extract()  # title of the user-center page
        print(a)
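The 'cookiejar' meta key identifies one cookie session, so by varying the key a single spider can keep several logins alive at once. A minimal sketch (the self.accounts list is a hypothetical attribute, not from the original):

def start_requests(self):
    # One cookie session per account, kept apart by the 'cookiejar' key
    for i, account in enumerate(self.accounts):
        yield Request('http://edu.iqianyue.com/index_user_login.html',
                      meta={'cookiejar': i, 'account': account},
                      callback=self.parse,
                      dont_filter=True)  # same URL requested once per account

Note that the cookiejar key is not inherited automatically: every follow-up request must copy meta={'cookiejar': response.meta['cookiejar']} forward, as the spider above does.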
Hidden form data
Login forms often contain hidden fields wrapped in something like <div style="display:none;">. These <input type="hidden"> elements carry their values in the value attribute. The user never fills them in, but omitting them from the posted form data can make the login check fail. Such hidden <input> fields serve various purposes, for example:
<input name="_next"> tells the server which page to redirect to after a successful login
<input name="_formkey"> protects against CSRF (cross-site request forgery) attacks
There are two ways to handle them:
1) Extract the hidden fields' values into a dict, then add the account and password keys and submit the form:
>>> sel = response.xpath('//div[@style]/input')
>>> sel
[<Selector xpath='//div[@style]/input' data='<input name="_next" type="hidden" value='>,
<Selector xpath='//div[@style]/input' data='<input name="_formkey" type="hidden" val'>,
<Selector xpath='//div[@style]/input' data='<input name="_formname" type="hidden" va'>]
>>> fd = dict(zip(sel.xpath('./@name').extract(),sel.xpath('./@value').extract()))
2) Since we already visited the login page, we have a response object containing the page's <form> element. FormRequest.from_response() uses it to build the request and automatically copies the hidden <input> values into the form data, so with this approach we only need to supply the account and password (the field names below are placeholders):
>>> from scrapy.http import FormRequest
>>> fd = {'email': 'xxxx', 'password': 'xxxx'}  # only the visible fields
>>> request = FormRequest.from_response(response, formdata=fd)
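Put together, approach (1) inside a spider callback might look like the following minimal sketch (the login URL, XPath and field names are assumptions, not from the original):

import scrapy
from scrapy.http import FormRequest

class HiddenFormSpider(scrapy.Spider):
    name = 'hiddenform'
    start_urls = ['http://example.com/login']  # hypothetical login page

    def parse(self, response):
        # Collect every hidden <input> inside the invisible <div>
        sel = response.xpath('//div[@style]/input')
        fd = dict(zip(sel.xpath('./@name').extract(),
                      sel.xpath('./@value').extract()))
        # Add the two fields the user would type (names are placeholders)
        fd['email'] = 'user@example.com'
        fd['password'] = 'secret'
        # POST to the form's action attribute
        action = response.xpath('//form/@action').extract_first()
        yield FormRequest(response.urljoin(action), formdata=fd,
                          callback=self.after_login)

    def after_login(self, response):
        self.logger.info('landed on %s after login', response.url)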
Logging in with simple captcha recognition: OCR via pytesseract
# -*- coding: utf-8 -*-
import scrapy
import json
import pytesseract
from scrapy import Request, FormRequest
from PIL import Image
from io import BytesIO


class CaptchaloginSpider(scrapy.Spider):
    name = 'captchalogin'
    allowed_domains = ['xxx.com']
    start_urls = ['http://xxx.com/']
    # Login page url of site x (fictional)
    login_url = 'http://xxx.com/login'
    user = 'zjk'
    password = '123456'

    def parse(self, response):
        # Callback for the default start_requests() issued after a successful
        # login; put the real crawling logic here
        pass

    def start_requests(self):
        yield Request(self.login_url, callback=self.login, dont_filter=True)

    def login(self, response):
        """
        This method both extracts the captcha url from the login page (saving
        that page's response) and handles the downloaded captcha image.
        If response.meta['login_response'] (the saved login-page response)
        exists, the current response is the captcha image;
        otherwise the current response is the login page itself.
        :param response:
        :return:
        """
        login_response = response.meta.get('login_response')
        if not login_response:
            # Step 1:
            # The current response is the login page: extract the captcha
            # image url from it and download the image
            captchaUrl = response.css('CSS path to the captcha image').extract_first()
            captchaUrl = response.urljoin(captchaUrl)
            # Save the current response into the meta dict when building the Request
            yield Request(captchaUrl, callback=self.login,
                          meta={'login_response': response},
                          dont_filter=True)
        else:
            # Step 2:
            # The current response is the captcha image; response.body is its
            # binary data. login_response is the login page's response: use it
            # to build the form request and send it
            formdata = {
                'email': 'xxxx',
                'password': 'xxxx',
                # Recognize the captcha with OCR
                'code': self.get_captcha_by_OCR(response.body)
            }
            yield FormRequest.from_response(login_response,
                                            callback=self.parse_login,
                                            formdata=formdata,
                                            dont_filter=True)

    def parse_login(self, response):
        """
        Decide from the response whether the login succeeded
        :param response:
        :return:
        """
        info = json.loads(response.text)
        if info['error'] == '0':
            self.logger.info('Login succeeded :-)')
            return super().start_requests()
        self.logger.info('Login failed :-(, retrying...')
        return self.start_requests()

    def get_captcha_by_OCR(self, data):
        """
        Captcha recognition
        :param data: binary image data
        :return: recognized text
        """
        img = Image.open(BytesIO(data))
        img = img.convert('L')  # convert to greyscale
        captcha = pytesseract.image_to_string(img)
        img.close()
        return captcha
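pytesseract often misreads noisy captcha images, and binarizing the greyscale image first usually helps. A hedged variant of get_captcha_by_OCR (the threshold value 150 is an assumption to tune per site):

def get_captcha_by_OCR(self, data):
    img = Image.open(BytesIO(data))
    img = img.convert('L')  # greyscale
    # Binarize: pixels above the threshold become white, the rest black.
    # The threshold (150) is a guess; tune it against the actual captchas.
    img = img.point(lambda p: 255 if p > 150 else 0)
    captcha = pytesseract.image_to_string(img)
    img.close()
    return captcha.strip()  # drop stray whitespace/newlines from the OCR output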