#-*- coding:utf-8 -*-
import requests,re
from PIL import Image
class DoubanSpider(object):
    """Log in to Douban through a single shared ``requests`` session.

    The session is stored on the instance so that cookies set during
    :meth:`login` are sent with every later request made through it.
    """

    # Patterns for the captcha image URL and the hidden captcha id field
    # in the login page; compiled once instead of on every call.
    _CAPTCHA_IMG_RE = re.compile(r'img id="captcha_image" src="(.*?)" alt')
    _CAPTCHA_ID_RE = re.compile(r'name="captcha-id" value="(.*?)"/>')

    def __init__(self):
        """Create the session that carries cookies between requests."""
        self.session = requests.session()

    def login(self, userName, password):
        """Submit the login form, then fetch and print the user's movie page.

        The login page is fetched first to see whether it contains a
        captcha. If it does, the user is asked to solve it and the
        solution and captcha id are added to the form data.

        Args:
            userName: Account e-mail, sent as ``form_email``.
            password: Account password, sent as ``form_password``.

        Prints the session cookies after the POST and the raw content of
        ``https://movie.douban.com/mine``.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
            'Referer': 'https://www.douban.com/',
        }
        # Douban only asks for a captcha some of the time. When it does,
        # the form needs two extra fields, 'captcha-solution' and
        # 'captcha-id'; they are added below once they are known.
        data = {
            'source': 'index_nav',
            'redir': 'https://www.douban.com/',
            'form_email': userName,
            'form_password': password,
            'login': '登录',
        }
        url = 'https://accounts.douban.com/login'

        capSolution, capId = self._getCapcha(self.session.get(url).content)
        if capSolution and capId:
            # %-formatting prints "solution id" on both Python 2 and 3.
            print('%s %s' % (capSolution, capId))
            data['captcha-solution'] = capSolution
            data['captcha-id'] = capId

        # The response is not needed; the session keeps the cookies it sets.
        self.session.post(url=url, data=data, headers=headers)
        print(self.session.cookies.items())

        # Fetch the personal movie page with the session cookies.
        html = self.session.get('https://movie.douban.com/mine').content
        print(html)

    def _getCapcha(self, html):
        """Extract the captcha from the login page and ask the user to solve it.

        If the page contains a captcha image, the image is downloaded to
        ``captcha.jpg`` in the current directory, shown with PIL, and the
        user is prompted on stdin for the solution.

        Args:
            html: Login page source, as text or UTF-8 encoded bytes.

        Returns:
            A ``(capSolution, capId)`` tuple. ``capSolution`` is ``None``
            when the page has no captcha image; ``capId`` is ``None`` when
            the page has no ``captcha-id`` field.
        """
        # ``Response.content`` is bytes on Python 3 but the patterns are
        # text. On Python 2 ``bytes is str``, so this branch is skipped.
        if isinstance(html, bytes) and not isinstance(html, str):
            html = html.decode('utf-8', 'replace')

        capSolution = None
        capUrls = self._CAPTCHA_IMG_RE.findall(html)
        if capUrls:
            picName = 'captcha.jpg'
            with open(picName, 'wb') as f:
                f.write(self.session.get(url=capUrls[0]).content)

            # Showing the image is best-effort: if it fails the user can
            # still open captcha.jpg by hand, so report it and carry on.
            try:
                img = Image.open(picName)
                img.show()
                img.close()
            except Exception:
                print(u'获取验证码失败')

            # ``raw_input`` exists only on Python 2; Python 3 calls it ``input``.
            try:
                readLine = raw_input
            except NameError:
                readLine = input
            capSolution = readLine('输入验证码:')

        capIds = self._CAPTCHA_ID_RE.findall(html)
        capId = capIds[0] if capIds else None
        return capSolution, capId
if __name__ == '__main__':
    # Script entry point: build a spider and log in. Both credentials are
    # blank here and must be filled in before running.
    DoubanSpider().login(userName='', password='')
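# When the script runs, it prints the result of session.cookies.items();
# if that output matches the screenshot in the original article, the
# login succeeded.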