Python爬虫模拟登录豆瓣

#-*- coding:utf-8 -*-

import requests,re
from PIL import Image

class DoubanSpider(object):
    """Log in to Douban through a single ``requests`` session.

    Every request goes through ``self.session``, so cookies set by the
    login response are sent again on the requests that follow it.
    The code runs on both Python 2 and Python 3.
    """

    def __init__(self):
        self.session = requests.session()

    @staticmethod
    def _toText(raw):
        """Return ``raw`` as native text.

        Only Python 3 ``bytes`` are decoded (on Python 2 ``bytes`` is
        ``str``, so the value is returned unchanged).  Assumes the pages
        are UTF-8; undecodable bytes are replaced rather than raising.
        """
        if isinstance(raw, bytes) and not isinstance(raw, str):
            return raw.decode('utf-8', 'replace')
        return raw

    def login(self, userName, password):
        """Submit the login form, then print the cookies and the user's page.

        Args:
            userName: value sent as the ``form_email`` form field.
            password: value sent as the ``form_password`` form field.

        Returns:
            None.  The session cookies and the HTML of
            ``https://movie.douban.com/mine`` are printed to stdout.
        """
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                   'Referer':'https://www.douban.com/'}
        # Install the headers on the session so the login-page fetch, the
        # captcha download and the final page fetch use the same
        # browser-like headers as the login POST.
        self.session.headers.update(headers)

        # The login page only sometimes asks for a captcha.  When it does,
        # 'captcha-solution' and 'captcha-id' are added to the form below.
        data = {'source':'index_nav',
                'redir':'https://www.douban.com/',
                'form_email':userName,
                'form_password':password,
                'login':'登录'}

        url = 'https://accounts.douban.com/login'
        capSolution, capId = self._getCapcha(self.session.get(url).content)
        if capSolution and capId:
            print('%s %s' % (capSolution, capId))
            data['captcha-solution'] = capSolution
            data['captcha-id'] = capId

        self.session.post(url=url, data=data, headers=headers)
        print(self.session.cookies.items())

        # Fetch the user's own page with the session that just logged in.
        html = self.session.get('https://movie.douban.com/mine').content
        print(self._toText(html))

    def _getCapcha(self, html):
        """Extract the captcha fields from the login page HTML.

        If the page contains a captcha image, the image is downloaded to
        ``captcha.jpg`` in the current directory, shown with PIL, and the
        user is prompted on stdin for the solution.

        Args:
            html: login page markup, as text or as raw response bytes.

        Returns:
            A ``(capSolution, capId)`` tuple.  Each element is ``None``
            when the corresponding markup is not present in ``html``.
        """
        html = self._toText(html)
        capSolution = None

        imageMatch = re.search(r'img id="captcha_image" src="(.*?)" alt', html)
        if imageMatch:
            picName = 'captcha.jpg'
            # The with-statement closes the file; no explicit close() needed.
            with open(picName, 'wb') as f:
                f.write(self.session.get(url=imageMatch.group(1)).content)
            try:
                img = Image.open(picName)
                img.show()
                img.close()
            except Exception:
                # Best effort: the user can still open captcha.jpg by hand.
                # Unlike a bare except, this lets Ctrl-C propagate.
                print(u'获取验证码失败')
            try:
                readLine = raw_input  # Python 2
            except NameError:
                readLine = input  # Python 3
            capSolution = readLine('输入验证码:')

        capId = None
        idMatch = re.search(r'name="captcha-id" value="(.*?)"/>', html)
        if idMatch:
            capId = idMatch.group(1)

        return capSolution, capId

if __name__ == '__main__':
    # Script entry point: fill in real account credentials before running.
    douban = DoubanSpider()
    douban.login(userName='', password='')




运行后,如果打印出的 self.session.cookies.items() 结果如上面的界面所示,就说明登录成功了。


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值