基于Python3使用urllib与requests登录豆瓣并保持登录状态

要点:

  • 验证码下载;
  • post提交结构内容;
  • 登录保持;

(刚才保存草稿显示成功,结果发表后又让我登录,登录后一看草稿根本不在,还得自己重新写一遍……)

 

豆瓣网站页面:

https://www.douban.com/accounts/login

首先,我们先登录一下,看看FormData有哪些内容;

可以看到,不仅需要提交验证码内容(captcha-solution),还需要提交验证码ID(captcha-id)

首先我们用urllib进行测试,直接上代码:

import urllib.request
import urllib.parse
import re
from lxml import etree
import http.cookiejar
# Python 3.6.3


# 使用 urllib进行登录保持
def testDouban():
    """Log in to Douban with urllib, keep the session, and print profile items.

    A single cookie-aware opener is used for every request (login page,
    login POST, profile page) so that cookies set by one response are sent
    with the next. If the login page contains a captcha, the image is
    downloaded with ``get_pic`` and the user is prompted to type it in;
    if no captcha is present, the captcha fields are simply omitted.

    After logging in, the user id is read from the returned page, the
    profile page is fetched, and the ``alt`` text of every image matching
    ``//li[@class="aob"]/a/img`` is printed (per the original author's
    note, these are the books the user has read).

    Raises:
        IndexError: if no ``userId`` is found in the page returned by the
            login POST (presumably because the login did not succeed).
    """
    url_login = 'https://accounts.douban.com/login'

    # Build one opener backed by a cookie jar and reuse it everywhere.
    cookjar = http.cookiejar.CookieJar()
    handle = urllib.request.HTTPCookieProcessor(cookjar)
    opener = urllib.request.build_opener(handle)
    # Default request headers live in `addheaders`; assigning this list to
    # `handlers` would not send the User-Agent at all.
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")]

    # Fetch the login page through the same opener so its cookies are kept.
    with opener.open(url_login) as login_page:
        response = login_page.read().decode("utf-8")

    # Form fields for the login POST.
    postDict = {
        "redir": "https://www.douban.com/",
        "form_email": "<你的邮箱>",
        "form_password": "<你的密码>",
    }

    # The captcha is only handled when the page actually shows one.
    captcha_img = re.search(r'<img id="captcha_image" src="(.+?)"', response)
    captcha_id = re.search(
        r'<input type="hidden" name="captcha-id" value="(.+?)"/>', response
    )
    if captcha_img is not None and captcha_id is not None:
        # Save the captcha image locally so the user can look at it.
        get_pic(captcha_img.group(1))
        postDict["captcha-solution"] = input('请输入验证码:')
        postDict["captcha-id"] = captcha_id.group(1)

    postData = urllib.parse.urlencode(postDict).encode("utf-8")
    with opener.open(url_login, postData) as myResponse:
        result = myResponse.read().decode("utf-8")
    print(len(result))

    # Extract the logged-in user's id from the returned page.
    this_id = re.findall(r"userId = '(\d+)',", result)[0]

    # Visit the user's own profile page with the logged-in opener.
    user_detail = f'https://www.douban.com/people/{this_id}/'
    with opener.open(user_detail) as res:
        xData = etree.HTML(res.read().decode('utf-8'))
    this_books_lst = xData.xpath('//li[@class="aob"]/a/img/@alt')
    for book in this_books_lst:
        print(book)

然后用requests进行测试(简洁多了)

 

import requests
import re
from lxml import etree
# Python 3.6.3


# requests 登录豆瓣,并保持登录状态,然后获取当前用户的部分资料
def get_identifyingCode():
    """Log in to Douban with a requests session and print profile items.

    Every request goes through the same ``requests.Session`` so cookies set
    by the login page are sent with the login POST and with the later
    profile request. If the login page contains a captcha, the image is
    downloaded with ``get_pic`` and the user is prompted to type it in;
    if no captcha is present, the captcha fields are omitted.

    After logging in, the user id is read from the returned page, the
    profile page is fetched, and the ``alt`` text of every image matching
    ``//li[@class="aob"]/a/img`` is printed (per the original author's
    note, these are the books the user has read).

    Raises:
        IndexError: if no ``userId`` is found in the page returned by the
            login POST (presumably because the login did not succeed).
    """
    url_login = 'https://accounts.douban.com/login'

    # One session for the whole flow; fetching the login page with the
    # module-level requests.get() would drop the cookies it sets.
    session = requests.Session()
    response = session.get(url_login).text

    # Form fields for the login POST.
    postdata = {
        "redir": "https://www.douban.com/",
        "form_email": "<你的邮箱>",
        "form_password": "<你的密码>",
    }

    # The captcha is only handled when the page actually shows one.
    captcha_img = re.search(r'<img id="captcha_image" src="(.+?)"', response)
    captcha_id = re.search(
        r'<input type="hidden" name="captcha-id" value="(.+?)"/>', response
    )
    if captcha_img is not None and captcha_id is not None:
        url_code = captcha_img.group(1)
        print(url_code)
        # Save the captcha image locally so the user can look at it.
        get_pic(url_code)
        postdata["captcha-solution"] = input('请输入验证码:')
        postdata["captcha-id"] = captcha_id.group(1)
    print(postdata)

    # Submit the form; the session keeps the login cookies afterwards.
    res = session.post(url_login, data=postdata)
    print(len(res.text))

    # Extract the logged-in user's id from the returned page.
    this_id = re.findall(r"userId = '(\d+)',", res.text)[0]

    # Visit the user's own profile page with the logged-in session.
    user_detail = f'https://www.douban.com/people/{this_id}/'
    res = session.get(user_detail)
    xData = etree.HTML(res.text)
    this_books_lst = xData.xpath('//li[@class="aob"]/a/img/@alt')
    for book in this_books_lst:
        print(book)

 

最后放一个公用函数(用于获取验证图片)

def get_pic(url, name=None):
    """Download the resource at *url* and save it to a local file.

    Args:
        url: Address of the image to download.
        name: Target file name. When omitted, the name is derived from the
            URL: the query string is dropped, the last path segment is
            taken, and ``.jpg`` is appended.

    Returns:
        True once the file has been written.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is not saved as if it were the image.
    """
    if name is None:
        # Drop the query string, keep the last path segment, add '.jpg'.
        name = str(url).split('?')[0].split('/')[-1] + '.jpg'
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    with open(name, 'wb') as f:
        f.write(res.content)
    return True

感谢观看,欢迎批评指正^^

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值