python模拟登录豆瓣网爬取影评——requests_html&mongodb&处理验证码

最新推荐文章于 2022-09-20 14:08:57 发布

Lin9977

最新推荐文章于 2022-09-20 14:08:57 发布

阅读量675

点赞数

分类专栏： python 文章标签： requests_html 模拟登录豆瓣网验证码

本文链接：https://blog.csdn.net/lin9977/article/details/80171438

版权

python 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 30 14:47:09 2018

"""


from requests_html import HTMLSession
from pymongo import MongoClient
from PIL import Image
import time 


session = HTMLSession()


def createDB():
    #创建mongo数据库，并建立集合
    client = MongoClient('localhost',27017)
    db = client.douban
    col_hldwm = db.hldwm
    return col_hldwm


def login(email,password):
    #登录豆瓣网
    url = 'https://accounts.douban.com/login'
    headers = {
        #在network获取
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
    login_data = {
        #在network获取
        "source":'index_nav',  
        'redir': 'https://www.douban.com',
        "form_email":email,
        "form_password": password,    
        "login": "登录",
         }
    captcha_html = session.get(url).html
    #每次get(url)都会刷新验证码，所以只能get一次并同时获取验证码地址和验证码id !!!!!!!!!!!!!!!!
    captcha_img = captcha_html.find('img.captcha_image',first = True)
    if captcha_img:
        #如果获取了验证码图片则填写验证码并添加进login_data
        login_data = identify_captcha(login_data,captcha_html,captcha_img)
    html = session.post(url,headers = headers,data = login_data)
    html = session.get('https://movie.douban.com/subject/26683723/comments?start=0&limit=20&sort=new_score&status=P&percent_type=').html
    if html.find('a.bn-more',first = True) is None:
        print('登录失败')       
        return False
    else:
        print('登录成功，当前为：%s'%html.find('a.bn-more',first = True).text)
        return True
    
def identify_captcha(login_data,captcha_html,captcha_img):
    #识别验证码并将其添加进login_data
    captcha_id = captcha_html.find('div.captcha_block',first = True).find('input')[1].attrs['value']
    img_src = captcha_img.attrs['src']
    ir = session.get(img_src, stream=True)
    f = open('d:\\captcha.jpg','wb')
    f.write(ir.content)
    f.close()
    Image.open('d:\\captcha.jpg').show()
    captcha_solution = input('请输入验证码:')
    login_data['captcha-solution'] = captcha_solution
    login_data['captcha-id'] = captcha_id
    return login_data    


def get_current_comments(html,col_hldwm):
    #获取当前页的评论并存入mongodb
    comments = html.find('div.comment')
    for comment in comments:
        _id = comment.find('input',first = True).attrs['value']
        comment_info = comment.find('p',first = True).text
        if len(comment.find('span')[4].attrs['title'])== 2:
            class_ = comment.find('span')[4].attrs['title']
        else :
            class_ = ''
        votes = comment.find('span.votes',first = True).text
        col_hldwm.save({'_id':_id,'comment_info':comment_info,'class':class_,'votes':votes})


        
def get_all_comments(col_hldwm):
    #获取所有页面的评论
    first = 'https://movie.douban.com/subject/26683723/comments'
    end = '?start=0&limit=20&sort=new_score&status=P&percent_type='
    while 1:
        url = first + end
        html = session.get(url).html
        get_current_comments(html,col_hldwm)
        try :
            end = html.find('a.next',first = True).attrs['href']
        except AttributeError:
            break       
        time.sleep(0.5)
        
if __name__ == '__main__':
    email = input('帐号:')
    password = input('密码:')
    col_hldwm = createDB()
    if login(email,password):      
        print('正在收集数据，请等待...')
        get_all_comments(col_hldwm)
        print('共收集%s条数据'%str(col_hldwm.count()))
        print('完成！')
    else:
        pass

在验证码图片和验证码id一致性上卡了好几天，欲哭无泪！谨记