python模拟登录豆瓣网爬取影评——requests_html&mongodb&处理验证码

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 30 14:47:09 2018

"""


from requests_html import HTMLSession
from pymongo import MongoClient
from PIL import Image
import time 


session = HTMLSession()


def createDB():
    #创建mongo数据库,并建立集合
    client = MongoClient('localhost',27017)
    db = client.douban
    col_hldwm = db.hldwm
    return col_hldwm


def login(email,password):
    #登录豆瓣网
    url = 'https://accounts.douban.com/login'
    headers = {
        #在network获取
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
    login_data = {
        #在network获取
        "source":'index_nav',  
        'redir': 'https://www.douban.com',
        "form_email":email,
        "form_password": password,    
        "login": "登录",
         }
    captcha_html = session.get(url).html
    #每次get(url)都会刷新验证码,所以只能get一次并同时获取验证码地址和验证码id !!!!!!!!!!!!!!!!
    captcha_img = captcha_html.find('img.captcha_image',first = True)
    if captcha_img:
        #如果获取了验证码图片则填写验证码并添加进login_data
        login_data = identify_captcha(login_data,captcha_html,captcha_img)
    html = session.post(url,headers = headers,data = login_data)
    html = session.get('https://movie.douban.com/subject/26683723/comments?start=0&limit=20&sort=new_score&status=P&percent_type=').html
    if html.find('a.bn-more',first = True) is None:
        print('登录失败')       
        return False
    else:
        print('登录成功,当前为:%s'%html.find('a.bn-more',first = True).text)
        return True
    
def identify_captcha(login_data,captcha_html,captcha_img):
    #识别验证码并将其添加进login_data
    captcha_id = captcha_html.find('div.captcha_block',first = True).find('input')[1].attrs['value']
    img_src = captcha_img.attrs['src']
    ir = session.get(img_src, stream=True)
    f = open('d:\\captcha.jpg','wb')
    f.write(ir.content)
    f.close()
    Image.open('d:\\captcha.jpg').show()
    captcha_solution = input('请输入验证码:')
    login_data['captcha-solution'] = captcha_solution
    login_data['captcha-id'] = captcha_id
    return login_data    


def get_current_comments(html,col_hldwm):
    #获取当前页的评论并存入mongodb
    comments = html.find('div.comment')
    for comment in comments:
        _id = comment.find('input',first = True).attrs['value']
        comment_info = comment.find('p',first = True).text
        if len(comment.find('span')[4].attrs['title'])== 2:
            class_ = comment.find('span')[4].attrs['title']
        else :
            class_ = ''
        votes = comment.find('span.votes',first = True).text
        col_hldwm.save({'_id':_id,'comment_info':comment_info,'class':class_,'votes':votes})


        
def get_all_comments(col_hldwm):
    #获取所有页面的评论
    first = 'https://movie.douban.com/subject/26683723/comments'
    end = '?start=0&limit=20&sort=new_score&status=P&percent_type='
    while 1:
        url = first + end
        html = session.get(url).html
        get_current_comments(html,col_hldwm)
        try :
            end = html.find('a.next',first = True).attrs['href']
        except AttributeError:
            break       
        time.sleep(0.5)
        
if __name__ == '__main__':
    email = input('帐号:')
    password = input('密码:')
    col_hldwm = createDB()
    if login(email,password):      
        print('正在收集数据,请等待...')
        get_all_comments(col_hldwm)
        print('共收集%s条数据'%str(col_hldwm.count()))
        print('完成!')
    else:
        pass
在验证码图片和验证码id一致性上卡了好几天,欲哭无泪!谨记
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值