"""Log in to Douban with Selenium and save the account page using the session cookies.

Flow:
1. Open the login page in Chrome and type the account name and password.
2. If the page shows a captcha image, download it, send it base64-encoded to
   the Aliyun captcha-recognition endpoint and type the recognised text.
3. Submit the form, copy the browser cookies into a ``Cookie`` header and
   fetch the account page with ``requests``.
4. Write the fetched page to ``douban.html``.
"""
import base64
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Seconds to wait for any single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Login page URL.
url = "https://accounts.douban.com/login?alias=doraemon_meow_meow%40163.com&redir=https%3A%2F%2Fwww.douban.com%2Fsettings%2F&source=index_nav&error=1016"

driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(1)
    # The asterisks are a placeholder for the Douban account name.
    driver.find_element(By.ID, 'email').send_keys('********')
    time.sleep(1)
    # The asterisks are a placeholder for the Douban password.
    driver.find_element(By.ID, 'password').send_keys('******')
    time.sleep(1)

    # Parse the rendered page to locate the captcha image, if there is one.
    html_str = driver.page_source
    html_ele = etree.HTML(html_str)
    captcha_srcs = html_ele.xpath('//img[@id="captcha_image"]/@src')

    # Only run the recognition step when a captcha image is actually present;
    # indexing an empty result would raise IndexError.
    if captcha_srcs:
        image_url = captcha_srcs[0]
        response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # The recognition API receives the image as base64 text.
        b64_str = base64.b64encode(response.content).decode('ascii')

        # Fields posted to the Aliyun captcha-recognition API.
        v_type = 'cn'
        form = {
            'v_pic': b64_str,
            'v_type': v_type,
        }
        # The asterisks are a placeholder for the Aliyun APPCODE
        # (purchased from Aliyun).
        ocr_headers = {
            "Authorization": "APPCODE **********",
        }
        dmpt_url = "http://yzmplus.market.alicloudapi.com/fzyzm"
        response = requests.post(
            dmpt_url,
            data=form,
            headers=ocr_headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        captcha_value = response.json()['v_code']

        driver.find_element(By.ID, 'captcha_field').send_keys(captcha_value)
        time.sleep(1)

    driver.find_element(By.CLASS_NAME, 'btn-submit').click()
    time.sleep(1)

    # Collect every cookie the browser holds after submitting the form.
    cookies = driver.get_cookies()
finally:
    # Always close the browser, even when a step above fails.
    driver.quit()

# Build the Cookie header value: "name=value" pairs separated by "; ".
cookie_list = [
    cookie_dict['name'] + '=' + cookie_dict['value']
    for cookie_dict in cookies
]
header_cookie = '; '.join(cookie_list)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    # Send the cookies captured from the browser session.
    'cookie': header_cookie,
}

# Fetch the account management page with the browser's cookies.
another_url = "https://www.douban.com/accounts/"
response = requests.get(another_url, headers=headers, timeout=REQUEST_TIMEOUT)

# Save the fetched page.
with open('douban.html', 'wb') as f:
    f.write(response.content)
破解豆瓣网验证码反爬机制
最新推荐文章于 2024-04-02 05:00:00 发布