解决验证码问题的方法有三种:第一种是提取验证码图片的地址,将验证码下载到本地,手动输入后再通过 POST 提交登录;第二种是借助验证码识别库自动识别;第三种是使用云打码平台(需付费)。
1.手动输入
出现验证码之后,登录表单的内容会发生变化:需要额外提交验证码答案(captcha-solution)和验证码 id(captcha-id)两个字段。
import re
# pickle是对cookie文件进行dump,load操作
import pickle
import requests
from PIL import Image
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# 提交表单登陆并获取cookie
def get_cookie_from_net():
    """Log in to Douban through the login form and persist the session cookies.

    Fetches the login page, downloads the captcha image it references to
    ``douban.jpg``, opens the image in a viewer and asks the user to type the
    captcha text.  The credentials and the captcha answer are then POSTed to
    the login URL and the resulting session cookies are pickled to
    ``cookies.douban``.

    Relies on the module-level ``s`` (requests session) and ``headers``.

    Returns:
        The cookie jar of the session ``s`` after the login request.

    Raises:
        IndexError: if the login page contains no captcha image or no
            captcha id (this variant always expects a captcha).
    """
    url = 'https://accounts.douban.com/login'
    login_html = s.get(url, headers=headers).text

    # Locate the captcha image URL and the hidden captcha id in the form.
    img_urls = re.findall(r'<img id="captcha_image" src="(.*?)" alt="captcha"', login_html)
    id_values = re.findall(r'name="captcha-id" value="(.*?)"/>', login_html)
    if not img_urls or not id_values:
        # Same exception type as the former bare ``[0]`` lookup, but with a
        # message that explains what is actually missing.
        raise IndexError('no captcha image/id found on the login page')

    # Save the captcha image locally so that it can be displayed.
    verif_img_data = s.get(img_urls[0], headers=headers).content
    with open('douban.jpg', 'wb') as f:
        f.write(verif_img_data)

    # Show the captcha and let the user type it in.  Image.show() is the
    # public Pillow API; the with-block closes the image file afterwards.
    with Image.open('douban.jpg') as img:
        img.show()
    captcha_img = input("输入验证码: ")
    captcha_id = id_values[0]
    print('captcha_id:', captcha_id)

    # Build the form.  The field names can be checked in the browser:
    # right click -> Inspect -> Network -> log in -> look at "Form Data".
    payload = {
        'source': 'None',
        'redir': 'https://www.douban.com/people/182582312/',
        'form_email': '17706214981',
        'form_password': '你的密码',
        'captcha-solution': captcha_img,
        'captcha-id': captcha_id,
        'login': '登陆'
    }
    # Print a copy with the password masked so it never reaches the console.
    print(dict(payload, form_password='******'))

    # verify=True keeps TLS certificate verification switched on.
    data = s.post(url, headers=headers, data=payload, verify=True)

    # NOTE: the cookies are written before the login result is checked, so
    # the file is (re)created even when the login attempt failed.
    with open('cookies.douban', 'wb') as f:
        cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
        pickle.dump(cookiedict, f)
    print("提交表单登陆,成功获取cookie...")

    # Further check of the login result: look for the text 'york' in the
    # returned page (presumably the account's user name -- adjust as needed).
    if 'york' in data.text:
        print("york登陆成功!")
    return s.cookies
def get_cookie_from_file():
    """Load the cookies previously pickled to ``cookies.douban``.

    Returns:
        A cookie jar built from the cookie dict stored in the file.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only read a cookie file that this script wrote itself.
    with open('cookies.douban', 'rb') as cookie_file:
        stored_cookies = pickle.load(cookie_file)
    jar = requests.utils.cookiejar_from_dict(stored_cookies)
    print("解析文件,成功提取cookies...")
    return jar
def getdata(html):
    """Extract the text of the ``#display`` element from a fetched page.

    Args:
        html: response object whose ``.text`` attribute holds the page markup.

    Returns:
        The text content of the first element matching ``#display``.
    """
    document = BeautifulSoup(html.text, 'lxml')
    matches = document.select('#display')
    first_match = matches[0]
    # Any other post-login scraping and storage would go here; for now only
    # the user's own signature text is extracted.
    return first_match.get_text()
def login_and_getdata():
    """Obtain login cookies, fetch the profile page and print the scraped data.

    Cookies are first read from the local cookie file; when that fails for
    any reason, a form login is performed instead.  The profile page is then
    requested with the module-level session ``s`` and the value returned by
    ``getdata`` is printed.
    """
    print('获取cookies...')
    try:
        s.cookies = get_cookie_from_file()
    except Exception:
        # Best-effort fallback: a missing or unreadable cookie file simply
        # triggers a fresh login.  ``Exception`` (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        print("从文件获取cookie失败...\n正在尝试提交表单登陆以获取...")
        s.cookies = get_cookie_from_net()
    html = s.get('https://www.douban.com/people/182582312/', headers=headers)
    data = getdata(html)
    print(data)
if __name__ == '__main__':
    # Module-level state shared by the functions above: one HTTP session so
    # cookies persist across requests, and a random User-Agent header.
    # requests.Session() replaces the deprecated requests.session() factory.
    s = requests.Session()
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    login_and_getdata()
此处的核心操作是获取验证码图片的 URL 以及表单中新增的 captcha-id,这里用正则表达式进行抓取。获取验证码图片后先保存到本地,再自动弹出显示,然后手动输入验证码并提交表单。
2.利用pytesseract进行简单的验证码识别
import pytesseract
from PIL import Image

# Run OCR on a local captcha image, then display the image and print the
# recognised text.
image = Image.open('chars.png')
vcode = pytesseract.image_to_string(image)
# Show the opened image instance; the original passed the PIL ``Image``
# module itself to the private ``Image._show`` helper.
image.show()
print(vcode)
3.云打码平台
查看打码平台的开发文档,参照API。核心是post过程。
4.其他
由于验证码并不是每次登录都会出现,可以通过异常处理来同时兼容"有验证码"和"无验证码"两种情况:
import re
# pickle是对cookie文件进行dump,load操作
import pickle
import requests
from PIL import Image
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# 提交表单登陆并获取cookie
def get_cookie_from_net():
    """Log in to Douban through the login form, with or without a captcha.

    Fetches the login page.  When the page contains a captcha image, the
    image is downloaded to ``douban.jpg``, shown to the user, and the typed
    answer plus the captcha id are added to the form.  When there is no
    captcha, only the credentials are submitted.  After the POST, the
    session cookies are pickled to ``cookies.douban``.

    Relies on the module-level ``s`` (requests session) and ``headers``.

    Returns:
        The cookie jar of the session ``s`` after the login request.

    Raises:
        IndexError: if a captcha image is present but no captcha id can be
            found in the page.
    """
    url = 'https://accounts.douban.com/login'
    login_html = s.get(url, headers=headers).text

    # Fields that are always submitted.  The field names can be checked in
    # the browser: right click -> Inspect -> Network -> log in -> "Form Data".
    # The password is a placeholder: never commit a real one; read it from
    # the environment or prompt for it instead.
    payload = {
        'source': 'None',
        'redir': 'https://www.douban.com/people/182582312/',
        'form_email': '17706214981',
        'form_password': '你的密码',
    }

    # The captcha only shows up some of the time, so test for it explicitly
    # instead of relying on a bare except around the network/file I/O.
    img_urls = re.findall(r'<img id="captcha_image" src="(.*?)" alt="captcha"', login_html)
    if img_urls:
        captcha_id = re.findall(r'name="captcha-id" value="(.*?)"/>', login_html)[0]
        print('captcha_id:', captcha_id)

        # Save the captcha image locally so that it can be displayed.
        verif_img_data = s.get(img_urls[0], headers=headers).content
        with open('douban.jpg', 'wb') as f:
            f.write(verif_img_data)

        # Show the captcha and let the user type it in.  Image.show() is the
        # public Pillow API; the with-block closes the image file afterwards.
        with Image.open('douban.jpg') as img:
            img.show()
        payload['captcha-solution'] = input("输入验证码: ")
        payload['captcha-id'] = captcha_id

    # Added last so the submit field stays at the end of the form data.
    payload['login'] = '登陆'
    # Print a copy with the password masked so it never reaches the console.
    print(dict(payload, form_password='******'))

    # verify=True keeps TLS certificate verification switched on.
    data = s.post(url, headers=headers, data=payload, verify=True)

    # NOTE: the cookies are written before the login result is checked, so
    # the file is (re)created even when the login attempt failed.
    with open('cookies.douban', 'wb') as f:
        cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
        pickle.dump(cookiedict, f)
    print("提交表单登陆,成功获取cookie...")

    # Further check of the login result: look for the text 'york' in the
    # returned page (presumably the account's user name -- adjust as needed).
    if 'york' in data.text:
        print("york登陆成功!")
    return s.cookies
def get_cookie_from_file():
    """Read the pickled cookie dict from ``cookies.douban``.

    Returns:
        A cookie jar created from the dict stored in the file.
    """
    # NOTE(review): unpickling can execute arbitrary code; the cookie file
    # must come from this script and not from an untrusted source.
    with open('cookies.douban', 'rb') as cookie_file:
        saved = pickle.load(cookie_file)
    cookie_jar = requests.utils.cookiejar_from_dict(saved)
    print("解析文件,成功提取cookies...")
    return cookie_jar
def getdata(html):
    """Return the text of the first ``#display`` element of a fetched page.

    Args:
        html: response object; its ``.text`` attribute is parsed with lxml.

    Returns:
        The text content of the first element selected by ``#display``.
    """
    parsed = BeautifulSoup(html.text, 'lxml')
    display_nodes = parsed.select('#display')
    # Additional post-login data collection and storage can be added here;
    # at the moment only the user's own signature is read.
    signature = display_nodes[0].get_text()
    return signature
def login_and_getdata():
    """Obtain login cookies, fetch the profile page and print the scraped data.

    Cookies are first read from the local cookie file; when that fails for
    any reason, a form login is performed instead.  The profile page is then
    requested with the module-level session ``s`` and the value returned by
    ``getdata`` is printed.
    """
    print('获取cookies...')
    try:
        s.cookies = get_cookie_from_file()
    except Exception:
        # Best-effort fallback: a missing or unreadable cookie file simply
        # triggers a fresh login.  ``Exception`` (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        print("从文件获取cookie失败...\n正在尝试提交表单登陆以获取...")
        s.cookies = get_cookie_from_net()
    html = s.get('https://www.douban.com/people/182582312/', headers=headers)
    data = getdata(html)
    print(data)
if __name__ == '__main__':
    # Module-level state shared by the functions above: one HTTP session so
    # cookies persist across requests, and a random User-Agent header.
    # requests.Session() replaces the deprecated requests.session() factory.
    s = requests.Session()
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    login_and_getdata()