网络爬虫——古诗文网中验证码(超级鹰)
目标网站:古诗文网
目标网址:https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx
任务要求:
(1)通过selenium的方式模拟该网站的登录,并成功输入用户名和密码;
(2)保存验证码图片,并使用输入式验证码识别的方式识别验证码中的文字,获取结果后输入到验证码输入框中;
(3)验证登录是否成功。
源码:
超级鹰源码(保存为 chaojiying.py,供下面的识别源码通过 import chaojiying 导入):
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition endpoints.

    The instance keeps the account fields that every request carries
    (``user``, ``pass2``, ``softid``) plus the HTTP headers, and exposes one
    method per remote endpoint used here.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the credentials and per-request defaults.

        Args:
            username: Account name, sent as the ``user`` form field.
            password: Plain-text password; only its MD5 hex digest is kept
                and sent (as the ``pass2`` form field).
            soft_id: Software ID, sent as the ``softid`` form field.
            timeout: Seconds to wait on each HTTP request before
                ``requests`` raises a timeout error. Pass ``None`` to wait
                indefinitely (the behaviour before this parameter existed).
        """
        self.username = username
        # TODO: modification point 1 -- the password is explicitly encoded as
        # UTF-8 because md5() only accepts bytes.
        self.password = md5(password.encode("utf-8")).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def _post(self, url, params, files=None):
        """POST *params* merged with the account fields and return the decoded JSON."""
        data = dict(params)
        # Account fields are applied last, so they win on a key clash.
        data.update(self.base_params)
        # Without a timeout a stalled server would block the caller forever.
        r = requests.post(url, data=data, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def PostPic(self, im, codetype):
        """Upload a captcha image for recognition.

        Args:
            im: Image content, uploaded as the ``userfile`` form file.
            codetype: Captcha type code, sent as the ``codetype`` form field.

        Returns:
            The decoded JSON response of the upload endpoint.
        """
        return self._post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            {'codetype': codetype},
            files={'userfile': ('ccc.jpg', im)},
        )

    def ReportError(self, im_id):
        """Report a wrongly recognised picture.

        Args:
            im_id: ID of the picture whose recognition result is being
                reported as wrong.

        Returns:
            The decoded JSON response of the report endpoint.
        """
        return self._post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            {'id': im_id},
        )
识别源码:
import time

import pytesseract
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By

import chaojiying
# Log in to the site with Selenium: fill in the account fields, cut the captcha
# out of a page screenshot, have it recognised remotely, type the result into
# the captcha box and submit the form.

# Start Edge through the local msedgedriver binary.
# NOTE(review): newer Selenium releases no longer accept the driver path as a
# positional argument and expect a Service object -- confirm against the
# installed Selenium version.
browser = webdriver.Edge('E:\\anaconda\\Scripts\\msedgedriver.exe')
# Alternative driver: browser = webdriver.Chrome()
try:
    try:
        browser.get('https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx')
    except TimeoutException:
        # Keep going as before: the element lookups below report
        # 'No Element' if the form is not there.
        print('Time Out')

    # Replace the two placeholder strings with your own account and password.
    username = browser.find_element(By.XPATH, '//*[@id="email"]')
    username.send_keys('自己账号')
    time.sleep(1)
    password = browser.find_element(By.XPATH, '//*[@id="pwd"]')
    password.send_keys('自己密码')
    time.sleep(1)

    # Screenshot the page and crop it to the captcha element's bounding box.
    # NOTE(review): assumes screenshot pixels line up with the element
    # coordinates (no display scaling) -- confirm on the target machine.
    pictureN = browser.find_element(By.XPATH, '//*[@id="imgCode"]')
    browser.save_screenshot('login.png')
    loc = pictureN.location
    size = pictureN.size
    left = loc['x']
    top = loc['y']
    right = left + size['width']
    bottom = top + size['height']
    with Image.open('login.png') as page:
        page.crop((left, top, right, bottom)).save('code.png')

    # Replace the placeholders with your Chaojiying account, password and
    # software ID. The class is reached through the imported module, and the
    # client gets its own name so the module name is not rebound.
    client = chaojiying.Chaojiying_Client('超级鹰账号', '密码', 'ID')
    with open('code.png', 'rb') as image_file:
        im = image_file.read()
    # 2004 is the captcha type code passed to the recognition service.
    text = client.PostPic(im, 2004)['pic_str']
    print(text)
    # Alternative (local OCR):
    # text = pytesseract.image_to_string(Image.open('code.png'))

    CodeWhere = browser.find_element(By.XPATH, '//*[@id="code"]')
    CodeWhere.send_keys(text)
    time.sleep(5)
    Submit = browser.find_element(By.XPATH, '//*[@id="denglu"]')
    Submit.click()
    # Leave the result page visible for a moment before shutting down.
    time.sleep(5)
except NoSuchElementException:
    print('No Element')
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window.
    browser.quit()