验证码是网页用来区分访问者是机器还是人的一种方法,这给爬虫带来了一定的难度。本章演示如何向网页批量上传文件,以及如何识别验证码以完成访问。
一、文件上传
主要针对网页中标签为input、type="file"的元素。直接用send_keys向该元素输入文件的绝对路径(包含文件名)即可完成上传。
from selenium import webdriver
from selenium.webdriver.common.by import By

# Open the submission page in Firefox.
driver = webdriver.Firefox()
driver.get("https://cpclab.uni-duesseldorf.de/cna/main.php")
# Let element lookups wait up to 10 seconds before failing.
driver.implicitly_wait(10)

# find_element(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector(...)[0],
# which no longer exists in Selenium 4.3+.
driver.find_element(By.CSS_SELECTOR, "#m1").click()

# A file is uploaded by sending its absolute path to the file-input element.
# `b` is not defined in this snippet: it must hold the name of a file inside
# the directory below.
driver.find_element(By.CSS_SELECTOR, "#upload").send_keys("/home/lxh/Documents/Lysin/PDB_Thermal_stable/PDB_A_chain/已跑/" + b)
二、验证码识别
import time
from io import BytesIO

import ddddocr
from PIL import Image
from selenium.webdriver.common.by import By

# The cropped captcha is written here and read back for OCR.
captcha_path = "/home/lxh/Downloads/zip-temp_2023-01-22_a5960a34995b4eab97faecbb7bb0be16/captcha1.png"

# Reference element: the crop box is computed from this image's position and
# size rather than from the captcha element itself.
image_element = driver.find_element(By.CSS_SELECTOR, 'body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > img:nth-child(1)')
location = image_element.location
size = image_element.size

# Hand-tuned pixel offsets relative to the reference image.
top = location['y'] - 17
bottom = location['y'] - 85 + size['height']
left = location['x'] + 155
# NOTE(review): the right edge is derived from location['y'], not
# location['x'] -- looks like a typo, but the offsets were tuned against this
# formula, so it is left unchanged. Confirm before altering.
right = location['y'] + size['width'] - 250

# Scroll to the bottom of the page before taking the screenshot.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
captcha = screenshot.crop((left, top, right, bottom))
captcha.save(captcha_path)
# NOTE(review): save() is synchronous, so this pause looks unnecessary; kept
# to preserve the original pacing.
time.sleep(10)

# Run OCR on the saved captcha image.
ocr = ddddocr.DdddOcr()
with open(captcha_path, 'rb') as f:
    img_bytes = f.read()
res = ocr.classification(img_bytes)

# Type the recognised text into the form and submit it.
driver.find_element(By.CSS_SELECTOR, "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > fieldset:nth-child(1) > form:nth-child(2) > table:nth-child(2) > tbody:nth-child(1) > tr:nth-child(14) > td:nth-child(2) > input:nth-child(3)").send_keys(res)
driver.find_element(By.CSS_SELECTOR, "#sub1").click()
# Give the server time to respond before reading the page.
time.sleep(10)
三、完整的循环语句
# Batch driver: for every file under INPUT_DIR, upload it, solve the captcha
# with OCR (retrying while the page reports ERROR), open the result pages and
# append the final page text to RESULT_PATH.
import os
import time
from io import BytesIO

import ddddocr
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By

MAIN_URL = "https://cpclab.uni-duesseldorf.de/cna/main.php"
INPUT_DIR = "/home/lxh/Documents/Lysin/PDB_Thermal_stable/PDB_A_chain/已跑/"
CAPTCHA_PATH = "/home/lxh/Downloads/zip-temp_2023-01-22_a5960a34995b4eab97faecbb7bb0be16/captcha1.png"
RESULT_PATH = "/home/lxh/Documents/MDL_DATA/weak_spot.txt"

# Image used as the reference point for locating the captcha in a screenshot.
REFERENCE_IMAGE_SELECTOR = 'body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > img:nth-child(1)'
# Text field that receives the recognised captcha.
CAPTCHA_INPUT_SELECTOR = "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > fieldset:nth-child(1) > form:nth-child(2) > table:nth-child(2) > tbody:nth-child(1) > tr:nth-child(14) > td:nth-child(2) > input:nth-child(3)"
# Link clicked on the main window once the submission no longer shows ERROR.
RESULT_LINK_SELECTOR = "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > font:nth-child(5) > span:nth-child(8) > b:nth-child(1) > a:nth-child(2) > font:nth-child(1) > b:nth-child(1)"
# Link clicked in the newest window; the page it leads to is what gets saved.
DETAIL_LINK_SELECTOR = "body > table:nth-child(14) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(3) > a:nth-child(1) > font:nth-child(1) > b:nth-child(1)"


def _captcha_box(location, size):
    """Return the (left, top, right, bottom) crop box for the captcha.

    `location` and `size` are the reference image's `.location` and `.size`
    dicts. The pixel offsets are hand-tuned values.
    """
    top = location['y'] - 17
    bottom = location['y'] - 85 + size['height']
    left = location['x'] + 155
    # NOTE(review): derived from location['y'], not location['x'] -- looks like
    # a typo, but the offsets were tuned against this formula. Confirm before
    # altering.
    right = location['y'] + size['width'] - 250
    return (left, top, right, bottom)


def _read_captcha(driver, ocr):
    """Screenshot the page, crop the captcha, save it and return the OCR text."""
    reference = driver.find_element(By.CSS_SELECTOR, REFERENCE_IMAGE_SELECTOR)
    box = _captcha_box(reference.location, reference.size)
    # Scroll to the bottom of the page before taking the screenshot.
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
    screenshot.crop(box).save(CAPTCHA_PATH)
    # NOTE(review): save() is synchronous, so this pause looks unnecessary;
    # kept to preserve the original pacing.
    time.sleep(10)
    with open(CAPTCHA_PATH, 'rb') as image_file:
        return ocr.classification(image_file.read())


def _submit_file(driver, ocr, file_path):
    """Upload `file_path`, answer the captcha, submit, and return the page text.

    The caller checks the returned text for "ERROR" to decide whether to retry.
    """
    driver.find_element(By.CSS_SELECTOR, "#m1").click()
    # A file is uploaded by sending its absolute path to the file input.
    driver.find_element(By.CSS_SELECTOR, "#upload").send_keys(file_path)
    captcha_text = _read_captcha(driver, ocr)
    driver.find_element(By.CSS_SELECTOR, CAPTCHA_INPUT_SELECTOR).send_keys(captcha_text)
    driver.find_element(By.CSS_SELECTOR, "#sub1").click()
    # Give the server time to respond before reading the page.
    time.sleep(10)
    return driver.find_element(By.TAG_NAME, 'body').text


def _append_result(file_name, page_text):
    """Append the file name and the saved page text to RESULT_PATH."""
    with open(RESULT_PATH, "a", encoding="utf-8") as out:
        out.write(file_name + "\n")
        out.write(page_text)
        out.write("\n")


driver = webdriver.Firefox()
driver.get(MAIN_URL)
# Let element lookups wait up to 10 seconds; set once at startup rather than
# after every page load.
driver.implicitly_wait(10)
# Build the OCR engine once and reuse it for every attempt.
ocr = ddddocr.DdddOcr()

for root, _dirs, file_names in os.walk(INPUT_DIR):
    for file_name in file_names:
        # Join with the directory os.walk is currently visiting so that files
        # in subdirectories get their real path.
        file_path = os.path.join(root, file_name)

        page_text = _submit_file(driver, ocr, file_path)
        # Resubmit with a fresh captcha reading while the page reports ERROR.
        # NOTE(review): this retry loop has no upper bound.
        while "ERROR" in page_text:
            page_text = _submit_file(driver, ocr, file_path)

        driver.find_element(By.CSS_SELECTOR, RESULT_LINK_SELECTOR).click()
        time.sleep(10)

        handles = driver.window_handles
        # Presumably the click opened a new window that is last in the list --
        # TODO confirm.
        driver.switch_to.window(handles[-1])
        driver.find_element(By.CSS_SELECTOR, DETAIL_LINK_SELECTOR).click()
        time.sleep(10)
        result_text = driver.find_element(By.TAG_NAME, 'body').text
        _append_result(file_name, result_text)

        # Close the current window and go back to the first one for the next file.
        driver.close()
        driver.switch_to.window(handles[0])
        time.sleep(3)
        driver.get(MAIN_URL)
其中利用while循环,在验证码识别错误(页面出现ERROR)时重新上传文件,并重新识别、输入验证码。验证码图片通过截屏的方式获得:裁剪区域以网页顶端图片的位置为基准,再通过多次尝试调整偏移量和大小得到。最好不要直接使用验证码元素自身的位置,因为验证码的坐标往往超出电脑屏幕的可见范围,按该坐标裁剪截屏会得到没有像素的空白图片;因此需要先将验证码滚动到屏幕可见的位置,再进行截屏。