web自动化文件上传及验证码识别

     验证码是网页用来区分人类与机器的一种手段,这给爬虫带来了一定难度。本文演示如何用 Selenium 批量上传文件到网页,并通过识别验证码完成提交。

一、文件上传

   主要针对标签为 input 且 type="file" 的上传控件:直接用 send_keys 向该元素输入文件的绝对路径(含文件名)即可。

from selenium import webdriver

# Start a Firefox session and open the submission page.
driver = webdriver.Firefox()
driver.get("https://cpclab.uni-duesseldorf.de/cna/main.php")
# Element lookups issued after this call wait up to 10 seconds.
driver.implicitly_wait(10)

# Click the first element matching "#m1" before touching the upload field.
mode_button = driver.find_elements_by_css_selector("#m1")[0]
mode_button.click()

# Typing a path into the "#upload" element selects that file for upload.
# NOTE: `b` must hold the name of the file to upload; this excerpt does not
# define it.
upload_input = driver.find_elements_by_css_selector("#upload")[0]
upload_input.send_keys("/home/lxh/Documents/Lysin/PDB_Thermal_stable/PDB_A_chain/已跑/" + b)
​

二、验证码识别

# NOTE: this excerpt expects `driver`, `time`, `BytesIO`, `Image` and `ddddocr`
# to be in scope already; it does not import or create them itself.

# Image used as the positional anchor for locating the captcha.
ANCHOR_IMAGE_SELECTOR = 'body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > img:nth-child(1)'
# Text field that receives the recognised captcha text.
CAPTCHA_INPUT_SELECTOR = "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > fieldset:nth-child(1) > form:nth-child(2) > table:nth-child(2) > tbody:nth-child(1) > tr:nth-child(14) > td:nth-child(2) > input:nth-child(3)"
# The cropped captcha is written here and read back for OCR.
CAPTCHA_PNG = "/home/lxh/Downloads/zip-temp_2023-01-22_a5960a34995b4eab97faecbb7bb0be16/captcha1.png"

anchor = driver.find_elements_by_css_selector(ANCHOR_IMAGE_SELECTOR)[0]
anchor_pos = anchor.location
anchor_dims = anchor.size

# Crop box derived from the anchor image with hand-tuned pixel offsets.
crop_top = anchor_pos['y'] - 17
crop_bottom = anchor_pos['y'] - 85 + anchor_dims['height']
crop_left = anchor_pos['x'] + 155
# NOTE(review): the right edge is computed from 'y', not 'x'. This looks like
# a typo, but the offsets were tuned with it in place, so confirm before changing.
crop_right = anchor_pos['y'] + anchor_dims['width'] - 250

# Scroll to the bottom of the page before taking the screenshot.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
png_bytes = driver.get_screenshot_as_png()
page_image = Image.open(BytesIO(png_bytes))
captcha_image = page_image.crop((crop_left, crop_top, crop_right, crop_bottom))
captcha_image.save(CAPTCHA_PNG)
time.sleep(10)

# Run OCR on the saved captcha image.
recognizer = ddddocr.DdddOcr()
with open(CAPTCHA_PNG, 'rb') as png_file:
    captcha_bytes = png_file.read()
captcha_text = recognizer.classification(captcha_bytes)

# Type the recognised text into the form and submit it.
captcha_field = driver.find_elements_by_css_selector(CAPTCHA_INPUT_SELECTOR)[0]
captcha_field.send_keys(captcha_text)
submit_button = driver.find_elements_by_css_selector("#sub1")[0]
submit_button.click()
time.sleep(10)

三、完整的循环语句

# Batch workflow: for every file in UPLOAD_DIR, upload it through the web form,
# solve the captcha by OCR (retrying while the page reports "ERROR"), open the
# result pages and append the final page text to RESULT_PATH.
import os
import time
from io import BytesIO

import ddddocr
from PIL import Image
from selenium import webdriver

MAIN_URL = "https://cpclab.uni-duesseldorf.de/cna/main.php"
UPLOAD_DIR = "/home/lxh/Documents/Lysin/PDB_Thermal_stable/PDB_A_chain/已跑/"
CAPTCHA_PATH = "/home/lxh/Downloads/zip-temp_2023-01-22_a5960a34995b4eab97faecbb7bb0be16/captcha1.png"
RESULT_PATH = "/home/lxh/Documents/MDL_DATA/weak_spot.txt"
# Fixed pause after each action that triggers page work.
PAGE_WAIT_SECONDS = 10

# Image used as the positional anchor for locating the captcha.
ANCHOR_IMAGE_SELECTOR = 'body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > img:nth-child(1)'
# Text field that receives the recognised captcha text.
CAPTCHA_INPUT_SELECTOR = "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > fieldset:nth-child(1) > form:nth-child(2) > table:nth-child(2) > tbody:nth-child(1) > tr:nth-child(14) > td:nth-child(2) > input:nth-child(3)"
# Link clicked on the page shown after a successful submission.
RESULT_LINK_SELECTOR = "body > div:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > span:nth-child(5) > font:nth-child(5) > span:nth-child(8) > b:nth-child(1) > a:nth-child(2) > font:nth-child(1) > b:nth-child(1)"
# Link clicked in the newly opened window; the page it leads to is saved.
DATA_LINK_SELECTOR = "body > table:nth-child(14) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(3) > a:nth-child(1) > font:nth-child(1) > b:nth-child(1)"


def first_match(driver, selector):
    """Return the first element matching the CSS *selector*.

    Raises IndexError when nothing matches.
    """
    return driver.find_elements_by_css_selector(selector)[0]


def read_captcha(driver, ocr):
    """Screenshot the page, crop the captcha region and return the OCR text."""
    anchor = first_match(driver, ANCHOR_IMAGE_SELECTOR)
    location = anchor.location
    size = anchor.size
    # Crop box derived from the anchor image with hand-tuned pixel offsets.
    top = location['y'] - 17
    bottom = location['y'] - 85 + size['height']
    left = location['x'] + 155
    # NOTE(review): the right edge uses 'y' rather than 'x'. This looks like a
    # typo, but the offsets were tuned with it in place, so it is kept as-is.
    right = location['y'] + size['width'] - 250

    # Scroll to the bottom of the page before taking the screenshot.
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
    screenshot.crop((left, top, right, bottom)).save(CAPTCHA_PATH)
    time.sleep(PAGE_WAIT_SECONDS)

    with open(CAPTCHA_PATH, 'rb') as image_file:
        return ocr.classification(image_file.read())


def submit_file(driver, ocr, file_name):
    """Upload *file_name*, fill in the captcha, submit, and return the page text."""
    first_match(driver, "#m1").click()
    first_match(driver, "#upload").send_keys(UPLOAD_DIR + file_name)
    captcha_text = read_captcha(driver, ocr)
    first_match(driver, CAPTCHA_INPUT_SELECTOR).send_keys(captcha_text)
    first_match(driver, "#sub1").click()
    time.sleep(PAGE_WAIT_SECONDS)
    return driver.find_elements_by_tag_name('body')[0].text


def save_result(driver, file_name):
    """Open the result pages and append the final page text to RESULT_PATH."""
    first_match(driver, RESULT_LINK_SELECTOR).click()
    time.sleep(PAGE_WAIT_SECONDS)
    handles = driver.window_handles
    # Continue in the most recently opened window.
    driver.switch_to.window(handles[-1])
    first_match(driver, DATA_LINK_SELECTOR).click()
    time.sleep(PAGE_WAIT_SECONDS)
    page_text = driver.find_elements_by_tag_name('body')[0].text

    # One record per file: the file name, then the page text.
    with open(RESULT_PATH, "a", encoding="utf-8") as out:
        out.write(file_name + "\n")
        out.write(page_text)
        out.write("\n")

    # Close the extra window and go back to the first one.
    driver.close()
    driver.switch_to.window(handles[0])


# Only the files directly inside UPLOAD_DIR are processed. os.walk yields
# nothing for a missing directory, in which case the list is simply empty.
file_names = sorted(next(os.walk(UPLOAD_DIR), (UPLOAD_DIR, [], []))[2])

driver = webdriver.Firefox()
driver.implicitly_wait(10)
# Build the OCR engine once and reuse it for every captcha.
ocr = ddddocr.DdddOcr()
try:
    driver.get(MAIN_URL)
    for file_name in file_names:
        page_text = submit_file(driver, ocr, file_name)
        # "ERROR" in the page text is treated as a failed attempt. The whole
        # submission is repeated with a fresh captcha, without a retry limit.
        while "ERROR" in page_text:
            page_text = submit_file(driver, ocr, file_name)
        save_result(driver, file_name)
        time.sleep(3)
        # Return to the form for the next file.
        driver.get(MAIN_URL)
finally:
    # Always shut the browser down, even when a step fails.
    driver.quit()

     其中利用 while 循环,在验证码识别错误(页面出现 ERROR)时重新截图、识别并提交。验证码图片通过截屏获得:以位于网页顶端的图片的位置为基准,再通过反复尝试调整偏移量和大小,得到裁剪区域。最好不要直接使用验证码元素自身的坐标,因为该坐标往往超出电脑屏幕的可见范围,按它裁剪得到的是没有像素的空白图片;因此需要先滚动页面,使验证码位于屏幕可见位置后再截屏。

 web自动化文件上传及验证码识别

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值