为啥搞这个
zzu的系统加强了:在短时间内如果重复登录,登录时就会要求输入验证码。尤其是在jksb(健康打卡)和抢课的时候需要不停刷新浏览器,每次都要手动输入验证码很浪费时间(本工具可以作为其中的一个模块)
整体思路
(1) 整体思路就是在通过webdriver的时候将其中的验证码进行截图
(2) 之后将图片扔给百度AI进行图像识别
(但是因为验证码里有噪点,整张图片的识别率很低,常常需要重复进行验证)
(3) 所以就想到了手动来破解这个sb验证码
- 爬取100张验证码
- 将验证码每个字分割出来
- 取其中除了白色(背景)之外出现最多的颜色作为字符颜色,把它涂成黑色,其余像素全部置为白色(因为噪点较少,所以出现最多的非背景颜色大致就是真实字符)
- 把这些字全丢给百度AI进行分类(单个字符验证能够得到结果的正确率还是高的,得不到结果的直接给丢了)
- 人工瞟一眼,把ai分错的给丢了
- 将相同字符的图片进行重叠,只保留在超过一定比例(代码中为0.49)的样本里都是黑色的像素,作为最后用于验证的模板图片
- 再爬取验证码识别的时候就能够直接将其中的字符和已经找到的字符进行比对
分部代码
图片处理
爬取所有的图片以及对图像进行处理保存
其实定裁剪的边界的位置是最麻烦的…
# encoding:utf-8
from urllib.request import urlretrieve
import numpy as np
from PIL import Image
def get_pic(name):
    """Download one captcha image and store it as ``allpic/<name>.png``.

    ``name`` is converted with ``str`` and used as the file stem. Nothing is
    returned; the only effect is the file written by ``urlretrieve``.
    """
    source = 'https://jksb.v.zzu.edu.cn/vls6sss/zzjlogin3d.dll/zzjgetimg?ids=12'
    target = "allpic/{}.png".format(str(name))
    urlretrieve(source, target)
def pic_cut(name):
    """Slice ``allpic/<name>.png`` into its four character tiles.

    Every tile is cropped with a fixed (left, upper, right, lower) box and
    written to ``cutpic/<name>_<k>.png`` for k = 1..4, left to right.
    """
    stem = str(name)
    captcha = Image.open("allpic/{}.png".format(stem))
    # One entry per glyph: crop box and the file-name template it is saved under.
    tiles = (
        ((2, 4, 16, 22), "cutpic/{}_1.png"),
        ((16, 4, 30, 22), "cutpic/{}_2.png"),
        ((30, 4, 44, 22), "cutpic/{}_3.png"),
        ((44, 4, 58, 22), "cutpic/{}_4.png"),
    )
    for box, template in tiles:
        captcha.crop(box).save(template.format(stem))
def pic_proc(i1, j1):
    """Binarise one character tile, keeping only its dominant ink colour.

    Reads ``cutpic/<i1>_<j1>.png``, ranks the colours by pixel count and
    treats the second most frequent one as the glyph colour (the most
    frequent one is taken to be the background). Pixels of that colour become
    black, everything else white, and the result is written to
    ``finalpic/<i1>_<j1>.png`` as an RGB image with one extra white column
    appended on the right.

    Args:
        i1: Index of the captcha the tile was cut from.
        j1: Position (1-4) of the tile inside that captcha.

    A tile that contains a single colour has no ink; it is saved as an
    all-white image instead of raising ``IndexError``. Both single-channel
    (palette/greyscale) and multi-channel (RGB/RGBA) tiles are accepted.
    """
    pathadnname = "cutpic/{}_{}.png".format(i1, j1)
    print(pathadnname)
    pixels = np.array(Image.open(pathadnname))
    height, width = pixels.shape[:2]

    # One row per pixel so single- and multi-channel images share one code path.
    flat = pixels.reshape(height * width, -1)
    colours, counts = np.unique(flat, axis=0, return_counts=True)
    # Descending by (count, colour): the same tie-break as the original sort.
    ranked = sorted(
        zip(counts.tolist(), (tuple(c) for c in colours.tolist())),
        reverse=True,
    )

    if len(ranked) > 1:
        ink = ranked[1][1]
        print(ink if len(ink) > 1 else ink[0])
        mask = (flat == np.array(ink)).all(axis=1).reshape(height, width)
    else:
        # Only the background colour is present: nothing to keep.
        mask = np.zeros((height, width), dtype=bool)

    # Quick visual check of the extracted glyph on the console.
    for row in mask:
        print("".join("1" if hit else "0" for hit in row))

    # White canvas, one column wider than the tile; ink pixels painted black.
    canvas = np.full((height, width + 1, 3), 255, dtype=np.uint8)
    canvas[:, :width][mask] = 0
    Image.fromarray(canvas, "RGB").save("finalpic/{}_{}.png".format(i1, j1))
if __name__ == "__main__":
    # Build the raw data set: fetch 100 captchas, cut each into four tiles
    # and binarise every tile.
    for sample in range(100):
        get_pic(sample)
        pic_cut(sample)
        for slot in (1, 2, 3, 4):
            pic_proc(sample, slot)
扔给百度api去识别
丢给百度api
# encoding:utf-8
import requests
from PIL import Image
import time
import base64
# Sort every binarised tile into ``sort/<char>/`` according to what the Baidu
# OCR service recognises in it; unrecognised tiles go to ``sort/unknow/``.
#
# Alternative endpoint: https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic
request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"

# client_id is the API key (AK) and client_secret the secret key (SK) issued
# on the Baidu console.
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials' \
       '&client_id=************&client_secret=*****************'

# Fetch the token once and build the OCR URL once. The original appended
# "?access_token=..." to request_url on every iteration, so the URL grew and
# was corrupted from the second request onwards.
# NOTE(review): assumes one token stays valid for the whole run - confirm
# against the Baidu token documentation.
token_response = requests.get(host)
token_response.raise_for_status()
access_token = token_response.json()["access_token"]
ocr_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}

for i in range(100):
    for j in range(1, 5):
        pathname = 'finalpic/{}_{}.png'.format(i, j)
        print(pathname)
        # Read the image in binary mode and close the handle straight away.
        with open(pathname, 'rb') as f:
            img = base64.b64encode(f.read())
        params = {"image": img, "language_type": "ENG"}
        response = requests.post(ocr_url, data=params, headers=headers)
        time.sleep(0.3)  # pause between consecutive API calls

        ch = None
        try:
            result = response.json()
            print(result)
            ch = str(result["words_result"][0]["words"])
            print(ch.lower())
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # No usable recognition result for this tile.
            print(e)

        image = Image.open(pathname)
        label = ch.lower()[:1] if ch else ""
        saved = False
        if label:
            svpath = "sort/{}/{}_{}.png".format(label, i, j)
            print(svpath)
            try:
                image.save(svpath)
            except (OSError, ValueError):
                # Missing class folder or a character that is not a valid
                # path component: fall back to the "unknow" bucket.
                saved = False
            else:
                saved = True
                print("had")
        if not saved:
            image.save("sort/{}/{}_{}.png".format('unknow', i, j))
            print("not")
图片重叠
将同一字符的所有图片重叠,保留重叠率超过0.49的像素作为该字符的模板
# encoding:utf-8
import os
from PIL import Image
import numpy as np
# Root folder whose entries are walked by the loop further down; each entry is
# treated as a folder holding the sample images of one character.
# NOTE(review): "C:\\Users" looks like a shortened/placeholder path - confirm
# the real location of the sorted samples before running.
filepath = "C:\\Users"
# Names of everything directly under ``filepath``.
filelist = os.listdir(filepath)
def pic_save(img, path, threshold=0.49,
             out_dir="C:\\Users\\Charles\\Desktop\\python\\ocr\\rst\\"):
    """Threshold a 2-D score map and save it as a black/white RGB template.

    Args:
        img: 2-D array-like of numbers. Cells strictly greater than
            ``threshold`` are drawn black, all others white.
        path: File stem of the output image (the caller passes the character
            the template stands for).
        threshold: Cut-off applied to ``img``; defaults to the original 0.49.
        out_dir: Folder prefix (including the trailing separator) the PNG is
            written to; defaults to the original hard-coded location.

    The saved image is one column wider than ``img``: a white padding column
    is appended on the right, as in the original implementation. The output
    path is now written with properly escaped backslashes; the resulting
    string is unchanged, but the invalid escape sequences are gone.
    """
    mask = np.asarray(img) > threshold
    rows, cols = mask.shape
    # Start from an all-white canvas and paint the selected cells black.
    pixels = np.full((rows, cols + 1, 3), 255, dtype=np.uint8)
    pixels[:, :cols][mask] = 0
    f_img = Image.fromarray(pixels, "RGB")
    f_img.save(out_dir + "{}.png".format(path))
    print("saved")
# Build one template per character: overlay all samples of that character and
# hand the per-pixel ink frequency to pic_save for thresholding.
for ch in filelist:
    dirpath = os.path.join(filepath, ch)
    print(ch)
    if ch == "unknow":
        # Tiles the OCR step could not classify carry no label.
        continue
    if not os.path.isdir(dirpath):
        continue
    # Separate name: the original rebound ``filelist`` while iterating it.
    samples = os.listdir(dirpath)
    if not samples:
        # Nothing to average; avoids the division by zero of the original.
        print("no samples")
        continue

    # 18 rows x 15 columns: the size of the binarised tiles (a 14x18 crop plus
    # one padding column). ``np.float`` no longer exists in current NumPy, so
    # the builtin ``float`` is used.
    pic = np.zeros((18, 15), dtype=float)
    for j in samples:
        picpath = os.path.join(dirpath, j)
        image_arr = np.array(Image.open(picpath))
        # A pixel counts as ink when it is not pure white.
        ink = image_arr != 255
        if ink.ndim == 3:
            ink = ink.any(axis=-1)
        rows, cols = ink.shape
        pic[:rows, :cols] += ink
    pic = pic / len(samples)

    # Console preview of the cells that pass the 0.49 cut-off.
    for row in pic:
        print("".join("1" if n > 0.49 else "0" for n in row))
    pic_save(pic, ch)
测试验证
最后测试一下能够成功验证(当然只是直接通过图片进行验证)
如果要用到自动刷新上的话,截图得到的图片是RGBA的,需要再进行改动
而且需要使用直接截图进行保存的方式
# encoding:utf-8
import os
from urllib.request import urlretrieve
import numpy as np
from PIL import Image
from selenium import webdriver
# Working folder (with trailing separator) that the functions below prepend to
# every file name they read or write.
PATH = "C:\\Users\\Charles\\Desktop\\python\\ocr\\final\\temp\\"
def get_pic(name):
    """Fetch one captcha straight from the image endpoint (test helper).

    The image is stored as ``<name>.png`` under ``PATH``; nothing is returned.
    """
    destination = PATH + ("{}.png".format(str(name)))
    urlretrieve(
        'https://jksb.v.zzu.edu.cn/vls6sss/zzjlogin3d.dll/zzjgetimg?ids=7777',
        destination,
    )
def authentic_get_pic():
    """Capture the captcha from the live check-in page via a screenshot.

    In a real crawl the captcha can only be obtained from a screenshot of the
    page, so this opens the page in Chrome, saves a full screenshot to
    ``/project/ocr/input.png`` and crops the element with id ``myimg6`` into
    ``/project/ocr/output.png``. Meant as a reference example for the health
    check-in page; the direct download in ``get_pic`` is for testing only.

    NOTE(review): the browser is left open and nothing is returned; the
    trailing placeholder suggests the author elided the follow-up steps.
    """
    # Function-scope import: only this helper needs the locator constants.
    from selenium.webdriver.common.by import By

    options = webdriver.ChromeOptions()
    # Optional flags, disabled by the author:
    # options.add_argument('--no-sandbox')
    # options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    url = "https://jksb.v.zzu.edu.cn/vls6sss/zzujksb.dll/first0"
    driver.maximize_window()
    driver.get(url)
    print(driver.page_source)
    driver.save_screenshot('/project/ocr/input.png')

    # find_element_by_id was removed in Selenium 4.3; find_element(By.ID, ...)
    # is available in both old and current releases.
    imgelement = driver.find_element(By.ID, "myimg6")
    location = imgelement.location
    size = imgelement.size
    # Crop box (left, upper, right, lower) of the captcha inside the screenshot.
    rangle = (int(location['x']), int(location['y']),
              int(location['x'] + size['width']),
              int(location['y'] + size['height']))
    with Image.open("/project/ocr/input.png") as screenshot:
        screenshot.crop(rangle).save('/project/ocr/output.png')
    # ... (remaining steps not shown in the original)
def pic_cut(name):
    """Cut ``<name>.png`` under ``PATH`` into four character tiles.

    The tiles are written next to the source image as ``<name>_1.png`` ..
    ``<name>_4.png``, using fixed (left, upper, right, lower) crop boxes.
    """
    stem = str(name)
    captcha = Image.open(PATH + ("{}.png".format(stem)))
    # Crop box and output-name template for each glyph, left to right.
    tiles = (
        ((2, 4, 16, 22), "{}_1.png"),
        ((16, 4, 30, 22), "{}_2.png"),
        ((30, 4, 44, 22), "{}_3.png"),
        ((44, 4, 58, 22), "{}_4.png"),
    )
    for box, template in tiles:
        piece = captcha.crop(box)
        piece.save(PATH + (template.format(stem)))
def pic_proc(i1, j1):
    """Binarise the tile ``<i1>_<j1>.png`` under ``PATH`` in place.

    The colours of the tile are ranked by pixel count; the second most
    frequent one is taken as the glyph colour (the most frequent one being
    the background). Pixels of that colour become black, all others white,
    and the file is overwritten with an RGB image that has one extra white
    column appended on the right.

    Args:
        i1: Name/index of the captcha the tile belongs to.
        j1: Position (1-4) of the tile inside that captcha.

    A single-colour tile is written as all white instead of raising
    ``IndexError``. Single-channel and multi-channel tiles are both accepted.
    """
    pathadnname = PATH + ("{}_{}.png".format(i1, j1))
    print(pathadnname)
    pixels = np.array(Image.open(pathadnname))
    height, width = pixels.shape[:2]

    # One row per pixel so palette/greyscale and RGB(A) tiles are handled alike.
    flat = pixels.reshape(height * width, -1)
    colours, counts = np.unique(flat, axis=0, return_counts=True)
    # Descending by (count, colour), matching the original tie-break.
    ranked = sorted(
        zip(counts.tolist(), (tuple(c) for c in colours.tolist())),
        reverse=True,
    )

    if len(ranked) > 1:
        ink = ranked[1][1]
        print(ink if len(ink) > 1 else ink[0])
        mask = (flat == np.array(ink)).all(axis=1).reshape(height, width)
    else:
        # Background only: there is no glyph to extract.
        mask = np.zeros((height, width), dtype=bool)

    # White canvas, one column wider than the tile; glyph pixels painted black.
    canvas = np.full((height, width + 1, 3), 255, dtype=np.uint8)
    canvas[:, :width][mask] = 0
    f_img = Image.fromarray(canvas, "RGB")
    f_img.save(PATH + ("{}_{}.png".format(i1, j1)))
if __name__ == "__main__":
    # End-to-end check: download one captcha, cut and binarise it, then match
    # each of its four glyphs against the stored templates.
    get_pic(1)
    pic_cut(1)
    for i in range(1, 5):
        pic_proc(1, i)

    filepath = "C:\\Users\\Charles\\Desktop\\python\\ocr\\rst\\"
    # Decode every template once instead of once per glyph.
    templates = {
        ch: np.array(Image.open(filepath + ch)) for ch in os.listdir(filepath)
    }
    if not templates:
        raise SystemExit("no template images found in " + filepath)

    for i in range(1, 5):
        # Same file name pic_proc wrote (PATH already ends with a separator).
        pathadnname = PATH + ("{}_{}.png".format("1", i))
        rawpic_arr = np.array(Image.open(pathadnname))
        dic = {}
        for ch, testpic_arr in templates.items():
            # Templates carry an extra padding column; compare the shared area.
            rows = min(rawpic_arr.shape[0], testpic_arr.shape[0])
            cols = min(rawpic_arr.shape[1], testpic_arr.shape[1])
            same = rawpic_arr[:rows, :cols] == testpic_arr[:rows, :cols]
            if same.ndim == 3:
                same = same.all(axis=-1)
            # Score = number of pixels on which glyph and template agree. The
            # original ``.any() in ...`` test only counted black-on-black
            # pixels, which favours templates with a lot of ink.
            dic[ch] = int(np.count_nonzero(same))
        # Best match as (template file name, score); ties go to the larger name.
        print(max(dic.items(), key=lambda kv: (kv[1], kv[0])))