准备工作
Python 环境、Pillow 库、pytesseract 库(pytesseract 依赖本机已安装的 Tesseract-OCR 引擎)
环境搭建
Python 环境和 Pillow 库的搭建不再赘述;需要注意的主要是 pytesseract 库的搭建,具体细节请参考:《pytesseract库安装与使用》。
项目工程结构图如下:
最后附上Verify模块中的代码
#-*-coding:utf-8-*-
import os
from PIL import Image,ImageDraw
import pytesseract
class Recognizer(object):
    """Clean up a captcha image and read its text with Tesseract.

    The pipeline (see ``BeginRecogize``) is: binarise the image, copy its
    pixels into a coordinate-keyed dict, overwrite isolated pixels, redraw
    the result and pass it to ``pytesseract.image_to_string``.
    """

    # Relative (dx, dy) positions of the eight pixels surrounding a pixel.
    _NEIGHBOR_OFFSETS = (
        (-1, -1), (-1, 0), (-1, 1),
        (0, -1), (0, 1),
        (1, -1), (1, 0), (1, 1),
    )

    def __init__(self, tessdata_dir="D:\\Tesseract-OCR\\tessdata"):
        """Set up working directories and the Tesseract configuration.

        :param tessdata_dir: directory passed to Tesseract through the
            ``--tessdata-dir`` option. The default reproduces the path that
            used to be hard-coded, so ``Recognizer()`` behaves as before.
        """
        # Parent of the current working directory (NOT of this file), so the
        # result depends on where the script is launched from.
        self.ParentPath = os.path.abspath(os.path.join(os.getcwd(), ".."))
        self.DataPath = os.path.join(self.ParentPath, 'Data')  # "Data" directory
        self.ResultsPath = os.path.join(self.ParentPath, 'Results')  # "Results" directory
        # Thresholded image produced by ToBlack(); None until it has run.
        self.BlackWhiteImage = None
        self.tessdata_dir_config = '--tessdata-dir "%s"' % tessdata_dir
        # Maps (x, y) -> pixel value; filled by twoValue().
        self.t2val = {}

    @staticmethod
    def _is_highlight_color(r, g, b):
        """Return True if (r, g, b) lies in one of the two colour ranges
        that ToBlack() paints black before thresholding."""
        in_first_range = 190 <= r <= 255 and 170 <= g <= 255 and 0 <= b <= 140
        in_second_range = 0 <= r <= 90 and 210 <= g <= 255 and 0 <= b <= 90
        return in_first_range or in_second_range

    def ToBlack(self, path):
        """Load a colour image and reduce it to a two-level image.

        :param path: path of the image file to open.
        :return: the thresholded mode ``'1'`` image, which is also stored in
            ``self.BlackWhiteImage``.
        """
        # Force RGB so getpixel() always yields exactly three channels;
        # RGBA, palette or grayscale files would otherwise break the
        # three-way unpacking below.
        img = Image.open(path).convert('RGB')
        w, h = img.size
        # Paint pixels in the two highlighted colour ranges black.
        # NOTE(review): presumably these are glyph colours that would be too
        # bright to fall under the threshold; the original author noted
        # this step seemed to have little effect.
        for x in range(w):
            for y in range(h):
                r, g, b = img.getpixel((x, y))
                if self._is_highlight_color(r, g, b):
                    img.putpixel((x, y), (0, 0, 0))
        # Convert to grayscale, then threshold through a lookup table:
        # grey levels below 150 map to 0, levels 150..255 map to 1.
        img = img.convert('L').point([0] * 150 + [1] * (256 - 150), '1')
        self.BlackWhiteImage = img
        return img

    def twoValue(self, G=100):
        """Copy every pixel of ``self.BlackWhiteImage`` into ``self.t2val``.

        :param G: unused; kept so existing callers keep working.
        """
        img = self.BlackWhiteImage
        width, height = img.size
        # Build a fresh mapping so entries from a previous image never linger.
        self.t2val = dict(
            ((x, y), img.getpixel((x, y)))
            for y in range(height)
            for x in range(width)
        )

    def CleanNoise(self, N=3, Z=2):
        """Overwrite isolated pixels in ``self.t2val`` with 1 (white).

        An interior pixel is treated as noise when fewer than ``N`` of its
        eight neighbours hold the same value as it does.

        :param N: noise threshold, 0 < N < 8; 3 is the usual choice.
        :param Z: number of clean-up passes.
        :return: None; ``self.t2val`` is modified in place.
        """
        width, height = self.BlackWhiteImage.size
        pixels = self.t2val
        for _ in range(Z):
            # The top-left and bottom-right corners are forced to white.
            pixels[(0, 0)] = 1
            pixels[(width - 1, height - 1)] = 1
            # Border pixels are skipped because they lack a full set of
            # neighbours. Updates are applied in place, so pixels visited
            # later in a pass see the already-cleaned values.
            for x in range(1, width - 1):
                for y in range(1, height - 1):
                    current = pixels[(x, y)]
                    same_neighbours = sum(
                        1
                        for dx, dy in self._NEIGHBOR_OFFSETS
                        if pixels[(x + dx, y + dy)] == current
                    )
                    if same_neighbours < N:
                        pixels[(x, y)] = 1  # 1 = white, 0 = black

    def recognize_captcha(self):
        """Redraw ``self.t2val`` as an image and run Tesseract on it.

        :return: whatever ``pytesseract.image_to_string`` returns.
        """
        width, height = self.BlackWhiteImage.size
        image = Image.new("1", (width, height))
        draw = ImageDraw.Draw(image)
        for x in range(width):
            for y in range(height):
                draw.point((x, y), self.t2val[(x, y)])
        return pytesseract.image_to_string(image, config=self.tessdata_dir_config)

    def BeginRecogize(self, path):
        """Run the whole pipeline on the image at ``path``.

        :param path: path of the captcha image file.
        :return: the recognised text, whitespace-free and upper-cased.
        """
        self.ToBlack(path)
        self.twoValue()
        self.CleanNoise(N=3, Z=2)
        return self.StandardOutput(self.recognize_captcha())

    def StandardOutput(self, output):
        """Normalise OCR output: drop all whitespace and upper-case the rest.

        Besides spaces this also removes newline and form-feed characters.

        :param output: raw OCR result; it is converted with ``str()`` first.
        :return: the normalised string.
        """
        return "".join(str(output).split()).upper()
if __name__ == '__main__':
    # Demo run: recognise the sample captcha "7.png" from the Data directory
    # and print the cleaned-up text.
    r1 = Recognizer()
    p1 = os.path.join(r1.DataPath, '7.png')
    res = r1.BeginRecogize(p1)
    # The parenthesised form is valid on both Python 2 and Python 3.
    print(res)
方法效果:
原图
识别结果:
但是,对于其他一些验证码,该方法的识别效果并不理想,例如:
识别结果为: