某网站验证码的识别笔记【二】

新博客地址:http://gorthon.sinaapp.com/

经过一段时间的思考,觉得《某网站验证码的识别笔记【一】》里写得不太好:后面的旋转不太好处理(快哭了),聚类也很难,又不想碰。

后来决定换一种方法:基于形状来识别,也就是模仿人眼识别(区分)字符的方式。

代码写的一团糟……

还是写出来吧,好像我玷污了python的整洁与美观,就这样了,以后再改进吧。

#! /usr/bin/env python
# coding: utf-8

import Image, ImageEnhance, ImageFilter
import math, cPickle

# Parameters sampled from an arbitrary example image:
GRID_COLOR = (128, 191, 255) # colour of the background grid
BG_COLOR = (227, 218, 237) # background colour
ARC_COLOR = (128, 128, 255) # colour of the interference line (arc)
BLACK = (0,0,0)
WHITE = (255, 255, 255)

### Observed: this kind of CAPTCHA contains no uppercase letters and no digits.
### In the notes below 0, 7 and 8 are the cell labels produced in otsu():
### 0 = ink, 7 = concave region, 8 = loop (enclosed) region.
##OMEGA_1 = 'abd'.split(' ') # loop in the lower half
### a: fewer than 55 cells labelled 8
### d: more than 55 cells labelled 8; in the upper part a 0 follows the 7
### b: more than 55 cells labelled 8; in the upper part no 0 follows the 7
### Note: the loop position of 'a' depends on the font; 'g' may have an upper
### loop or two loops.
### Here 'a' is treated as lower-loop and 'g' as upper-loop.
##OMEGA_2 = 'o'.split(' ') # nothing but a loop
### o: no 7 (sometimes a few 7s)
##OMEGA_3 = 'egpq'.split(' ') # loop in the upper half
### e: 8s in the lower part on the right side; 7 on the right
### g: 8s in the lower part on the left side; no 7 on the right
### p: 0 in the bottom-left corner; no 8 in the lower part
### q: 0 in the bottom-right corner; no 8 in the lower part
##
##OMEGA_4 = 'cfhijklmnrstuvwxyz'.split(' ') # no loop

def printf(s='', debug=True):
    """Print ``s`` to standard output when ``debug`` is true.

    Args:
        s: Value to print; defaults to the empty string.
        debug: When false the call prints nothing.
    """
    if debug:
        # The parenthesised single-argument form gives the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(s)

def eraseGridAndArc(im):
    """Blank out background, grid and arc colours, then binarise the image.

    Pixels equal to BG_COLOR or GRID_COLOR are overwritten with WHITE.
    Pixels equal to ARC_COLOR are overwritten with the pixel directly above
    or, failing that, the pixel directly below, or WHITE when neither
    yields a truthy value.  The neighbour is copied as read, whatever its
    colour.  ``im`` is modified through ``putpixel``.

    Args:
        im: Image exposing ``size``, ``load()`` and ``putpixel()``; pixel
            values are compared against the RGB tuples defined at module
            level (presumably an RGB PIL image -- confirm with the caller).

    Returns:
        The contrast-enhanced image converted to mode '1'.
    """
    width, height = im.size
    px = im.load()
    for row in range(height):
        for col in range(width):
            colour = px[col, row]
            if colour in (BG_COLOR, GRID_COLOR):
                im.putpixel((col, row), WHITE)
            elif colour == ARC_COLOR:
                # Prefer the neighbour above, then the one below, then white.
                above = px[col, row - 1] if row > 0 else None
                below = px[col, row + 1] if row < height - 1 else None
                im.putpixel((col, row), above or below or WHITE)

    # A very large contrast factor is used before binarising; the author
    # notes that the exact value only needs to be "large enough".
    boosted = ImageEnhance.Contrast(im).enhance(255)
    return boosted.convert('1')  # binarise

def getPoint(im, end=False):
    """Return the horizontal bound of the inked area of a binarised image.

    Columns are scanned for a pixel whose value is 0 (ink).

    Args:
        im: Image exposing ``size`` and ``load()``; the object returned by
            ``load()`` is indexed as ``pixels[x, y]``.
        end: When false, scan left to right and return the index of the
            first column containing ink.  When true, scan right to left
            and return the index of the last inked column plus one
            (an exclusive right bound).

    Returns:
        The column bound described above, or None when no pixel is 0.
    """
    pixels = im.load()
    w, h = im.size
    # The right-to-left scan must include column 0; the previous
    # range(w-1, 0, -1) skipped it, so ink confined to the first column
    # was never found.
    columns = range(w - 1, -1, -1) if end else range(w)
    for x in columns:
        for y in range(h):
            if pixels[x, y] == 0:
                return x + 1 if end else x
    return None

def getVerticalProjection(im):
    """Compute the vertical projection (ink pixels per column) of ``im``.

    Only the columns between the bounds reported by ``getPoint`` are
    measured.

    Args:
        im: Binarised image exposing ``size`` and ``load()``; a pixel
            value of 0 counts as ink.

    Returns:
        A tuple ``(start_x, end_x, graph)`` where ``graph[i]`` is the number
        of ink pixels in column ``start_x + i`` and ``end_x`` is exclusive.
    """
    pixels = im.load()
    height = im.size[1]
    left = getPoint(im)
    right = getPoint(im, end=True)
    # One entry per column: how many of its pixels are ink.
    profile = [
        sum(1 for y in range(height) if pixels[x, y] == 0)
        for x in range(left, right)
    ]
    return left, right, profile

def showVerticalProjection(graph):
    """Display the vertical projection as a bar chart (debugging aid).

    A mode '1' image is created with one column per entry of ``graph``;
    each column gets a bar of exactly ``graph[x]`` pixels, the image is
    flipped top-to-bottom and shown with ``Image.show()``.

    Args:
        graph: Sequence of non-negative per-column ink counts, as returned
            by ``getVerticalProjection``.

    Returns:
        None.  Nothing is shown when ``graph`` is empty or all zero.
    """
    if not graph:
        return  # nothing to draw; max() of an empty sequence would raise
    w = len(graph)
    h = max(graph)
    if h <= 0:
        return  # every column is empty, so the chart would have no height
    img = Image.new('1', (w, h))
    for x in range(w):
        # The bar height equals the count.  The previous `y <= graph[x]`
        # test drew one pixel too many, e.g. a dot for empty columns.
        for y in range(graph[x]):
            img.putpixel((x, y), 255)
    # Bars were drawn from row 0 downwards; flip so they rise from the bottom.
    img = img.transpose(Image.FLIP_TOP_BOTTOM)
    img.show()

def cut(start_x, end_x, graph, im):
    """Split the image into one cropped image per character.

    The vertical projection is used to find runs of inked columns; each run
    is cropped out of ``im`` and then trimmed vertically to the rows that
    contain ink.

    Args:
        start_x: Column index that ``graph[0]`` corresponds to.
        end_x: Bound appended after the last boundary found in ``graph``.
        graph: Per-column ink counts (see ``getVerticalProjection``).
        im: Binarised image exposing ``size`` and ``crop()``; the crops must
            expose ``size``, ``load()`` and ``crop()``; value 0 is ink.

    Returns:
        A list of cropped images, one per run of inked columns.
    """
    # Record every column where the projection switches between empty and
    # non-empty.  graph[idx - 1] wraps round to the last entry for idx == 0.
    bounds = [start_x]
    for idx, count in enumerate(graph):
        if (count != 0) != (graph[idx - 1] != 0):
            bounds.append(idx + start_x)
    bounds.append(end_x)

    # Consecutive boundaries alternate between character runs and gaps;
    # keeping every second span drops the gaps (the zero valleys).
    spans = [(left, right - 1) for left, right in zip(bounds, bounds[1:])]
    spans = spans[::2]

    # NOTE(review): the box passed to crop() ends at `next boundary - 1`; if
    # crop() treats the right/lower edge as exclusive (as PIL documents),
    # the last inked column and row are left out -- confirm this is intended.
    height = im.size[1]
    columns = [im.crop((left, 0, right, height)) for left, right in spans]

    # After the vertical cuts, trim each piece horizontally (top and bottom).
    pieces = []
    for piece in columns:
        w, h = piece.size
        pixels = piece.load()
        # First row from the top that contains ink.
        for y in range(h):
            if any(pixels[x, y] == 0 for x in range(w)):
                start_y = y
                break
        # First row from the bottom that contains ink.
        for y in reversed(range(h)):
            if any(pixels[x, y] == 0 for x in range(w)):
                end_y = y
                break
        pieces.append(piece.crop((0, start_y, w, end_y)))
    return pieces

def unique(s):
    """Collapse every run of equal adjacent characters into one character.

    For example ``'0007700'`` becomes ``'070'``.

    Args:
        s: String to collapse.

    Returns:
        The collapsed string; an empty input yields ``''`` (previously an
        empty string raised IndexError).
    """
    if not s:
        return ''
    kept = [s[0]]
    for ch in s[1:]:
        if ch != kept[-1]:
            kept.append(ch)
    # Join once instead of growing the string character by character.
    return ''.join(kept)

def otsu(chars, debug):
    """Classify each character image by its shape and return the letters.

    For every image in ``chars`` a label map is built in ``result`` (a pixel
    access object of a copy of the image).  Ink pixels (value 0) keep the
    label 0.  Every other pixel is labelled from the number of scan
    directions, out of eight, in which an ink pixel is found:

    * 8 -- ink found in all eight directions (treated as the inside of a loop)
    * 7 -- ink found in 5, 6 or 7 directions (treated as a concave region)
    * 4 -- anything else

    A hand-tuned decision tree over the counts and positions of the 0/7/8
    labels then appends one lowercase letter to the returned string.  A few
    branches can finish without appending anything (see the b/d branch).

    NOTE(review): despite the name, no thresholding is performed here.
    NOTE(review): expressions such as ``w/2`` and ``h/3`` are used as indices
    and range bounds, so the code relies on Python 2 integer division.

    Args:
        chars: Iterable of images exposing ``load()``, ``copy()`` and
            ``size`` (presumably the crops returned by ``cut`` -- confirm).
        debug: Forwarded to ``printf``; when true every recognised letter is
            printed as it is found.

    Returns:
        The string of recognised letters.

    Side effects:
        Rebinds the module-level globals ``result``, ``w`` and ``h``.
    """
    vcode = '' # return value: the recognised letters, in order
    # Extract the concave regions and the loops (enclosed regions)
    for im in chars:
        # Declared global so that the nested helpers defined further down
        # (has_7, has_2_part, has_zreo) can read them.
        global result, w, h
        pixels = im.load()
        result = im.copy().load()
        w, h = im.size
        for y in range(h):
            for x in range(w):
                count = 0
                # Scan in eight directions:
                lst = (range(x + 1, w), range(x)) # east, west
                for rng in lst:
                    for i in rng:
                        if pixels[i, y] == 0: # black (ink)
                            count += 1
                            break
                lst = (range(y + 1, h), range(y)) # south, north
                for rng in lst:
                    for j in rng:
                        if pixels[x, j] == 0:
                            count += 1
                            break
                p = ((1, -1), (1, 1), (-1, 1), (-1, -1)) # north-east, south-east, south-west, north-west
                for m1, m2 in p:
                    # i starts at 0, so the first sample on each diagonal is
                    # (x, y) itself.  The scan stops at the first ink pixel or
                    # when indexing raises (presumably once the coordinates
                    # leave the image -- confirm how negative indices behave).
                    try:
                        for i in range(999):
                            if pixels[x + i * m1, y + i * m2] == 0:
                                count += 1
                                break
                    except:
                        pass
                # Collapse the hit count into one of three labels: 8, 7 or 4.
                count = count == 8 and 8 or (7 if 4 < count < 8 else 4)
                # count == 8: loop (enclosed) region
                # count 5, 6 or 7: concave region
                if result[x, y] != 0:
                    result[x, y] = count
                else: # an isolated 0 takes the value of a horizontal neighbour (the left one is used)
                    if 0<x<w-1:
                        if result[x - 1, y] != 0 and result[x+1, y] != 0:
                            result[x, y] = result[x-1, y]
                #end for x
        # end for y
        for y in range(h):
            for x in range(w):
                if result[x, y] == 8 and 0<x<w-1:
                    if result[x - 1, y] != 8 and result[x+1, y] != 8: # an isolated 8 takes the value of its left neighbour
                        result[x, y] = result[x-1, y]
                if result[x, y] == 7 and 0<x<w-1:
                    if result[x - 1, y] != 7 and result[x+1, y] != 7: # an isolated 7 takes the value of its left neighbour
                        result[x, y] = result[x-1, y]

##        # An 8 should have no 7 among its eight neighbours; otherwise the 8 should become 7
##        # Very effective for a slanted 'm'
##        for y in range(h-1, -1, -1):
##            for x in range(w-1, -1, -1):
##                if result[x, y] == 8 and 0<x<w-1 and 0<y<h-1:
##                    if result[x-1, y-1] == 7 or result[x-1, y] == 7 or result[x, y-1] == 7 or result[x+1, y] == 7 or \
##                       result[x, y+1] == 7 or result[x+1, y+1] == 7 or result[x+1, y-1] == 7 or result[x-1, y+1] == 7:
##                        result[x, y] = 7

        # When the interference line covers the bottom of a 'b' it is classified as 'h',
        # so the interference-line removal still needs improving.
        # Similarly an 'o' may be recognised as 'n' or 'u', and so on.
        num_8 = 0
        for y in range(h):
            for x in range(w):
                if result[x, y] == 8:
                    num_8 += 1
        # Slanted characters such as 'n' or 'u' can contain a small region of 8s.
        # If the number of 8s is below some limit (say 15) treat them as 5/6/7, i.e. concave.
        # Observation shows a real loop has many 8s (more than 50); the limit used here is 30...
        has_circle = True # has a loop
        if num_8 < 30:
            has_circle = False # no loop
            for y in range(h):
                for x in range(w):
                    if result[x, y] == 8:
                        result[x, y] = 7

        if has_circle:
            def has_7():
                # Returns the number of cells labelled 7 (0 when there are none)
                num_7 = 0
                for y in range(h):
                    for x in range(w):
                        if result[x, y] == 7:
                            num_7 += 1
                return num_7
            if has_7() < 6: # o
                # An 'o' sometimes has a few 7s; fewer than 6 still counts as 'o'
                # Rule out an 'e' caused by the interference line (ink at the centre cell)
                if result[w/2, h/2] == 0:
                    vcode += 'e'
                    printf('e', debug)
                else:
                    # In the last 3 rows an 'o' has more 0s than 'p' or 'q'
                    # (every non-zero label is counted as '7' here)
                    s = ''
                    for y in range(h - 3, h):
                        for x in range(w):
                            t = str(result[x, y])
                            t = t!='0' and '7' or '0'
                            s += t
                    if s.count('0') > s.count('7'): # o
                        vcode += 'o'
                        printf('o', debug)
                    else: # pq
                        # 0s outnumbering 7s in the bottom-right 5x4 area means 'q'
                        s = ''
                        for y in range(h - 5, h):
                            for x in range(w - 4, w):
                                t = str(result[x, y])
                                t = t!='0' and '7' or '0'
                                s += t
                        if s.count('0') > s.count('7'): # q, otherwise p
                            vcode += 'q'
                            printf('q', debug)
                        else:
                            vcode += 'p'
                            printf('p', debug)
            else:
                num_8_up = 0 # number of 8s in the upper half
                num_8_down = 0# number of 8s in the lower half
                for y in range(h/2):
                    for x in range(w):
                        if result[x, y] == 8:
                            num_8_up += 1
                for y in range(h/2, h):
                    for x in range(w):
                        if result[x, y] == 8:
                            num_8_down += 1
                if num_8_up > num_8_down: # loop in the upper half: 'egpq'
                    num_8 = 0 # 8s in the whole character: fewer than 50 means 'e', otherwise 'gpq'
                    for y in range(h):
                        for x in range(w):
                            if result[x, y] == 8:
                                num_8 += 1
                    if num_8 < 50: # e
                        # Original note: pass a horizontal line through at 2/3 of the height;
                        # meeting a 07070 pattern means 'm'.  (The loop below checks every row.)
                        # The bare `raise` inside try/except is used to leave both loops.
                        try:
                            find = False
                            for y in range(h):
                                s = ''
                                for x in range(w):
                                    t = str(result[x, y])
                                    t = t!='0' and '7' or '0'
                                    s += t
                                s =  unique(s)
                                if s.find('07070') != -1:
                                    vcode += 'm'
                                    printf('m', debug)
                                    find = True
                                    raise
                        except:
                            pass
                        if not find:
                            vcode += 'e'
                            printf('e', debug)
                    else: # pgq
                        # Original note: 0s covering at least half of the second-to-last row means 'g'
                        # (the code reads row h - 3)
                        zero = 0
                        y = h - 3
                        for x in range(w):
                            if result[x, y] == 0:
                                zero += 1
                        if zero >= w / 2: # g
                            vcode += 'g'
                            printf('g', debug)
                        else: # pq
                            # 0s outnumbering 7s in the bottom-right 5x4 area means 'q'
                            s = ''
                            for y in range(h - 5, h):
                                for x in range(w - 4, w):
                                    t = str(result[x, y])
                                    t = t!='0' and '7' or '0'
                                    s += t
                            if s.count('0') > s.count('7'): # q, otherwise p
                                vcode += 'q'
                                printf('q', debug)
                            else:
                                vcode += 'p'
                                printf('p', debug)
                else: # loop in the lower half: 'abd'
                    num_8 = 0 # 8s in the whole character: fewer than 55 means 'a', otherwise 'bd'
                    for y in range(h):
                        for x in range(w):
                            if result[x, y] == 8:
                                num_8 += 1
                    if num_8 < 55:
                        vcode += 'a'
                        printf('a', debug)
                    else:
                        # Positional relation between 7 and 0: the first 7 found in the
                        # upper half decides between 'b' and 'd'.  If the upper half has
                        # no 7, nothing is appended for this character.
                        # NOTE(review): x - 1 is -1 when the 7 is in column 0.
                        # The bare `raise` inside try/except is used to leave both loops.
                        try:
                            for y in range(h/2):
                                for x in range(w):
                                    if result[x, y] == 7:
                                        if result[x - 1, y] in (0, 8):
                                            vcode += 'b'
                                            printf('b', debug)
                                        else:
                                            vcode += 'd'
                                            printf('d', debug)
                                        raise
                        except:
                            pass
        else: # no loop: 'cfhijklmnrstuvwxyz'
            # 'i' and 'j' consist of two parts; every other character is a single piece
            def has_2_part():
                # Find the first ink cell (row by row) in the top third, then
                # return True if a later row of the top third contains no ink
                # at all (a gap below the dot), False otherwise.  Falls
                # through (returns None) when the top third has no ink.
                for y in range(h/3):
                    for x in range(w):
                        if result[x, y] == 0:
                            for y in range(y+1, h/3):
                                for x in range(w):
                                    if result[x, y] != 0:
                                        yes = True
                                    else:
                                        yes = False
                                        break
                                if yes:
                                    return True
                            return False
            if has_2_part(): # ij
                # Original note: in the last few columns, a 7 above the 0 means 'i', otherwise 'j'
                # Original note: take the bottom-right 4x4 square
                # (the loops below cover rows h/2 .. h-4 and the last 4 columns)
                s = ''
                for y in range(h/2, h-3):
                    for x in range(w-4, w):
                        t = str(result[x, y])
                        if t != '0':
                            t = '7'
                        s += t
                if s.count('7') > s.count('0'):
                    vcode += 'i'
                    printf('i', debug)
                else:
                    vcode += 'j'
                    printf('j', debug)
            else: #'cfhklmnrstuvwxyz'
                # A 'c' has no 0 from its middle towards the right
                def has_zreo():
                    # True when at least two '0' characters appear in the labels of
                    # rows h/2-3 .. h/2+3, columns w/2-2 .. w-1
                    s = ''
                    for y in range(h/2 - 3, h/2 + 4):
                        for x in range(w/2 - 2, w):
                            s += str(result[x, y])
                    if s.count('0') < 2:
                        return False
                    return True
                if not has_zreo(): # c
                    # Rule out 'r':
                    # the second-to-last row of an 'r' has very few 0s, that of a 'c' has many (more than half the width)
                    s = ''
                    for x in range(w):
                        s += str(result[x, h - 2])
                    if s.count('0') > w / 2:
                        vcode += 'c'
                        printf('c', debug)
                    else:
                        vcode += 'r'
                        printf('r', debug)
                else: # 'fhklmnrstuvwxyz'
                    # Cross the character with horizontal lines; many rows showing 07070 mean 'm' or 'w'
                    find = False
                    num_07070 = 0
                    for y in range(h):
                        s = ''
                        for x in range(w):
                            t = str(result[x, y])
                            t = t!='0' and '7' or '0'
                            s += t
                        s =  unique(s)
                        if s.find('07070') != -1:
                            num_07070 += 1
                    if num_07070 >= 3: # m/w; with fewer rows it might be a right-slanted 'f'
                        # Compare the numbers of 7s and 0s at the top middle of the character:
                        # more 7s than 0s: w
                        # fewer 7s than 0s: m
                        s = ''
                        for y in range(5):
                            for x in range(w/2 - 2, w/2 + 3):
                                t = str(result[x, y])
                                t = t!='0' and '7' or '0'
                                s += t
                        if s.count('7') > s.count('0'):
                            vcode += 'w'
                            printf('w', debug)
                        else:
                            vcode += 'm'
                            printf('m', debug)
                        find = True
                    else:
                        find = False
                    if not find: # fhklnrstuvxyz
                        # Cross the character with horizontal lines; if more than 6 rows collapse
                        # (via unique) to something containing 070 it is one of h, k, n, u, v, x, y,
                        # otherwise one of f, l, r, s, t, z
                        num_070 = 0
                        for y in range(h):
                            s = ''
                            for x in range(w):
                                t = str(result[x, y])
                                t = t!='0' and '7' or '0'
                                s += t
                            s =  unique(s)
                            #print s
                            if s.find('070') != -1:
                                num_070 += 1
                        if num_070 < 6: # 'flrstz'
                            # An 's' has many 0s in the lower half of its last few columns
                            s = ''
                            for y in range(h/2, h):
                                for x in range(w- 3, w):
                                    t = str(result[x, y])
                                    t = t!='0' and '7' or '0'
                                    s += t
                            if s.count('0') > s.count('7'): # s
                                vcode += 's'
                                printf('s', debug)
                            else: # 'flrtz'
                                # Split the character into 2x3 blocks; if the north-east block has
                                # at least 3 columns containing 070 it is 'f'
                                num_070 = 0
                                for x in range(w/2, w):
                                    s = ''
                                    for y in range(2*h/3):
                                        t = str(result[x, y])
                                        t = t!='0' and '7' or '0'
                                        s += t
                                    s = unique(s)
                                    if s.find('070') != -1:
                                        num_070 += 1
                                if num_070 >= 3: # f
                                    vcode += 'f'
                                    printf('f', debug)
                                else: # lrtz
                                    # Few 0s in the bottom-right 6x3 area means 'r'
                                    # TODO: this may also be a right-slanted 't' -- unfinished
                                    s = ''
                                    for y in range(h - 3, h):
                                        for x in range(w - 6, w):
                                            t = str(result[x, y])
                                            t = t!='0' and '7' or '0'
                                            s += t
                                    if s.count('7') > s.count('0'): # r
                                        vcode += 'r'
                                        printf('r', debug)
                                    else: # ltz
                                        # A width below 10 means a right-slanted 'l'
                                        if w < 10:
                                            vcode += 'l'
                                            printf('l', debug)
                                        else:
                                            # 0s outnumbering 7s in the top-left area (4 rows x 8 columns) means 'l', otherwise 't'
                                            # 'z' is not considered because it practically never occurs
                                            s = ''
                                            for y in range(4):
                                                for x in range(8):
                                                    t = str(result[x, y])
                                                    t = t!='0' and '7' or '0'
                                                    s += t
                                            if s.count('0') > s.count('7'): # l
                                                vcode += 'l'
                                                printf('l', debug)
                                            else:
                                                vcode += 't'
                                                printf('t', debug)
                        else: # 'hknuvxy'
                            # An 'n' has ink at its top middle and the others do not,
                            # e.g. more than 3 ink cells inside the middle 3x3 area
                            s = ''
                            for y in range(1, 4):
                                for x in range(w/2 - 1, w/2 + 2):
                                    s += str(result[x, y])
                            if s.count('0') > 3: # n
                                # Rule out 'h':
                                # an 'h' has almost no 0 in its top-right 5x5 area
                                num_0 = 0
                                for y in range(5):
                                    for x in range(w-5, w):
                                        if result[x, y] == 0:
                                            num_0 += 1
                                if num_0 > 2: # n
                                    vcode += 'n'
                                    printf('n', debug)
                                else:
                                    vcode += 'h'
                                    printf('h', debug)
                            else:
                                # In the right half of 'k' and 'x' (splitting vertically) about 5
                                # columns contain 070; >= 3 is used here
                                num_070 = 0
                                for x in range(w/2, w):
                                    s = ''
                                    for y in range(h):
                                        t = str(result[x, y])
                                        t = t!='0' and '7' or '0'
                                        s += t
                                    s = unique(s)
                                    if s.find('070') != -1:
                                        num_070 += 1
                                if num_070 >= 3: # kx
                                    # It may also be a 'y': check whether the fifth-to-last row
                                    # contains 070; if it does not, it is 'y'
                                    s = ''
                                    for x in range(w):
                                        t = str(result[x, h - 5])
                                        t = t!='0' and '7' or '0'
                                        s += t
                                    s = unique(s)
                                    if s.find('070') != -1:
                                        # In the left half of an 'x' (splitting vertically) about 5
                                        # columns contain 070; >= 3 is used here
                                        # A 7 that is isolated vertically takes the value of the cell below it
                                        for x in range(w):
                                            for y in range(h):
                                                if result[x, y] == 7 and 0<y<h-1:
                                                    if result[x, y-1] != 7 and result[x, y+1] != 7: 
                                                        result[x, y] = result[x, y+1]
                                        num_070 = 0
                                        for x in range(w/2):
                                            s = ''
                                            for y in range(h):
                                                t = str(result[x, y])
                                                t = t!='0' and '7' or '0'
                                                s += t
                                            s = unique(s)
                                            if s.find('070') != -1:
                                                num_070 += 1
                                        if num_070 >= 3: # x
                                            vcode += 'x'
                                            printf('x', debug)
                                        else:
                                            vcode += 'k'
                                            printf('k', debug)
                                    else:
                                        vcode += 'y'
                                        printf('y', debug)
                                else: # 'huvy'
                                    # The upper half of an 'h' has several rows with many 0s, with 0s in the middle
                                    # The lower half of an 'h' has no 0 in its middle column
                                    num_0 = 0
                                    line = -1
                                    for y in range(h/2):
                                        s = ''
                                        for x in range(w):
                                            t = str(result[x, y])
                                            t = t!='0' and '7' or '0'
                                            s += t
                                        if s.count('0') > w/2:
                                            num_0 += 1
                                            line = y
                                    #(0 in (result[w/2, line - 1], result[w/2, line], result[w/2, line - 2])) and 
                                    if line > 0 and num_0 > 2 and (
                                            0 not in (result[w/2, h-1], result[w/2, h-2], result[w/2, h-3], result[w/2, h-4],
                                                      result[w/2, h-5], result[w/2, h-6])):
                                        vcode += 'h'
                                        printf('h', debug)
                                    else:
                                        # In the lower part of a 'y' a 7 is met before a 0
                                        # (rows that contain 070 are skipped)
                                        num_70 = 0
                                        for y in range(2*h/3, h):
                                            s = ''
                                            for x in range(w):
                                                s += str(result[x, y])
                                            s = unique(s)
                                            if s.find('070') != -1:
                                                continue
                                            if s.find('70') != -1:
                                                num_70 += 1
                                        if num_70 > 2:
                                            vcode += 'y'
                                            printf('y', debug)
                                        else:
                                            # A 7 in the fifth-to-last row means 'u'
                                            y = h - 5
                                            u = False
                                            for x in range(w):
                                               if result[x, y] == 7:
                                                   u = True
                                                   break
                                            if u:
                                                vcode += 'u'
                                                printf('u', debug)
                                            else:
                                                vcode += 'v'
                                                printf('v', debug)

        # The rest of the loop body is unreachable because of this `continue`;
        # it is a debugging aid that dumps the label grid row by row.
        continue

        for y in range(h):
            for x in range(w):
                print result[x, y],
            print
        print
    return vcode

def getCode(filename):
    """Recognise the CAPTCHA image stored at ``filename``.

    The image is opened, cleaned and binarised by ``eraseGridAndArc``,
    split into character images by ``getVerticalProjection`` and ``cut``,
    and classified by ``otsu`` with debug output switched off.

    Args:
        filename: Path handed to ``Image.open``.

    Returns:
        The string returned by ``otsu``.
    """
    # Remove the background and the arc and convert to a binary image.
    binary = eraseGridAndArc(Image.open(filename))
    left, right, projection = getVerticalProjection(binary)
    pieces = cut(left, right, projection, binary)
    # Debugging aid: call .show() on the pieces to inspect the segmentation.
    return otsu(pieces, False)

if __name__ == '__main__':
    # Recognise the sample image and print the result.  The parenthesised
    # single-argument form gives the same output under the Python 2 print
    # statement and the Python 3 print function.
    print(getCode('./0.png'))
##    The code below renames the 100 sample images after their recognised
##    text; when the name already exists a random suffix is appended so that
##    the existing image is not overwritten.
##    import os, glob
##    import random
##    for f in glob.glob('*.png'):
##        try:
##            im = Image.open(f)
##            im = eraseGridAndArc(im) # remove background and arc, convert to a binary image
##            start_x, end_x, graph = getVerticalProjection(im)
##            chars = cut(start_x, end_x, graph, im)
##            vcode = otsu(chars, False)
##            print vcode
##            if os.path.exists(vcode + '.png'):
##                os.rename(f, vcode + ' - %f.png' % random.random())
##            else:
##                os.rename(f, vcode + '.png')
##        except:
##            print 'error occurs when otsu %s' % f

最后来几个效果图:

单个字母识别率还是挺高的,单词组合起来就不太理想了,提升空间很大。


第3个好像也是错的

第3篇:http://blog.csdn.net/bh20077/article/details/7311183

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值