PaddleOCR自制身份证数据集训练用于身份证识别

仅供学习交流。

其中,中文使用方正黑体(方正黑体.ttf),身份证号码使用 OCR-B 10 BT.ttf 字体。
add_txt() 中的 size(字体大小)以及 draw_x、draw_y(文字坐标)需要根据自己的模板图片进行调整。

import os
import cv2
import random
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageDraw, ImageFont


def mkdir(path):
    """Create directory *path* (including missing parents) unless it exists.

    Nothing happens when *path* already exists, whether it is a directory
    or any other kind of filesystem entry.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)


class Person:
    """Container for the text fields printed on one synthetic ID card.

    Every constructor argument is converted with ``str``. All attributes
    except ``public`` are stored as 1-tuples holding that string (the
    drawing code in this file reads them via ``[0]`` and joins them with
    ``to_str``); ``public`` is stored as a plain string.
    """

    def __init__(self, name, sex, national, years, month, day, address1, address2, id_card, public):
        # 1-tuple fields: keep the tuple shape, other code indexes [0].
        self.name = (str(name),)
        self.sex = (str(sex),)
        self.national = (str(national),)
        self.years = (str(years),)
        self.month = (str(month),)
        self.day = (str(day),)
        self.address1 = (str(address1),)
        self.address2 = (str(address2),)
        self.id_card = (str(id_card),)
        # The issuing authority is the only field kept as a bare string.
        self.public = str(public)
        
        
def first_name():
    """Return one randomly chosen surname.

    Reads ``first_name.txt`` under ``ROOT`` (one entry per line; the
    original note describes it as the Hundred Family Surnames list) on
    every call and returns a randomly selected line without its trailing
    newline.
    """
    surnames = []
    with open(ROOT + 'first_name.txt', 'r', encoding='utf-8') as f:
        for raw_line in f:
            surnames.append(raw_line.rstrip('\n'))

    return surnames[random.randrange(len(surnames))]


def name():
    """Return one randomly chosen entry from ``name.txt``.

    The file lives under ``ROOT`` and holds one entry per line (the
    original note says it is modelled on the Chinese character list that
    ships with PaddleOCR). It is re-read on every call.
    """
    with open(ROOT + 'name.txt', 'r', encoding='utf-8') as f:
        candidates = [entry.rstrip('\n') for entry in f]

    pick = random.randrange(len(candidates))
    return candidates[pick]


def GBK2312():
    """Return one random Chinese character decoded from a GB2312 byte pair.

    The lead byte is drawn from 0xB0-0xF7 and the trail byte from
    0xA1-0xF9. Per the original author's note, the trail range is cut
    short because the last five positions of zone 55 are not valid
    characters.
    """
    lead = random.randint(0xb0, 0xf7)
    trail = random.randint(0xa1, 0xf9)
    return bytes((lead, trail)).decode('gb2312')


def second_name():
    """Return the optional middle part of a generated name.

    A candidate character is always fetched with ``name()`` first, then a
    roll in 0..30 decides the result:

    * 0-14  -> the candidate character
    * 15-28 -> the empty string (no middle character)
    * 29-30 -> ``'\u00B7'``, the middle dot used in Xinjiang names
    """
    candidate = name()
    roll = random.randint(0, 30)

    if roll < 15:
        return candidate
    if roll < 29:
        return ''
    return '\u00B7'


def last_name():
    """Return a random final character for a generated name (via ``name()``)."""
    final_char = name()
    return final_char


def create_name():
    """Build a random full name: surname + optional middle part + final character."""
    parts = [first_name(), second_name(), last_name()]
    return parts[0] + parts[1] + parts[2]


def sex_word():
    """Return ``'男'`` or ``'女'`` chosen at random."""
    pick = random.randint(0, 1)
    return '女' if pick else '男'


def national_name():
    """Return one randomly chosen ethnicity name.

    Reads ``nation.txt`` under ``ROOT`` (one entry per line) on every
    call and returns a randomly selected line without its trailing
    newline.
    """
    with open(ROOT + 'nation.txt', 'r', encoding='utf-8') as f:
        nations = [row.rstrip('\n') for row in f]

    chosen = random.randrange(len(nations))
    return nations[chosen]


def address_line1():
    """Return the first address line: a province/city prefix padded to 11 characters.

    A prefix is picked at random from ``address.txt`` under ``ROOT`` (one
    entry per line; the file may be replaced with a custom list). If the
    prefix is shorter than 11 characters it is padded with random
    characters from ``name()``; longer prefixes are returned unchanged.

    Raises:
        ValueError: if ``address.txt`` contains no lines.
    """
    with open(ROOT + 'address.txt', 'r', encoding='utf-8') as f:
        prefixes = [line.rstrip('\n') for line in f]

    # Index over the real line count instead of a hard-coded 0..327, so a
    # shorter custom file cannot raise IndexError and a longer one is
    # used in full.
    first_line = prefixes[random.randint(0, len(prefixes) - 1)]

    # The first address line on the card holds 11 characters.
    missing = 11 - len(first_line)
    for _ in range(missing):
        first_line += name()

    return first_line


def address_line2():
    """Return the second address line: 5 to 7 random characters from ``name()``."""
    length = random.randint(5, 7)
    return ''.join(name() for _ in range(length))


def random_id_card():
    """Return a random 18-character ID-card number string.

    The first 17 characters are random digits. The last character is
    ``'X'`` when a roll in 0..10 equals 10, otherwise another random
    digit; no checksum is computed.
    """
    digits = [str(random.randint(0, 9)) for _ in range(17)]

    if random.randint(0, 10) == 10:
        tail = 'X'
    else:
        tail = str(random.randint(0, 9))

    return ''.join(digits) + tail


def public():
    """Return a random issuing-authority string for the national-emblem side.

    The result is 3 to 6 random characters from ``name()`` followed by
    the fixed suffix ``'公安局'``.
    """
    prefix_len = random.randint(3, 6)
    prefix = ''.join(name() for _ in range(prefix_len))
    return prefix + '公安局'


def to_str(per):
    """Concatenate the strings in *per* (a tuple of strings, or a plain string) into one string."""
    return ''.join(per)


# Text-drawing helper
def add_txt(image, size, draw_x, draw_y, txt, Font='方正黑体.ttf'):
    """Draw *txt* in black onto *image* and return the same image object.

    Args:
        image: PIL image to draw on (modified in place).
        size: font size passed to ``ImageFont.truetype``.
        draw_x, draw_y: coordinates passed to ``ImageDraw.text``.
        txt: the text to render.
        Font: font file name, loaded from ``ROOT + 'IDTemplate/'``.
    """
    font_path = ROOT + 'IDTemplate/%s' % Font
    font = ImageFont.truetype(font_path, size)
    ImageDraw.Draw(image).text((draw_x, draw_y), txt, font=font, fill=(0, 0, 0))

    return image
 

def draw_txt(ori_image, img):
    """Composite the dark pixels of *img* onto *ori_image*.

    Args:
        ori_image: the template as an OpenCV BGR array (callers pass the
            result of ``cv2.imread``).
        img: the PIL RGB image with the text already drawn on it.

    Returns:
        A BGR array: pixels of the rendered image where its blurred
        grayscale value is at or below the threshold, pixels of
        *ori_image* everywhere else.
    """
    rendered_bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    rendered_gray = cv2.cvtColor(rendered_bgr, cv2.COLOR_BGR2GRAY)

    # A light Gaussian blur softens the glyph edges before thresholding.
    blurred = cv2.GaussianBlur(rendered_gray, (3, 3), 0)

    # Binarise: pixels brighter than 200 become 255, the rest 0.
    _, light_mask = cv2.threshold(blurred, 200, 255, cv2.THRESH_BINARY)
    dark_mask = cv2.bitwise_not(light_mask)

    # Keep the rendered image only where it is dark (the text strokes).
    text_part = cv2.bitwise_and(rendered_bgr, rendered_bgr, mask=dark_mask)
    # Keep the original template everywhere else.
    template_part = cv2.bitwise_and(ori_image, ori_image, mask=light_mask)

    # The two masks are complementary, so adding the parts merges them.
    return cv2.add(text_part, template_part)

# Photo (portrait) side
def make_maskA(person, nums, template_path, output_path):
    """Render the portrait side of one ID card and write it to disk.

    A template ``IDA1.jpg`` or ``IDA2.jpg`` is picked at random from
    *template_path*, the fields of *person* are drawn onto it, and the
    result is saved as ``A_<nums>.jpg`` in *output_path*.

    Args:
        person: a ``Person`` whose fields (except ``public``) are 1-tuples
            of strings.
        nums: string used in the output file name.
        template_path: directory holding the ``IDA*.jpg`` templates.
        output_path: directory the image is written to.

    Raises:
        FileNotFoundError: if the chosen template image cannot be read.
    """
    template_file = '%s/IDA{}.jpg'.format(random.randint(1, 2)) % template_path
    ori_image = cv2.imread(template_file)
    if ori_image is None:
        # cv2.imread signals failure by returning None; fail here with a
        # clear message instead of a cryptic error in cvtColor below.
        raise FileNotFoundError('cannot read template image: %s' % template_file)

    # Text must be drawn with PIL: OpenCV garbles Chinese characters.
    img = Image.fromarray(cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB))
    img = add_txt(img, 58, 210, 95, to_str(person.name))
    img = add_txt(img, 50, 210, 205, to_str(person.sex))
    img = add_txt(img, 50, 490, 205, to_str(person.national))
    img = add_txt(img, 50, 210, 308, to_str(person.years))

    # Two-digit values start further left than one-digit values.
    if int(person.month[0]) > 9:
        img = add_txt(img, 50, 410, 308, to_str(person.month))
    else:
        img = add_txt(img, 50, 425, 308, to_str(person.month))

    if int(person.day[0]) > 9:
        img = add_txt(img, 50, 535, 308, to_str(person.day))
    else:
        # NOTE(review): size 52 here versus 50 for every other date part;
        # possibly a typo, kept unchanged.
        img = add_txt(img, 52, 550, 308, to_str(person.day))

    img = add_txt(img, 50, 210, 415, to_str(person.address1))
    img = add_txt(img, 50, 210, 485, to_str(person.address2))
    img = add_txt(img, 58, 420, 682, to_str(person.id_card), 'OCR-B 10 BT.ttf')

    final = draw_txt(ori_image, img)
    cv2.imwrite('%s/A_{}.jpg'.format(nums) % output_path, final)
    
# National-emblem side
def make_maskB(person, nums, template_path, output_path):
    """Render the national-emblem side of one ID card and write it to disk.

    A template ``IDB1.jpg`` or ``IDB2.jpg`` is picked at random from
    *template_path*; the issuing authority and a validity period are
    drawn onto it and the result is saved as ``B_<nums>.jpg`` in
    *output_path*. The period starts at ``person.years`` and ends 10 to
    30 years later (random), with the same zero-padded month and day.

    Args:
        person: a ``Person`` whose ``years``/``month``/``day`` are
            1-tuples of strings and whose ``public`` is a string.
        nums: string used in the output file name.
        template_path: directory holding the ``IDB*.jpg`` templates.
        output_path: directory the image is written to.

    Raises:
        FileNotFoundError: if the chosen template image cannot be read.
    """
    template_file = '%s/IDB{}.jpg'.format(random.randint(1, 2)) % template_path
    ori_image = cv2.imread(template_file)
    if ori_image is None:
        # cv2.imread signals failure by returning None; fail here with a
        # clear message instead of a cryptic error in cvtColor below.
        raise FileNotFoundError('cannot read template image: %s' % template_file)

    img = Image.fromarray(cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB))
    img = add_txt(img, 48, 515, 583, to_str(person.public))

    # Month and day are printed as two digits on this side.
    month = to_str(person.month).zfill(2)
    day = to_str(person.day).zfill(2)

    start_year = to_str(person.years)
    end_year = str(int(person.years[0]) + random.randint(10, 30))
    date = start_year + '.' + month + '.' + day + '-' + end_year + '.' + month + '.' + day
    img = add_txt(img, 48, 515, 688, date)

    final = draw_txt(ori_image, img)
    cv2.imwrite('%s/B_{}.jpg'.format(nums) % output_path, final)
    
    
# Dataset root used by every helper in this file. It is defined at module
# level (not inside the main guard) so the helpers also work when this
# file is imported rather than run as a script.
ROOT = '../PaddleOCR-2.7.1/dataset/'


if __name__ == '__main__':
    import calendar

    template_path = ROOT + 'IDTemplate'  # directory holding the templates
    output_path = ROOT + 'test'    # train or test
    mkdir(template_path)
    mkdir(output_path)

    for i in tqdm(range(0, 1000)):
        year = random.randint(1940, 2024)
        month = random.randint(1, 12)
        # Limit the day to the real length of the month so impossible
        # dates such as February 31 are never generated.
        day = random.randint(1, calendar.monthrange(year, month)[1])

        person = Person(name=create_name(), sex=sex_word(), national=national_name(),
                        years=year, month=month, day=day,
                        address1=address_line1(), address2=address_line2(),
                        id_card=random_id_card(), public=public())

        sample_id = str(i).zfill(5)
        make_maskA(person, sample_id, template_path, output_path)
        make_maskB(person, sample_id, template_path, output_path)
参考链接

https://aistudio.baidu.com/projectdetail/2338420

对照身份证前6位,整理了6457条数据.形成了省市区县3级地区表 如下案例. INSERT INTO `area` VALUES ('1', '0', '1', '11', '1100', '110000', '北京市'); INSERT INTO `area` VALUES ('2', '1', '2', '11', '1101', '110100', '市辖区'); INSERT INTO `area` VALUES ('3', '2', '3', '11', '1101', '110101', '东城区'); INSERT INTO `area` VALUES ('4', '2', '3', '11', '1101', '110102', '西城区'); INSERT INTO `area` VALUES ('5', '2', '3', '11', '1101', '110103', '崇文区'); INSERT INTO `area` VALUES ('6', '2', '3', '11', '1101', '110104', '宣武区'); INSERT INTO `area` VALUES ('7', '2', '3', '11', '1101', '110105', '朝阳区'); INSERT INTO `area` VALUES ('8', '2', '3', '11', '1101', '110106', '丰台区'); INSERT INTO `area` VALUES ('9', '2', '3', '11', '1101', '110107', '石景山区'); INSERT INTO `area` VALUES ('10', '2', '3', '11', '1101', '110108', '海淀区'); INSERT INTO `area` VALUES ('11', '2', '3', '11', '1101', '110109', '门头沟区'); INSERT INTO `area` VALUES ('12', '2', '3', '11', '1101', '110110', '燕山区'); INSERT INTO `area` VALUES ('13', '2', '3', '11', '1101', '110111', '房山区'); INSERT INTO `area` VALUES ('14', '2', '3', '11', '1101', '110112', '通州区'); INSERT INTO `area` VALUES ('15', '2', '3', '11', '1101', '110113', '顺义区'); INSERT INTO `area` VALUES ('16', '2', '3', '11', '1101', '110114', '昌平区'); INSERT INTO `area` VALUES ('17', '2', '3', '11', '1101', '110115', '大兴区'); INSERT INTO `area` VALUES ('18', '2', '3', '11', '1101', '110116', '怀柔区'); INSERT INTO `area` VALUES ('19', '2', '3', '11', '1101', '110117', '平谷区');
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值