Python 正则表达式相关内容——看了代码示例就会
import re
from tools.infer.predict_rec import application
import os
import json
class Invoice(object):
    """Clean up text fields recognised from Chinese invoices.

    Every ``get_*`` method takes one raw string, replaces characters that are
    commonly confused with digits (for example ``O`` -> ``0``, ``l`` -> ``1``),
    removes characters that cannot belong to the field and returns the
    cleaned-up text.  The methods keep no state on the instance.
    """

    _ASCII_DIGITS = "0123456789"

    # Letters treated as mis-read digits by most of the numeric fields.
    _LOOKALIKES = {
        "S": "5", "s": "5",
        "I": "1", "i": "1", "l": "1",
        "Z": "2", "z": "2",
        "O": "0", "o": "0",
    }

    # Dates: drop spaces, remaining ASCII letters and punctuation; the
    # look-alike mappings listed afterwards override the deletions.
    _DATE_TABLE = str.maketrans({
        **dict.fromkeys(
            " QWERTYUIPASDFGHJKLZXCVBNMqwertyuipasdfghjklzxcvbnm"
            "!()-_+=[]{};:'|\\<>?/~`《》."
        ),
        **_LOOKALIKES,
        "B": "8",
        "b": "6",
    })

    # Amounts: several punctuation marks are read as the decimal point.
    # NOTE(review): 'B' maps to '0' here but to '8' in every other field;
    # kept as found - confirm which one is intended.
    _MONEY_TABLE = str.maketrans({
        " ": None,
        "。": ".", ",": ".", "-": ".",
        "B": "0", "b": "6",
        **_LOOKALIKES,
        "C": "0", "c": "0", "n": "0",
    })

    # Invoice number / invoice code.
    _DIGIT_FIELD_TABLE = str.maketrans({
        " ": None, "B": "8", "b": "6", **_LOOKALIKES,
    })

    # Taxpayer identification number: no Z mapping, V/v are dropped.
    _TAX_ID_TABLE = str.maketrans({
        " ": None,
        "i": "1", "I": "1", "l": "1",
        "s": "5", "S": "5",
        "O": "0", "o": "0",
        "V": None, "v": None,
    })
    _TAX_ID_ALLOWED = "QWERTYUIOPASDFGHJKLZXCVBNM0123456789"

    # Bank account number.
    _ACCOUNT_TABLE = str.maketrans({**_LOOKALIKES, "B": "8", "q": "9"})

    # Characters removed from the address / telephone field.
    _INFOR_DROP_TABLE = str.maketrans(
        "", "", "~`!@$%^&*+=\"|{}【】[],。《》<>/\\‘;,:"
    )

    # Tried in order by get_date: fixed-width first, then 1-2 digit month/day.
    _DATE_PATTERNS = (
        re.compile(r"(\d{4}).(\d{2}).(\d{2}).{1}$"),
        re.compile(r"(\d{4}).(\d{1,2}).(\d{1,2}).*$"),
    )
    _MONEY_TAIL = re.compile(r"(\d*\.\d{0,2})$")
    _MONEY_VALUE = re.compile(r"(\d*\.{0,1}\d{0,2})$")
    _FROM_FIRST_DIGIT = re.compile(r"\d+.*$")
    _BANK_NAME = re.compile(r"\D+.\D")

    # Units indexed by (power of ten + 2): index 0 is 分, index 2 is 圆.
    _MONEY_UNITS = ["分", "角", "圆",
                    "拾", "佰", "仟", "万",
                    "拾", "佰", "仟", "亿",
                    "拾", "佰", "仟", "万",
                    "拾", "佰", "仟", "兆"]
    _MONEY_NUMERALS = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

    _CN_DIGITS = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4,
                  '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
    _CN_UNITS = {'分': 0.01, '角': 0.1, '元': 1, '拾': 10,
                 '佰': 100, '仟': 1000, '圆': 1}

    def __init__(self):
        print("chuangjian")

    def get_date(self, data_str):
        """Normalise an issue date to ``YYYY年MM月DD日``.

        Look-alike letters become digits and ASCII letters/punctuation are
        removed.  If the cleaned text ends with four digits, a separator, a
        month, a separator, a day and optional trailing text, the separators
        are rewritten to 年/月/日 and month/day are zero-padded to two digits.
        Otherwise the cleaned text is returned unchanged.
        """
        text = data_str.strip().translate(self._DATE_TABLE)
        for pattern in self._DATE_PATTERNS:
            found = pattern.search(text)
            if found:
                year, month, day = found.groups()
                return "%s年%s月%s日" % (year, month.zfill(2), day.zfill(2))
        return text

    def get_money(self, money_str):
        """Normalise a numeric amount to ``¥<integer part>.<two digits>``.

        A missing integer part becomes ``0`` and the fraction is padded with
        zeros to two digits, so an empty input yields ``¥0.00``.
        """
        text = money_str.strip().translate(self._MONEY_TABLE)
        # Prefer a trailing "<digits>.<0-2 digits>" group when one exists;
        # this is evaluated before the remaining junk is filtered out.
        tail = self._MONEY_TAIL.search(text)
        if tail:
            text = tail.group(1)
        text = "".join(ch for ch in text if ch in "0123456789.")
        # This pattern can match the empty string, so search() always succeeds.
        value = self._MONEY_VALUE.search(text).group(1)
        whole, _, cents = value.partition(".")
        return "¥" + (whole or "0") + "." + cents.ljust(2, "0")

    def get_mf_code(self, pep_num_str):
        """Normalise a taxpayer identification number.

        Returns the text from the first digit onwards, upper-cased and
        restricted to ASCII letters and digits, or ``''`` when the input
        contains no digit.
        """
        text = pep_num_str.strip().translate(self._TAX_ID_TABLE)
        found = self._FROM_FIRST_DIGIT.search(text)
        if not found:
            return ''
        return "".join(
            ch for ch in found.group(0).upper() if ch in self._TAX_ID_ALLOWED
        )

    def get_add_phone(self, imf_str):
        """Unimplemented placeholder; always returns ``None``.

        TODO: implement, or remove in favour of ``get_mf_infor``.
        """
        return None

    def get_num2money(self, change_number):
        """Convert a non-negative amount to capital Chinese currency text.

        ``change_number`` may be an ``int``, a ``float`` or a string; a string
        containing ``.`` is parsed as ``float``, any other string as ``int``.
        The amount is rounded to whole cents before conversion.

        Raises:
            ValueError: the string cannot be parsed, or the amount is
                negative, NaN or too large for the unit table (>= 10**17).
            TypeError: ``change_number`` is not a string or a number.
        """
        units = self._MONEY_UNITS
        numerals = self._MONEY_NUMERALS

        if isinstance(change_number, str):
            try:
                if '.' in change_number:
                    change_number = float(change_number)
                else:
                    change_number = int(change_number)
            except ValueError:
                raise ValueError("%s can't change" % change_number) from None
        if not isinstance(change_number, (int, float)):
            raise TypeError("%s can't change" % (change_number,))
        # The chained comparison is also False for NaN.
        if not 0 <= change_number < 10 ** (len(units) - 2):
            raise ValueError("%s can't change" % change_number)

        # Work on whole cents so that rounding can never carry into a higher
        # digit (dividing and rounding to two places turned 1999 into 2999).
        cents = int(round(change_number * 100))
        digits = [int(ch) for ch in str(cents).zfill(3)]

        pending_zeros = 0  # zero digits skipped since the last emitted word
        start = len(digits) - 3
        words = []
        # i is the power of ten of the current digit; -1 and -2 are 角 and 分.
        for i in range(start, -3, -1):
            digit = digits[start - i]
            unit = units[i + 2]
            if digit > 0 or not words:
                if pending_zeros:
                    words.append(numerals[0])
                    pending_zeros = 0
                words.append(numerals[digit])
                words.append(unit)
            elif i == 0 or (i % 4 == 0 and pending_zeros < 3):
                # Keep 圆 and the 万/亿/兆 group markers for zero digits.
                words.append(unit)
                pending_zeros = 0
            else:
                pending_zeros += 1
        if words[-1] not in (units[0], units[1]):
            # Amounts that do not end in 角 or 分 are closed with 整.
            words.append("整")
        return ''.join(words)

    @classmethod
    def _units_total(cls, section):
        """Sum digit*unit pairs (e.g. 贰拾 -> 20) found in one section."""
        total = 0
        for unit, weight in cls._CN_UNITS.items():
            pos = section.find(unit)
            if pos < 0:
                continue
            if pos == 0:
                # No preceding digit: a leading 拾 means ten, anything else
                # contributes nothing (index -1 must not wrap around).
                if unit == '拾':
                    total += weight
                continue
            digit = cls._CN_DIGITS.get(section[pos - 1])
            if digit is not None:
                total += digit * weight
        return total

    def get_money2num(self, amount):
        """Convert capital Chinese currency text to a number.

        The text is split at the last 亿 and then at the last 万; each part is
        evaluated separately and scaled.  The result is rounded to two
        decimals and is an ``int`` when no fractional unit contributed.
        """
        total = 0
        for marker, scale in (('亿', 100000000), ('万', 10000)):
            if marker not in amount:
                continue
            head, _, amount = amount.rpartition(marker)
            if not head:
                continue
            section = self._units_total(head)
            # A trailing bare digit is the section's ones place (壹佰零伍万).
            section += self._CN_DIGITS.get(head[-1], 0)
            total += section * scale
        total += self._units_total(amount)
        return round(total, 2)

    def _digits_only(self, text):
        """Map look-alike letters to digits and keep ASCII digits only."""
        cleaned = text.strip().translate(self._DIGIT_FIELD_TABLE)
        return "".join(ch for ch in cleaned if ch in self._ASCII_DIGITS)

    def get_fp_number(self, fp_number):
        """Normalise an invoice number to its ASCII digits."""
        return self._digits_only(fp_number)

    def get_fp_code(self, fp_code):
        """Normalise an invoice code to its ASCII digits."""
        return self._digits_only(fp_code)

    def get_mf_name(self, mf_name):
        """Return a buyer/seller name without surrounding or inner spaces."""
        return mf_name.strip().replace(" ", '')

    def get_mf_account(self, mf_account):
        """Split "bank name + account number" text and clean both parts.

        Returns ``"<bank name> <account number>"``; either part may be empty.
        The bank name keeps only characters with a code point of 127 or
        above, the account number keeps only ASCII digits.
        """
        text = mf_account.strip().replace(" ", '')

        bank_name = ''
        name_match = self._BANK_NAME.search(text)
        if name_match:
            bank_name = "".join(
                ch for ch in name_match.group(0) if ord(ch) >= 127
            )

        account_number = ''
        number_match = self._FROM_FIRST_DIGIT.search(text)
        if number_match:
            translated = number_match.group(0).translate(self._ACCOUNT_TABLE)
            account_number = "".join(
                ch for ch in translated if ch in self._ASCII_DIGITS
            )

        return bank_name + " " + account_number

    def get_mf_infor(self, mf_infor):
        """Normalise a buyer/seller "address, telephone" field.

        Spaces are removed, ``_`` becomes ``-``, ``--`` is collapsed to ``-``
        once, and a fixed set of punctuation characters is dropped.
        """
        text = mf_infor.strip().replace(" ", '')
        text = text.replace("_", '-').replace("--", '-')
        return text.translate(self._INFOR_DROP_TABLE)
# Shared Invoice instance used by the module-level wrapper functions in this file.
invoice = Invoice()
# Disabled experiment: normalise amounts taken directly from application().
# inf = application()
# for it in inf:
# money = invoice.get_money(it[0])
# print(money)
# Alternative input directory (disabled):
# dir_path = r"D:\project\PaddleOCR_test\money"
# Directory that the loop at the end of this file walks for JSON files.
dir_path = r"D:\project\PaddleOCR_test\json_data"
# 调用小写金额函数
def get_money(money_1_pre):
    """Normalise a numeric amount with ``invoice.get_money``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if money_1_pre is None:
        return money_1_pre
    return invoice.get_money(money_1_pre)
# 调用开票日期
def get_fp_date(fp_date):
    """Normalise an invoice issue date with ``invoice.get_date``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if fp_date is None:
        return fp_date
    return invoice.get_date(fp_date)
# 调用发票号码
def get_fp_num(fp_num):
    """Normalise an invoice number with ``invoice.get_fp_number``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if fp_num is None:
        return fp_num
    return invoice.get_fp_number(fp_num)
# 调用发票代码
def get_fp_code(fp_code):
    """Normalise an invoice code with ``invoice.get_fp_code``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if fp_code is None:
        return fp_code
    return invoice.get_fp_code(fp_code)
# 调用买方、卖方名称
def get_mf_name(mf_name):
    """Normalise a buyer/seller name with ``invoice.get_mf_name``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if mf_name is None:
        return mf_name
    return invoice.get_mf_name(mf_name)
# 调用买方、卖方识别号
def get_mf_code(mf_code):
    """Normalise a taxpayer identification number with ``invoice.get_mf_code``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if mf_code is None:
        return mf_code
    return invoice.get_mf_code(mf_code)
# 得到买方、卖方开户行及账号
def get_mf_account(mf_account):
    """Normalise a "bank and account number" field with ``invoice.get_mf_account``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if mf_account is None:
        return mf_account
    return invoice.get_mf_account(mf_account)
# 得到买方、卖方地址与电话
def get_mf_infor(mf_infor):
    """Normalise an "address, telephone" field with ``invoice.get_mf_infor``.

    ``None`` is passed through unchanged so that missing fields stay missing.
    """
    if mf_infor is None:
        return mf_infor
    return invoice.get_mf_infor(mf_infor)
# 得到大写金额(即传入大写金额与小写的数字金额)
def get_bigmoney(fp_bigmoney, money_1_pre):
    """Return the capital-letter amount, rebuilding it from the numeric one if needed.

    Args:
        fp_bigmoney: recognised capital-letter amount text, or ``None``.
        money_1_pre: recognised numeric amount text, or ``None``.

    Returns:
        * ``fp_bigmoney`` is ``None``: the amount generated from
          ``money_1_pre``, or ``None`` when that is ``None`` as well.
        * otherwise the cleaned ``fp_bigmoney`` (surrounding whitespace and
          every character with a code point below 137 removed) when it
          contains only capital-amount characters, when ``money_1_pre`` is
          ``None``, or when the numeric amount normalises to ``0.00``;
        * otherwise the amount generated from ``money_1_pre``.
    """
    if fp_bigmoney is None:
        if money_1_pre is None:
            return fp_bigmoney
        money = invoice.get_money(money_1_pre).replace("¥", '')
        return invoice.get_num2money(money)

    # Dropping everything below code point 137 removes spaces, ASCII digits
    # and other ASCII noise in a single pass.
    cleaned = ''.join(ch for ch in fp_bigmoney.strip() if ord(ch) >= 137)

    # 佰, 仟 and 元 were missing, so genuine amounts were rejected as corrupt;
    # 百 and 阡 are still accepted so previously accepted text stays accepted.
    valid_chars = {
        "零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖",
        "分", "角", "圆", "元", "拾", "佰", "仟", "百", "阡",
        "万", "亿", "兆", "整",
    }
    looks_corrupt = any(ch not in valid_chars for ch in cleaned)
    if looks_corrupt and money_1_pre is not None:
        money = invoice.get_money(money_1_pre).replace("¥", '')
        # 0.00 means the numeric amount is unusable too, so keep the text.
        if money == "0.00":
            return cleaned
        return invoice.get_num2money(money)
    return cleaned
def _normalize_party(party):
    """Normalise the buyer or seller sub-dictionary; ``None`` stays ``None``."""
    if party is None:
        return None
    return {
        '名称': get_mf_name(party["名称"]),
        '纳税人识别号': get_mf_code(party["纳税人识别号"]),
        '地址、电话': get_mf_infor(party["地址、电话"]),
        '开户行及账号': get_mf_account(party['开户行及账号']),
    }


# Normalise every JSON file found under dir_path and print the result.
for parent, _dirnames, filenames in os.walk(dir_path):
    for filename in filenames:
        # Join with the directory currently being walked; joining with
        # dir_path produced wrong paths for files in sub-directories.
        file_path = os.path.join(parent, filename)
        print(file_path)
        with open(file_path, encoding="utf-8") as f:
            json_f = json.load(f)

        money_1_pre = json_f['小写金额']
        infor_dit = {
            '购买方': _normalize_party(json_f["购买方"]),
            '销售方': _normalize_party(json_f["销售方"]),
            '开票日期': get_fp_date(json_f["开票日期"]),
            '发票代码': get_fp_code(json_f["发票代码"]),
            '发票号码': get_fp_num(json_f["发票号码"]),
            '小写金额': get_money(money_1_pre),
            # The capital amount falls back to the numeric amount if needed.
            '大写金额': get_bigmoney(json_f["大写金额"], money_1_pre),
            '税前金额': get_money(json_f['税前金额']),
        }
        print("*" * 20)
        print(infor_dit)