Python实现增值税发票OCR(带源码)

egowell

已于 2022-09-02 13:31:53 修改

阅读量5.3k

点赞数 4

分类专栏： OCR 文章标签：深度学习 python 人工智能

于 2022-08-31 16:30:47 首次发布

本文链接：https://blog.csdn.net/egowell/article/details/126626760

版权

OCR 专栏收录该内容

2 篇文章 4 订阅

订阅专栏

该博客介绍了一个基于Python实现的增值税发票自动识别系统。首先，通过图像预处理，包括视角变换和裁剪，将图片转换为正视图。接着，利用边缘检测和轮廓识别确定发票边界并进行透视变换。然后，根据已知位置截取关键信息区域进行文字识别，采用cnocr库进行文字识别。最后，将识别到的信息保存到Excel文件中。整个流程涉及图像处理、计算机视觉和自然语言处理技术。

摘要由CSDN通过智能技术生成

发票识别日常生活中经常能用到，之前浏览博客发现类似的文章，但源码只给了一小部分，所以决定自己来实现。

1.原始图片视角变换及裁剪：

现实中，我们拍照不可能像扫描那样端正，图片中的票据总会有这样那样的视角上的歪斜，使用这样的图片直接来识别极易出错，为了提高识别的准确性，我们需要先将图片预处理为扫描样式的正视图，并将不属于票据的图片部分剪切掉。

针对这一过程，参考相关资料，本项目先进行“整体图片边缘检测”（采用cv2.Canny函数），再进行“票据轮廓检测”（采用cv2.findContours函数），得到所需处理票据的轮廓，之后确定票据的四个顶点，最后使用cv2.getPerspectiveTransform和cv2.warpPerspective对原始图片进行透视变换，得到相应票据的正视图。

2.根据已知位置识别相应文字：

作为第一个版本，简单起见，这里预先确定了发票代码、销售方名称、备注等感兴趣条目在正视图中的位置，直接在上述正视图图片中截取对应区域，得到各条目的图片，再通过cnocr库对截取到的区域图片进行文字识别。

3.将识别到的信息保存到相应excel文件中：

	import cv2
	import numpy as np
	from cnocr import CnOcr
	import pandas as pd
	from pandas import DataFrame
	import os

	# Size (in pixels) of the invoice image generated later on, chosen to match
	# the standard VAT invoice layout of 240mm x 140mm.
	height_resize = 1400
	width_resize = 2400

	# Resize the original image
	def resizeImg(image, height=height_resize):
	    """Scale ``image`` to ``height`` pixels tall, keeping its aspect ratio.

	    The scale factor is ``height / image.shape[0]``; the new width is the
	    original width multiplied by that factor and truncated to an int.
	    Returns the result of ``cv2.resize``.
	    """
	    src_h, src_w = image.shape[:2]
	    scale = height / src_h
	    return cv2.resize(image, (int(src_w * scale), int(height)))

	# Edge detection
	def getCanny(image):
	    """Return a dilated Canny edge map of ``image``.

	    The image is first smoothed with a 3x3 Gaussian blur, then passed to
	    ``cv2.Canny`` with thresholds 60 and 240, and finally dilated once
	    with a 3x3 kernel.
	    """
	    blurred = cv2.GaussianBlur(image, (3, 3), 2, 2)
	    edges = cv2.Canny(blurred, 60, 240, apertureSize=3)
	    # Dilate so that the detected edges are as closed as possible.
	    return cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=1)

	# Find the contour with the largest area
	def findMaxContour(image):
	    """Return ``(max_contour, max_area)`` for the external contours of ``image``.

	    ``image`` is the binary edge map passed to ``cv2.findContours``. If no
	    contour has an area greater than zero, ``([], 0.0)`` is returned.
	    """
	    # cv2.findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
	    # (image, contours, hierarchy) in 3.x; the contours are always the
	    # second-to-last element, so index from the end.
	    contours = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
	    max_area = 0.0
	    max_contour = []
	    for contour in contours:
	        area = cv2.contourArea(contour)
	        if area > max_area:
	            max_area, max_contour = area, contour
	    return max_contour, max_area

	# Fit a polygon to the convex hull to get its vertices
	def getBoxPoint(contour):
	    """Approximate the convex hull of ``contour`` with a polygon.

	    The approximation tolerance is 2% of the contour's closed arc length.
	    Returns the polygon vertices as an array with one ``(x, y)`` row per
	    vertex. The caller treats these as the four corners of the invoice;
	    this function itself does not enforce that exactly four are found.
	    """
	    tolerance = cv2.arcLength(contour, True) * 0.02
	    vertices = cv2.approxPolyDP(cv2.convexHull(contour), tolerance, True)
	    # Drop the middle axis of the array returned by approxPolyDP.
	    return vertices.reshape(-1, 2)

	# Map the quadrilateral points back onto the original image
	def adapPoint(box, pro):
	    """Divide the points in ``box`` by the ratio ``pro`` and truncate them.

	    When ``pro`` equals 1.0 the division is skipped. The result is passed
	    through ``np.trunc`` so the coordinates have no fractional part.
	    """
	    scaled = box if pro == 1.0 else box / pro
	    return np.trunc(scaled)

	# Order quadrilateral vertices as [top-left, top-right, bottom-right, bottom-left]
	def orderPoints(pts):
	    """Return the corners in ``pts`` ordered TL, TR, BR, BL as float32.

	    The top-left corner has the smallest ``x + y`` and the bottom-right
	    the largest; the top-right has the smallest ``y - x`` and the
	    bottom-left the largest. The result always has shape ``(4, 2)``.
	    """
	    coord_sum = pts.sum(axis=1)
	    coord_diff = np.diff(pts, axis=1).ravel()
	    corner_idx = [
	        np.argmin(coord_sum),   # top-left
	        np.argmin(coord_diff),  # top-right
	        np.argmax(coord_sum),   # bottom-right
	        np.argmax(coord_diff),  # bottom-left
	    ]
	    ordered = np.zeros((4, 2), dtype="float32")
	    ordered[:] = pts[corner_idx]
	    return ordered

	# Compute a side length
	def pointDistance(a, b):
	    """Return the Euclidean distance between ``a`` and ``b``, truncated to int."""
	    delta = a - b
	    squared_length = np.sum(delta * delta)
	    return int(np.sqrt(squared_length))

	# Perspective transform
	def warpImage(image, box):
	    """Warp the quadrilateral ``box`` of ``image`` into an upright rectangle.

	    ``box`` holds the corners in the order top-left, top-right,
	    bottom-right, bottom-left. The output width is the length of the
	    top edge and the output height is the length of the right edge.
	    """
	    width = pointDistance(box[0], box[1])
	    height = pointDistance(box[1], box[2])
	    right, bottom = width - 1, height - 1
	    target = np.array(
	        [[0, 0], [right, 0], [right, bottom], [0, bottom]],
	        dtype='float32',
	    )
	    matrix = cv2.getPerspectiveTransform(box, target)
	    return cv2.warpPerspective(image, matrix, (width, height))

	# Full image pre-processing pipeline
	def imagePreProcessing(path):
	    """Load the photo at ``path`` and return a front-view invoice image.

	    Steps: resize a working copy, detect edges, take the largest contour,
	    approximate its corners, scale the corners back to the original image,
	    order them, apply a perspective transform to the original image and
	    resize the result to ``width_resize`` x ``height_resize``.

	    Raises:
	        FileNotFoundError: if ``cv2.imread`` cannot read ``path``.
	        ValueError: if no contour is found in the image.
	    """
	    image = cv2.imread(path)
	    if image is None:
	        # cv2.imread signals failure by returning None instead of raising.
	        raise FileNotFoundError('cannot read image: {!r}'.format(path))
	    # Edge detection and contour search run on a resized copy; `ratio`
	    # maps the corners found there back onto the original image.
	    ratio = height_resize / image.shape[0]
	    binary_img = getCanny(resizeImg(image))
	    max_contour, _ = findMaxContour(binary_img)
	    if len(max_contour) == 0:
	        raise ValueError('no invoice contour found in image: {!r}'.format(path))
	    # NOTE(review): the corner ordering below presumably expects exactly
	    # four vertices from getBoxPoint; that is not verified here.
	    boxes = orderPoints(adapPoint(getBoxPoint(max_contour), ratio))
	    # Perspective transform on the full-resolution original.
	    warped = warpImage(image, boxes)
	    # Resize to the final fixed size.
	    size = (width_resize, height_resize)
	    return cv2.resize(warped, size, interpolation=cv2.INTER_CUBIC)

	# Crop a region of the image -- test-phase helper that also shows and saves
	# the crop; normal runs use the plain cropImage function instead.
	def cropImage_test(img, crop_range, filename='Undefined'):
	    """Crop ``crop_range`` out of ``img``, display it, save it and return it.

	    ``crop_range`` is ``(xpos, ypos, width, height)``. When ``filename`` is
	    left at ``'Undefined'`` a name is built from the coordinates.
	    """
	    x, y, w, h = crop_range
	    region = img[y:y + h, x:x + w]
	    if filename == 'Undefined':
	        # No file name given: derive one from the coordinates.
	        filename = 'crop-{}-{}-{}-{}.jpg'.format(x, y, w, h)
	    # Show the cropped region (testing only).
	    cv2.imshow(filename, region)
	    # cv2.imwrite garbles file names containing Chinese characters, so the
	    # image is encoded in memory and written out with tofile instead.
	    _, encoded = cv2.imencode('.jpg', region)
	    encoded.tofile(filename)
	    return region

	# Crop a region of the image
	def cropImage(img, crop_range):
	    """Return the sub-image of ``img`` described by ``crop_range``.

	    ``crop_range`` is ``(xpos, ypos, width, height)``; rows are selected
	    by ``ypos``/``height`` and columns by ``xpos``/``width``.
	    """
	    left, top, span_x, span_y = crop_range
	    rows = slice(top, top + span_y)
	    cols = slice(left, left + span_x)
	    return img[rows, cols]

	# Recognize the text in a cropped image
	def cropOCR(crop, ocrType):
	    """Run single-line OCR on ``crop`` and return the recognized text.

	    ``ocrType`` selects the recognizer: 0 uses the global ``ocr``, 1 uses
	    ``ocr_numbers`` and 2 uses ``ocr_UpperSerial``. These globals are
	    looked up at call time, so they must exist before this is called.

	    Raises:
	        ValueError: if ``ocrType`` is not 0, 1 or 2.
	    """
	    if ocrType == 0:
	        recognizer = ocr
	    elif ocrType == 1:
	        recognizer = ocr_numbers
	    elif ocrType == 2:
	        recognizer = ocr_UpperSerial
	    else:
	        raise ValueError('unknown ocrType: {!r} (expected 0, 1 or 2)'.format(ocrType))
	    # NOTE(review): this assumes ocr_for_single_line returns a sequence of
	    # strings; confirm against the installed cnocr version.
	    return ''.join(recognizer.ocr_for_single_line(crop))


	if __name__ == '__main__':
	    # Instantiate CnOcr objects for the different kinds of text. They are
	    # kept as module-level names because cropOCR reads them as globals.
	    ocr = CnOcr(name='')  # mixed characters
	    ocr_numbers = CnOcr(name='numbers', cand_alphabet='0123456789')  # digits only
	    # Serial numbers: digits and upper-case letters (without the letter O).
	    ocr_UpperSerial = CnOcr(name='UpperSerial', cand_alphabet='0123456789ABCDEFGHIJKLMNPQRSTUVWXYZ')

	    # Names of the fields cropped from the image (also the Excel column names).
	    crop_range_list_name = ['发票代码', '发票号码', '开票日期',
	                            '校验码', '销售方名称', '销售方纳税人识别号',
	                            '销售方地址电话', '销售方开户行及账号', '价税合计',
	                            '备注']

	    # Crop rectangle of each field as [xpos, ypos, width, height].
	    crop_range_list_data = [[1870, 40, 380, 38], [1867, 104, 380, 38], [1866, 166, 380, 50],
	                            [1867, 230, 450, 50], [421, 1046, 933, 46], [419, 1091, 933, 48],
	                            [420, 1145, 933, 47], [421, 1193, 933, 40], [1892, 976, 414, 48],
	                            [1455, 1045, 325, 38]]

	    # OCR type used for each field: 0 = mixed characters, 1 = digits only,
	    # 2 = serial number.
	    crop_range_list_type = [1, 1, 0,
	                            1, 0, 2,
	                            0, 0, 0,
	                            0]

	    # Pre-process the image.
	    path = 'test.jpg'
	    warped = imagePreProcessing(path)

	    # Save the pre-processed image (for testing).
	    cv2.imwrite('result.jpg', warped)

	    # Recognize every field and collect the results in text_ocr.
	    text_ocr = []
	    for field_name, crop_range, ocr_type in zip(crop_range_list_name,
	                                                crop_range_list_data,
	                                                crop_range_list_type):
	        crop = cropImage(warped, crop_range)
	        # Invoices contain no lower-case letter o, so every o is replaced by 0.
	        crop_text = cropOCR(crop, ocr_type).replace('o', '0')
	        print(field_name, ':', crop_text)
	        text_ocr.append(crop_text)

	    # Results are stored per year and month; derive the .xlsx file name
	    # from the recognized invoice date.
	    date_parts = text_ocr[2].split('年')
	    if len(date_parts) < 2:
	        raise ValueError('cannot parse invoice date from OCR text: {!r}'.format(text_ocr[2]))
	    year_num = date_parts[0]
	    month_num = date_parts[1].split('月')[0]
	    filename = year_num+'-'+month_num+'.xlsx'

	    # Create the file with only the header row if it does not exist yet.
	    if not os.path.exists(filename):
	        pd.DataFrame(columns=crop_range_list_name).to_excel(filename, index=False)

	    # Read every column as text so codes and numbers are not turned into
	    # numbers (which would drop leading zeros when the file is re-saved).
	    data = pd.read_excel(filename, dtype=str)
	    # Compare invoice numbers as text with leading zeros ignored: this needs
	    # no int() conversion (which fails on non-digit OCR output) and still
	    # matches rows that an earlier run stored as plain numbers.
	    known_numbers = {str(value).lstrip('0') for value in data['发票号码'].dropna()}
	    if text_ocr[1].lstrip('0') not in known_numbers:
	        data.loc[data.shape[0]] = text_ocr
	        data.to_excel(filename, index=False, header=True)
	    else:
	        print(path,'is already in',filename,'!')

	    cv2.waitKey(0)