1. tesseract工具配合pytesseract库, 可以实现Python文字识别(OCR)功能.
tesseract需从其官方项目tesseract-ocr获取并安装, pytesseract可使用pip安装(pip install pytesseract).
2. 阅读pytesseract源码, 发现两个functools中的函数wraps和partial: wraps用于在编写装饰器(包裹器)时, 把原函数的__name__、__doc__等元信息复制到包裹函数上, 使包裹函数看起来与原函数一致; partial用于固定函数的部分参数, 返回一个新的可调用对象, 相当于为这些参数预设了值. 具体功能参考博文https://blog.csdn.net/YiJie__ShuSheng/article/details/105447203
3. 一个简易的excel表格合并类, 假定第一行为表头, 数据表格结构相同.
#myexcelcombine.py
import os
import xlrd, xlwt
class MyExcelCombine:
    """Merge the first sheet of every Excel workbook in a directory.

    The first row of each workbook is treated as the header row. The header
    of the first non-empty workbook is reused for the merged output, and the
    remaining rows of every workbook are concatenated below it.
    """

    # Upper bound, in characters, applied to computed column widths.
    # NOTE(review): 255 is Excel's documented maximum column width; 256 * 255
    # also stays within a 16-bit width field -- confirm against xlwt docs.
    _MAX_COLUMN_CHARS = 255

    def __init__(self, filepath):
        """Remember the directory that holds the workbooks to merge."""
        self.filepath = filepath

    def data_extract(self):
        """Collect header and data rows from every workbook in the directory.

        Returns:
            A ``(data, headers)`` tuple, where ``headers`` is the first row of
            the first non-empty workbook and ``data`` is the list of all
            remaining rows. Returns ``None`` (after printing a message) when
            a workbook cannot be opened.
        """
        # Sorted so that the workbook supplying the header is deterministic.
        names = sorted(os.listdir(self.filepath))
        excel_files = [
            name for name in names
            if name.lower().endswith(('.xls', '.xlsx'))
        ]
        data = []
        headers = []
        for file in excel_files:
            print(file)
            try:
                workbook = xlrd.open_workbook(os.path.join(self.filepath, file))
                sheet = workbook.sheet_by_index(0)
            except Exception:
                # Deliberately broad: any failure to open or read the workbook
                # aborts the merge. Unlike a bare ``except`` this still lets
                # KeyboardInterrupt / SystemExit propagate.
                print(file + '有问题, 请及时检查!')
                return None
            values = [[cell.value for cell in row] for row in sheet.get_rows()]
            if not values:
                # An empty sheet has neither header nor data; skip it.
                continue
            if not headers:
                headers = values[0]
            data.extend(values[1:])
        return data, headers

    # Count Chinese and other characters and return the display width.
    @staticmethod
    def check_chinese(strings):
        """Return the display width of ``strings``.

        Characters in the range U+4E00..U+9FFF count as two columns, every
        other character counts as one.
        """
        wide = sum(1 for character in strings if u'\u4e00' <= character <= u'\u9fff')
        return wide * 2 + (len(strings) - wide)

    @classmethod
    def _column_widths(cls, headers, data):
        """Return the width in characters needed by each output column."""
        widths = []
        for row in [headers] + list(data):
            for index, value in enumerate(row):
                needed = min(cls.check_chinese(str(value)), cls._MAX_COLUMN_CHARS)
                if index == len(widths):
                    widths.append(needed)
                elif needed > widths[index]:
                    widths[index] = needed
        return widths

    @staticmethod
    def _build_style(alignment, point_size, bold):
        """Return an xlwt cell style using the given alignment and font settings."""
        font = xlwt.Font()
        font.name = '宋体'
        font.height = 20 * point_size  # height is set as 20 * point size
        font.bold = bold
        style = xlwt.XFStyle()
        style.num_format_str = 'general'
        style.font = font
        style.alignment = alignment
        return style

    def write2excel(self):
        """Write the merged rows to ``<filepath>/输出结果/合并结果.xls``.

        Prints a message and returns without writing anything when extraction
        failed, when there is nothing to merge, or when an old output file
        cannot be removed.
        """
        extracted = self.data_extract()
        if extracted is None:
            # data_extract already reported which workbook is broken.
            return
        data, headers = extracted
        if not headers and not data:
            print('没有可合并的数据, 请检查目录!')
            return

        path = os.path.join(self.filepath, '输出结果')
        os.makedirs(path, exist_ok=True)
        try:
            # Clear previous results; only regular files are removed.
            for file in os.listdir(path):
                old_file = os.path.join(path, file)
                if os.path.isfile(old_file):
                    os.remove(old_file)
        except PermissionError as e:
            print(str(e) + "无法删除旧文件, 请关闭后重试!")
            return

        # Both styles share one centered alignment.
        align = xlwt.Alignment()
        align.vert = xlwt.Alignment.VERT_CENTER
        align.horz = xlwt.Alignment.HORZ_CENTER
        title_style = self._build_style(align, 12, True)
        data_style = self._build_style(align, 10, False)

        new_excel = xlwt.Workbook()
        sheet = new_excel.add_sheet('sheet1', cell_overwrite_ok=True)
        for column_index, item in enumerate(headers):
            sheet.write(0, column_index, item, style=title_style)
        # Data starts directly below the header (row index 1) so the output
        # has the same layout that data_extract expects of its inputs.
        for row_index, item in enumerate(data, start=1):
            for column_index, value in enumerate(item):
                sheet.write(row_index, column_index, value, style=data_style)
        # Column widths are derived from the longest header/data value; this
        # is an approximation, not an exact measurement.
        for column_index, chars in enumerate(self._column_widths(headers, data)):
            sheet.col(column_index).width = 256 * chars
        new_excel.save(os.path.join(path, '合并结果.xls'))
        # os.startfile only exists on Windows; skip it elsewhere.
        if hasattr(os, 'startfile'):
            os.startfile(path)
if __name__ == '__main__':
    import sys

    # Directory to merge: the first command-line argument when given,
    # otherwise the original hard-coded default.
    path = sys.argv[1] if len(sys.argv) > 1 else r'C:\Users\10706\Desktop\excel_dir'
    f = MyExcelCombine(path)
    f.write2excel()