PyQt5使用笔记(四) 离线OCR工具tesseract | Excel文档合并 2021.03.17

最新推荐文章于 2024-06-25 13:37:23 发布

天地之心online

最新推荐文章于 2024-06-25 13:37:23 发布

阅读量487

点赞数

文章标签： python excel pycharm

本文链接：https://blog.csdn.net/qq_17246289/article/details/114952760

版权

1. tesseract工具, 加上pytesseract库,可以实现python文字识别功能

tesseract 官方网页 tesseract-ocr , pytesseract可使用pip安装

2. 阅读pytesseract源码, 发现其中用到了functools中的两个函数wraps和partial: wraps用于在编写装饰器(包装函数)时, 将原函数的__name__、__doc__等元信息复制到包装函数上, 使包装后的函数看起来仍是原函数; partial用于预先固定函数的部分参数, 并返回一个新的可调用对象. 具体功能参考博文https://blog.csdn.net/YiJie__ShuSheng/article/details/105447203

3. 一个简易的excel表格合并类: 读取指定目录下每个excel文件的第一个工作表, 假定第一行为表头, 且各文件的表格结构(列的顺序与含义)相同.

#myexcelcombine.py

import os
import xlrd, xlwt



class MyExcelCombine:
    """Merge the first sheet of every Excel workbook in a directory into one .xls file.

    Every workbook is assumed to share the same layout, with its first row
    being the header. The header of the first non-empty workbook is used as
    the header of the merged sheet; all remaining rows of every workbook are
    concatenated below it.
    """

    # Sub-directory (created inside ``filepath``) and file name of the result.
    _OUTPUT_DIR = '输出结果'
    _OUTPUT_FILE = '合并结果.xls'
    # Row index at which data rows start. The header is written to row 0, so
    # this leaves row 1 blank.
    # NOTE(review): the blank row may be an off-by-one (1 would put data right
    # under the header); the existing layout is kept -- confirm intent.
    _DATA_START_ROW = 2
    # Upper clamp for a column width, expressed in 1/256 character units.
    # NOTE(review): assumes xlwt stores the width as an unsigned 16-bit
    # value -- confirm against the xlwt documentation.
    _MAX_COL_WIDTH = 65535

    def __init__(self, filepath):
        """Remember the directory that holds the workbooks to merge."""
        self.filepath = filepath

    def data_extract(self):
        """Read header and data rows from every workbook in ``self.filepath``.

        Files are processed in sorted name order so the result is
        deterministic. Excel lock files (``~$...``) are ignored and empty
        sheets are skipped.

        Returns:
            A ``(data, headers)`` tuple where ``data`` is a list of row value
            lists and ``headers`` is the first row of the first non-empty
            sheet, or ``None`` if any workbook could not be opened (a message
            naming the file is printed in that case).
        """
        names = sorted(
            name for name in os.listdir(self.filepath)
            if name.lower().endswith(('.xls', '.xlsx'))
            # "~$name.xlsx" is the lock file Excel creates while a workbook is
            # open; it is not a readable workbook.
            and not name.startswith('~$')
        )
        data = []
        headers = []
        for name in names:
            print(name)
            try:
                workbook = xlrd.open_workbook(os.path.join(self.filepath, name))
                sheet = workbook.sheet_by_index(0)
            except Exception:
                # Any unreadable workbook aborts the merge so that a partial
                # result is never produced silently.
                print(name + '有问题, 请及时检查!')
                return None

            rows = [[cell.value for cell in row] for row in sheet.get_rows()]
            if not rows:
                # Empty sheet: nothing to contribute, not even a header.
                continue
            if not headers:
                headers = rows[0]
            data.extend(rows[1:])
        return data, headers

    # Count Chinese and other characters and return the display width.
    @staticmethod
    def check_chinese(strings):
        """Return the display width of ``strings``.

        Characters in the range U+4E00..U+9FFF count as two columns, every
        other character counts as one.
        """
        chinese = sum(1 for character in strings if u'\u4e00' <= character <= u'\u9fff')
        return chinese * 2 + (len(strings) - chinese)

    @staticmethod
    def _column_widths(headers, data):
        """Return the widest display width found in each column.

        Both the header row and all data rows are measured, and rows of
        differing lengths are tolerated.
        """
        widths = []
        for row in [headers] + list(data):
            for index, value in enumerate(row):
                length = MyExcelCombine.check_chinese(str(value))
                if index < len(widths):
                    if length > widths[index]:
                        widths[index] = length
                else:
                    widths.append(length)
        return widths

    @staticmethod
    def _make_style(align, points, bold):
        """Build an xlwt cell style using the 宋体 font at ``points`` pt."""
        style = xlwt.XFStyle()
        style.num_format_str = 'general'
        font = xlwt.Font()
        font.name = '宋体'
        font.height = 20 * points  # xlwt font height is in 1/20 pt
        font.bold = bold
        style.font = font
        style.alignment = align
        return style

    def write2excel(self):
        """Merge all workbooks and save the result as an .xls file.

        The result is written to ``<filepath>/输出结果/合并结果.xls``; files
        already present in that directory are deleted first. On Windows the
        output directory is opened in the file explorer afterwards.

        Returns ``None`` in every case. Nothing is written when extraction
        failed, when there is nothing to merge, or when an old result file
        cannot be deleted.
        """
        extracted = self.data_extract()
        if extracted is None:
            # data_extract already reported which file is broken.
            return
        data, headers = extracted
        if not data and not headers:
            print('没有可合并的数据!')
            return

        path = os.path.join(self.filepath, self._OUTPUT_DIR)
        os.makedirs(path, exist_ok=True)
        try:
            for old in os.listdir(path):
                os.remove(os.path.join(path, old))
        except PermissionError as e:
            print(str(e) + "无法删除旧文件, 请关闭后重试!")
            return

        # Column widths follow the longest cell (header included); this is
        # only an approximation of the rendered width.
        width = self._column_widths(headers, data)

        # Shared alignment: centred both vertically and horizontally.
        align = xlwt.Alignment()
        align.vert = align.VERT_CENTER
        align.horz = align.HORZ_CENTER
        title_style = self._make_style(align, 12, True)
        data_style = self._make_style(align, 10, False)

        new_excel = xlwt.Workbook()
        sheet = new_excel.add_sheet('sheet1', cell_overwrite_ok=True)
        name = os.path.join(path, self._OUTPUT_FILE)
        for col, item in enumerate(headers):
            sheet.write(0, col, item, style=title_style)

        for offset, item in enumerate(data):
            for col, value in enumerate(item):
                sheet.write(offset + self._DATA_START_ROW, col, value, style=data_style)

        for col, chars in enumerate(width):
            # Width unit is 1/256 of a character; clamp to the assumed limit.
            sheet.col(col).width = min(256 * chars, self._MAX_COL_WIDTH)

        new_excel.save(name)
        # os.startfile only exists on Windows.
        if hasattr(os, 'startfile'):
            os.startfile(path)

if __name__ == '__main__':
    import sys

    # Directory to merge: the first command-line argument when given,
    # otherwise the original hard-coded default.
    path = sys.argv[1] if len(sys.argv) > 1 else r'C:\Users\10706\Desktop\excel_dir'
    f = MyExcelCombine(path)
    f.write2excel()