前言:
python老大要求搞个excel固定模板导入… 不过已经给了工具包,基础语法熟悉后,好像也是拿来即用。
在此记录下demo,该程序支持2003,2007版本的excel解析,但因版本问题调整了xlrd代码,需要替换(否则可能出现错误:‘ElementTree’ object has no attribute ‘getiterator’)。仅自己学习使用,大佬勿喷。
1.准备阶段
1.我的python环境是装的 Anaconda3.9,自己从官网下即可。https://www.anaconda.com/。开发工具我使用的是 pycharm。
2.安装依赖
随便搞个记事本复制下面的内容,cmd 进入命令窗口, pip install -r 路径/xxx.txt
ps: 如果下载速度过慢,可能是没有替换为清华源,自己百度安装即可。
xlrd==1.2.0
pandas==1.3.4
openpyxl==3.0.9
toml==0.10.2
flask==1.1.2
flask_cors==3.0.10
安装完后,需要修改 xlrd的一些源代码。复制下面的路径,找到目录下的 xlrd
的xlsx.py
文件,全局替换 self.tree.getiterator():
为 self.tree.iter():
2.程序代码
- 核心工具类
新建 utils目录 和 read_excel.py 文件
# 读取excel数据
import sys
sys.path.append('../app')
sys.path.append('../pre')
# 该路径是我本地包的目录,根据实际路径修改
sys.path.append('../utils')
sys.path.append('../source')
import os.path
from utils import read_tools
import openpyxl
import xlrd
import json
import unicodedata
import re
import logging
# from app.tools import logging
# logging.info('--------- read_excel -----')
class ReadExcel():
def __init__(self):
self._sheet_name = []
self._wb = None # xlrd wb 对象
self._activesheet_name = ''
self.excel_version = ''
self.sheet = None
# 读取excel信息
def read(self, file_path):
if not os.path.exists(file_path):
raise FileExistsError
file_path = file_path.replace('\\', '/')
self.file_path = file_path
self.file_name = file_path.split('/')[-1]
self.file_dir = '/'.join(file_path.split('/')[:-1])
self.file_path = file_path
self._wb = xlrd.open_workbook(file_path)
if self.file_name.endswith('.xlsx') or self.file_name.endswith('.xlsm'):
self.excel_version = '2007'
elif self.file_name.endswith('.xls'):
self.excel_version = '2003'
self.get_sheet_hidden_info()
# 获取 sheet 页的隐藏情况,隐藏页不做分析。
def get_sheet_hidden_info(self):
self.sheet_hidden = {}
if self.excel_version == '2003':
tmp_wb = xlrd.open_workbook(self.file_path,
formatting_info=True)
for sheet_name in tmp_wb.sheet_names():
if tmp_wb.sheet_by_name(sheet_name).visibility == 1:
self.sheet_hidden[sheet_name] = True
else:
self.sheet_hidden[sheet_name] = False
elif self.excel_version == '2007':
tmp_wb = openpyxl.load_workbook(self.file_path)
for sheet_name in tmp_wb.sheetnames:
if tmp_wb[sheet_name].sheet_state == 'hidden':
self.sheet_hidden[sheet_name] = True
else:
self.sheet_hidden[sheet_name] = False
# 获取所有sheet页名称
def get_sheet_name(self):
if self._sheet_name == []:
self._sheet_name = self._wb.sheet_names()
return self._sheet_name
# 获取活动sheet页
def get_active_sheet_name(self):
def get_xlsx_active_sheet(file):
try:
workbook = openpyxl.load_workbook(file, data_only=True)
s = str(workbook.active) # '<Worksheet "报关单">'
return s[s.index('"') + 1:s.rindex('"')] # 报关单
except:
return ''
if self._activesheet_name == '':
if self.excel_version == '2007':
self._activesheet_name = get_xlsx_active_sheet(self.file_path)
else:
for sheet_name in self.get_sheet_name():
if self._wb.sheet_by_name(sheet_name).sheet_selected:
self._activesheet_name = sheet_name
break
return self._activesheet_name
# 获取 整个sheet 页内容, 字典结构
def get_sheet(self, sheet_name=''):
if sheet_name in self.get_sheet_name():
return read_tools.get_sheet_info_xls(self._wb, sheet_name)
else:
# 不存在的页,取默认页
return read_tools.get_sheet_info_xls(self._wb, self._worksheet_name)
# 设置工作页,以下所有操作都基于这个页
def set_worksheet(self, sheet_name=''):
if sheet_name and sheet_name in self.get_sheet_name():
self._worksheet_name = sheet_name
self._ws = self._wb.sheet_by_name(sheet_name)
else:
self._worksheet_name = self._activesheet_name
self._ws = self._wb.sheet_by_name(self._worksheet_name)
self.sheet = self.get_sheet()
# 获取工作页某个单元格内容
def cell(self, cell):
try:
if self._ws.cell(*cell).value != '':
# return read_tools.clean_str_zero(self._ws.cell_result(cell_result[0], cell_result[1]).value)
return read_tools.format_xlrd_type(self._ws.cell(*cell).value,
self._ws.cell(*cell).ctype)
except:
# 越界了,不需要处理
pass
return ''
def cell_type(self, cell):
try:
if self._ws.cell(*cell).value != '':
return read_tools.format_type_mapping[self._ws.cell(*cell).ctype]
except:
pass
return ''
def num_row(self):
return self._ws.nrows
def num_col(self):
return self._ws.ncols
# 如果一个sheet只有一张表格,那么可以简单的用这个返回结果。
def read_simple_table(self,
json_col_names=[],
# 需要做 json 转换的列名,类似 json.loads('{"split_char":"/()\n"}') -> {"split_char":"/()\n"}
add_limit={}, ## {'begin_cell':None, # 左上角
# 'end_cell':None}, # 又下角, 设定指定区域进行读取
with_blank_title=True, # 标题可为空,自动命名:#blank_col_前面一个节点
):
def get_title(self, table_position):
title = []
for col_j in range(table_position['begin_cell'][1], # 第一列
table_position['end_cell'][1]): # 读到最后一列
t = self.cell((table_position['begin_cell'][0], # 标题行
col_j)).strip()
if t:
title.append(t)
else:
title.append(f'#blank_{col_j}_{title[-1] if title else ""}')
return title
def read_row(self, row_i, title, table_position):
result_row = {}
for col_j in range(table_position['begin_cell'][1], # 第一列
table_position['end_cell'][1]): # 读到最后一列
col_title = title[col_j]
if col_title:
cell_text = self.cell((row_i, col_j))
if cell_text and col_title in json_col_names:
try:
result_row[col_title] = json.loads(cell_text.replace('\n', ' '))
except Exception as e:
logging.warning(f'{cell_text} json 解析错误')
else:
result_row[col_title] = self.cell((row_i, col_j))
return result_row
if add_limit and 'begin_cell' in add_limit and add_limit['begin_cell']:
begin_cell = add_limit['begin_cell']
else:
begin_cell = (0, 0)
if add_limit and 'end_cell' in add_limit and add_limit['end_cell']:
end_cell = (min(add_limit['end_cell'][0] + 1, self.num_row()), # 包含起止位置。
min(add_limit['end_cell'][1] + 1, self.num_col()))
else:
end_cell = (self.num_row(), self.num_col())
table_position = {'begin_cell': begin_cell, 'end_cell': end_cell}
table = []
title = get_title(self, table_position)
for row_i in range(table_position['begin_cell'][0] + 1, # 从标题下一行开始读
table_position['end_cell'][0]): # 直到结束,
result_row = read_row(self, row_i, title, table_position)
if result_row:
table.append(result_row)
return table
def main():
excel = ReadExcel()
# file_path = '../sample/xxx.xls'
# file_path = '../sample/格式.xlsx'
file_path = r'../file/xxx-17份-0520-1.xls'
try:
excel.read(file_path)
except FileExistsError:
return {'status': '-1', 'msg': f'文件不存在:{file_path}'}
# print(excel.get_sheet_name())
# print(excel.get_active_sheet_name())
# print(excel.get_sheet(excel.get_active_sheet_name()))
bigList = []
for sheet_name in excel.get_sheet_name():
excel.set_worksheet(sheet_name)
if re.match(re.compile(r'\d+$'), sheet_name) is not None:
bigList.append(list_example_one(excel, sheet_name))
elif re.match(re.compile(r'\d+\(\d+\)$'), sheet_name) is not None:
bigList.append(list_example_two(excel, sheet_name))
# print(excel.num_row())
# print(excel.num_col())
return
if __name__ == '__main__':
main()
utils下新建 read_tools.py
import logging
import xlrd
import pandas as pd
import datetime
# 清除字符串数字后的.0 # 解决xlrd 会将 9031492000 -> 9031492000.0 的bug
def clean_str_zero(t):
try:
float(t)
t = str(t)
if t.endswith('.0'):
return t[:-2]
return t
except:
pass
return str(t)
import opencc
cc_t2s = opencc.OpenCC('t2s') # 繁体转简体
# s = cc_t2s.convert('眾議長與李克強會談')
format_type_mapping={0:'empty',1:'string', 2:'number', 3:'date', 4:'boolean', 5:'error'}
def format_xlrd_type(t,ty):
# xlrd ctype: 0 empty,1 string, 2 number, 3 date, 4 boolean, 5 error
if ty==1:
t = str(t).replace(' ',' ').strip() # 异常字符 NSBP
return cc_t2s.convert(t)
if ty==2:
return clean_str_zero(t)
if ty==3:
return (datetime.datetime(1900,1,1) +datetime.timedelta(days=int(t)-2)).strftime('%Y/%m/%d') # 数字转换日期
if ty==4:
if t == 1:
r = 'True'
elif t==0:
r = 'False'
else:
r=str(t)
return r
return ''
def get_sheet_info_xls(workbook, sheet_name):
worksheet = workbook.sheet_by_name(sheet_name)
merge_cells = []
cell_texts = {}
cell_format_type={}
# 获取单元格内容信息
for i in range(min(worksheet.nrows,50000)):
for j in range(min(worksheet.ncols,200)):
try:
if worksheet.cell(i, j).value:
# cell_texts[(i, j)] = clean_str_zero(worksheet.cell_result(i, j).value)
if worksheet.cell(i, j).value != '':
cell_texts[(i, j)] = format_xlrd_type(worksheet.cell(i, j).value,
worksheet.cell(i, j).ctype)
cell_format_type[(i, j)] = format_type_mapping[worksheet.cell(i, j).ctype]
except:
logging.error(f'worksheet.cell({i}, {j}).value 不存在,报错')
# cell_format_type 根据需要再返回
return cell_texts
# # 获取单元格合并信息
# for item in worksheet.merged_cells:
# # print(item[0],item[2],item[1]-1,item[3]-1)
# merge_cells.append(((item[0], item[2]), (item[1] - 1, item[3] - 1)))
#
# merge_cells = sorted(merge_cells)
# return cell_texts, merge_cells
def get_sheet_info_xlsx(workbook, sheet_name):
worksheet = workbook[sheet_name]
merge_cells = []
cell_texts = {}
# 获取单元格内容信息
for i in range(worksheet.max_row):
for j in range(worksheet.max_column):
if worksheet.cell(i + 1, j + 1).value:
cell_texts[(i, j)] = worksheet.cell(i + 1, j + 1).value
# 获取单元格合并信息
for m_area in worksheet.merged_cells:
# 合并单元格的起始行坐标、终止行坐标
r1, r2, c1, c2 = m_area.min_row, m_area.max_row, m_area.min_col, m_area.max_col
merge_cells.append(((r1 - 1, c1 - 1), (r2 - 1, c2 - 1)))
merge_cells = sorted(merge_cells)
return cell_texts, merge_cells
def read_excel(file,sheet_name='',use_pandas=False):
if use_pandas:
df = pd.read_excel(file,sheet_name=sheet_name)
return df
else:
workbook = xlrd.open_workbook(file)
if sheet_name:
return workbook.sheet_by_name(sheet_name)
return workbook
if __name__ == '__main__':
pass