之前帮同事处理的excel文件,因为表头是重复向右延伸,导致有很多列其实含义一致,但又不方便存储;为了方便将该excel的数据直接入库,此处需要将excel中进行处理,删除不必要的标题行、转置excel的行列等等
文件结构示例:
处理前:
组织 | 时间1 | 时间2 | ...... | ||||||
人数 | 表头1 | 表头2 | ...... | 人数 | 表头1 | 表头2 | ...... | ...... | |
处理后:
组织 | 时间 | 人数 | 表头1 | 表头2 | ...... |
时间1 | |||||
时间2 | |||||
... |
代码如下,仅供参考:
import pandas
def xls_cont(path):
# 获取第一个sheet
sheet = pandas.read_excel(path, dtype='str', header=None)
return sheet
if __name__ == '__main__':
# 定义文件变量
filepath = 'D:/上报模板(原).xlsx'
aim_path = 'D:/上报模板.xlsx'
sheet = xls_cont(filepath)
# 获取行数列数
rows = sheet.shape[0]
cols = sheet.shape[1]
# 确定数据起始行列(此处用“人员身份”确定数据的起始位置)
for r in range(0, rows):
for c in range(0, cols):
if sheet.values[r, c] == '人员身份':
start_row = r
start_col = c
break
# 获取左侧固定列
new_col = ['人员身份', '人员类别', '大类', '一级组织', '二级组织', '三级组织', '细分人员类别', '费用类别', '指标分类',
'人数', '表头1', '表头2', '表头3', '表头4', '表头5', '表头6', '表头7', '合计1', '合计2']
for c in range(start_col, cols):
if sheet.values[start_row, c] == new_col[0]:
c_dazao = c
elif sheet.values[start_row, c] == new_col[1]:
c_leibie = c
elif sheet.values[start_row, c] == new_col[2]:
c_dalei = c
elif sheet.values[start_row, c] == new_col[3]:
c_zuzhi1 = c
elif sheet.values[start_row, c] == new_col[4]:
c_zuzhi2 = c
elif sheet.values[start_row, c] == new_col[5]:
c_zuzhi3 = c
elif sheet.values[start_row, c] == new_col[6]:
c_xifen = c
elif sheet.values[start_row, c] == new_col[7]:
c_feiyong = c
# 以人数为基准,找右侧数据列(列数有规律才这么弄)
c_renshu = []
for c in range(start_col, cols):
if sheet.values[start_row + 1, c] == '人数':
c_renshu.append(c)
# 开始获取并梳理数据结构
new_list = []
for i_renshu in c_renshu:
for i_r in range(start_row + 2, rows):
xifen = str(sheet.values[i_r, c_xifen])
if xifen != '' and xifen != 'nan':
i_new_list = [sheet.values[i_r, c_dazao], sheet.values[i_r, c_leibie], sheet.values[i_r, c_dalei],
sheet.values[i_r, c_zuzhi1], sheet.values[i_r, c_zuzhi2], sheet.values[i_r, c_zuzhi3],
sheet.values[i_r, c_xifen], sheet.values[i_r, c_feiyong],
sheet.values[start_row, i_renshu],
sheet.values[i_r, i_renshu], sheet.values[i_r, i_renshu + 1],
sheet.values[i_r, i_renshu + 2], sheet.values[i_r, i_renshu + 3],
sheet.values[i_r, i_renshu + 4], sheet.values[i_r, i_renshu + 5],
sheet.values[i_r, i_renshu + 6], sheet.values[i_r, i_renshu + 7],
sheet.values[i_r, i_renshu + 8], sheet.values[i_r, i_renshu + 9]]
new_list.append(i_new_list)
# 数据放入DataFrame
new_sheet = pandas.DataFrame(new_list, columns=new_col)
print(new_sheet)
# 保存文件
new_sheet.to_excel(aim_path)