Python脚本 - 提取Excel中的某些列并生成新文件

Lizzy_Fly

已于 2022-07-06 19:22:13 修改

阅读量4.8k

点赞数 3

分类专栏： Python

于 2022-04-28 17:11:03 首次发布

本文链接：https://blog.csdn.net/Lizzy_Fly/article/details/124478706

版权

python

Python 专栏收录该内容

27 篇文章 1 订阅

订阅专栏

Excel文件中包含很多列，但只需要提取某几列并保存成新文件，以下是脚本代码：

# -*- coding: utf-8 -*-
# @Software: PyCharm
# @PythonVersion: Python3.7
# @Purpose: Extract selected columns from one Excel worksheet and save them as a new workbook
import openpyxl
import os
import sys
import datetime

# ---------------- settings to adjust before running: start ----------------
# Full path of the source workbook.
origin_file_path = r'D:\gdx.xlsx'
# Name of the worksheet to read inside the source workbook.
origin_excel_sheetname = '【Weekly】'
# Header names (values in the first row) of the columns to extract.
# The output workbook lists its columns in this order.
need_column = [
    '工单号',
    '产品',
    '结单组ID',
    'uid',
    '工单新建时间',
    '工单解决时间',
    '工单处理时长',
    '工单状态',
    '是否专标',
]

# Alternative column set, listed in the order used by the source file:
# need_column = ['工单号','uid','工单创建时间', '工单产品名称','工单状态','关单时间','处理组']

# Timestamp of this run, used as the output file name prefix. The format puts
# a full-width colon between hour and minute (presumably because a plain ':'
# is not allowed in Windows file names -- confirm before changing it).
_started_at = datetime.datetime.now()
nowTime = _started_at.strftime('%Y-%m-%d %H：%M')
# Directory and file name for the generated workbook.
base_dir = 'D:/'
excel_name = nowTime + '_gdx.xlsx'
# ---------------- settings to adjust before running: end ----------------

# Stop right away when the source workbook does not exist, before any attempt
# to load it.
if not os.path.exists(origin_file_path):
    # Report the missing file's path; the message previously interpolated the
    # worksheet name, which said nothing about which file was missing.
    print('原始文件：{},不存在，退出！'.format(origin_file_path))
    sys.exit(1)
# ---------------- load the Excel data: start ----------------
# Open the source workbook.
wb = openpyxl.load_workbook(origin_file_path)
# Fetch the worksheet by name. Indexing the workbook with an unknown name
# raises KeyError, so list the sheets that do exist and exit with status 1,
# mirroring how a missing source file is handled.
if origin_excel_sheetname not in wb.sheetnames:
    print('工作表：{},不存在，可用的工作表：{}，退出！'.format(
        origin_excel_sheetname, wb.sheetnames))
    sys.exit(1)
sheet = wb[origin_excel_sheetname]

# Highest row index reported by the worksheet.
row_max = sheet.max_row
# Highest column index reported by the worksheet; used to walk the header row.
clo_max = sheet.max_column
# ---------------- load the Excel data: end ----------------


# ---------------- read the header row: start ----------------
# Read row 1 cell by cell. first_row[k] always describes sheet column k + 1:
# later code converts a header's list position back into a column number, so
# every column must contribute exactly one entry, even when its header is
# empty or repeats an earlier one.
first_row = []
for column_number in range(1, clo_max + 1):
    row_value = sheet.cell(row=1, column=column_number).value
    if row_value is not None and row_value in first_row:
        # Header names must be unique; the first occurrence keeps the name.
        # The repeated column is stored as None so it can never match a
        # wanted header while the positions of later columns stay correct.
        print('首行存在重复的字段，重复字段名为：{}'.format(row_value))
        first_row.append(None)
    else:
        # Empty header cells (None) are kept as-is and are not reported as
        # duplicates of each other.
        first_row.append(row_value)
# ---------------- read the header row: end ----------------

# ---------------- locate the wanted columns: start ----------------
# need_column_index: 1-based sheet column numbers of the wanted headers, in
#                    the order they appear in the sheet.
# sort_index_list:   for each entry above, the 0-based position of that header
#                    in need_column, i.e. its column position in the output.
need_column_index = []
sort_index_list = []
for position, header in enumerate(first_row):
    if header in need_column:
        sort_index_list.append(need_column.index(header))
        # Sheet columns are 1-based while list positions are 0-based.
        need_column_index.append(position + 1)

# Wanted headers that are absent from the sheet would otherwise be dropped
# without any notice; report them so a misspelled header name is noticed.
missing_columns = [name for name in need_column if name not in first_row]
if missing_columns:
    print('原始文件中缺少以下字段：{}'.format(missing_columns))
# ---------------- locate the wanted columns: end ----------------

# Map each wanted column's 1-based number in the source sheet to its 0-based
# position in the output row, for example:
#     {13: 0, 2: 3}
sort_map = {
    source_column: target_position
    for source_column, target_position in zip(need_column_index, sort_index_list)
}

# ---------------- collect the wanted column data: start ----------------
# Build one list per data row holding only the wanted columns, arranged in
# need_column order, for example:
#     [
#         [124, 123],  # first data row
#         [123, 123],  # second data row
#     ]
result = []
# Positions stored in sort_map index into need_column, so each row buffer must
# be as long as need_column. Sizing it by the number of columns actually found
# raised IndexError whenever a wanted column was missing from the sheet.
output_width = len(need_column)
# With no wanted column in the sheet there is nothing to collect.
if sort_map:
    # min_row=2 skips the header row.
    for row_cells in sheet.iter_rows(min_row=2):
        # Columns that are missing from the sheet stay None (an empty cell).
        row_values = [None] * output_width
        for cell in row_cells:
            target_position = sort_map.get(cell.column)
            # Position 0 is valid, so compare against None explicitly.
            if target_position is not None:
                row_values[target_position] = cell.value
        result.append(row_values)
# ---------------- collect the wanted column data: end ----------------

# ---------------- write the data to a new workbook: start ----------------
# Create the output workbook and name its active sheet after the source sheet.
aim_wb = openpyxl.Workbook()
sheet1 = aim_wb.active
sheet1.title = origin_excel_sheetname
# The header row is the list of wanted column names.
row0 = need_column
# Append the header first, then every collected data row in order.
for row in [row0] + result:
    sheet1.append(row)

# Use the configured output directory when it exists; otherwise fall back to
# the current working directory.
base_dir = base_dir if os.path.exists(base_dir) else os.getcwd()
# Fall back to a default file name when none is configured.
excel_name = excel_name or 'excel.xlsx'
# Join onto base_dir so the fallback above takes effect; the path used to be
# joined onto a hard-coded 'D:/' regardless of base_dir.
save_path = os.path.join(base_dir, excel_name)
print('保存路径：',save_path)
aim_wb.save(save_path)