Task:Excel文件按要求导出至模板excel中
要求其实挺多的,一下次也说不完,大致情况如下图所示
数据:配套数据下载链接(如未审核完成,请评论留言邮箱给我哈)
思路:
- 用openpyxl库筛选出“Description”中除了深灰色以外的数据行,并放入一个DataFrame中
- 按照模板要求,用pandas包加工处理,将需要的数据拼接在一起(由于模板中的A列第一个值和F列中的值要求模糊,这份并未完成,有想法的伙伴可以尝试并告诉我呀)
- 使用pd.to_excel输出为一个半成品excel文件(记为Target.xlsx)。
- 由于模板中的“Vendor Item Code”列要求需要跟原excel文件的“Description”列的颜色保持一致。
- 先用openpyxl包筛选出原excel中“Description”列除了白色和深灰色的数据行,记录到字典dic中,其中提取到的行中的“Item Number”列的数据作为Key,“Description”的单元格颜色配置作为value
- 用openpyxl打开Target.xlsx
- 将Target.xlsx的“Vendor Item Code”列与上述字典dic的Key做匹配(匹配上的就,将dic中的value即单元格颜色配置,赋予到Target.xlsx的“Vendor Item Code”列对应的单元格中)
- 然后保存Target.xlsx
代码(半成品-可运行):
import openpyxl
import pandas as pd
import numpy as np
import re
from copy import copy
def get_df(path):
global color
workbook = openpyxl.load_workbook(path)
worksheet = workbook[workbook.sheetnames[0]]
rows = worksheet.max_row
columns = worksheet.max_column
eliminate_color_cell = 'C3'
eliminate_color = worksheet[eliminate_color_cell].fill.fgColor
df = pd.DataFrame()
for x in range(7, rows + 1):
if worksheet.cell(x, 3).fill.fgColor != eliminate_color:
for i in range(1, columns + 1):
df.loc[x, i] = worksheet.cell(row=x, column=i).value
df.columns = [worksheet.cell(6, i).value for i in range(1, columns+1)]
# 得到符合颜色的行 ->字典
color = {}
for x in range(7, rows+1):
cell_color = worksheet.cell(x, 3).fill
if cell_color != worksheet[eliminate_color_cell].fill:
color[worksheet.cell(x, 2).value] = cell_color
return df
def df_process(df, out_path):
new_df = pd.DataFrame()
# 这一列是有问题的
new_df['Item Name'] = df["Description"].apply(lambda x: x.split(' ')[0])+" "+df["Major Classification"]+" " + df["Material Type"]+" "+df["Product Type"]
new_df['Vendor Item Code'] = df["Item Number"]
new_df['Sales Description'] = df['PC/CT'].apply(lambda x: '' if pd.isnull(x) else f'{x} EA/BX ') + (df['SF/CT or LF/CT'].apply(lambda x: '' if pd.isnull(x) else f'{x} ') + df["UOM"]).apply(lambda x: '' if re.findall('\d+', x) == [] else x)
new_df['Unit of Measure'] = df["UOM"].apply(lambda x: "SQUARE FOOT" if x == "SF" else ("EACH" if x == "PC" else ("LINEAR FOOT" if x=="LF" else "")))
new_df["Unit/Box"] = (df['PC/CT'].apply(lambda x: '' if pd.isnull(x) else str(x))+' '+df['SF/CT or LF/CT'].apply(lambda x: '' if pd.isnull(x) else str(x))).apply(lambda x: x.split(' ')[1] if x.split(' ')[1].replace('.', '').isdigit() else x.split(' ')[0])
new_df["Unit/Box"] = new_df["Unit/Box"].apply(lambda x: float(x) if x != '' else np.nan)
# 这一列是有问题的
new_df['Item Color'] = df["Description"].apply(lambda x: x.split(' ')[-2])
new_df['Item Size'] = df["Description"].apply(lambda x: re.findall(r'\d+X\d+', x)[-1] if (re.findall(r'\d+X\d+', x)!= [])else '')
new_df['Pcs in a Box'] = df['PC/CT']
new_df['Sqft By PCSs/SHEET'] = df['SF/PC']
new_df['Sqft By Box'] = df['SF/CT or LF/CT']
new_df['Cost'] = df['Price']
new_df.to_excel(out_path, index=False)
# 匹配颜色并输出
target_workbook = openpyxl.load_workbook(out_path)
target_worksheet = target_workbook[target_workbook.sheetnames[0]]
rows = target_worksheet.max_row
for i in range(1, rows + 1):
if target_worksheet.cell(row=i, column=2).value in color.keys():
target_worksheet.cell(row=i, column=2).fill = copy(color[target_worksheet.cell(row=i, column=2).value])
target_workbook.save(out_path)
def main(in_path, out_path):
data = get_df(in_path)
df_process(data, out_path)
if __name__ == '__main__':
in_path = r'C:\Users\Administrator\excel to excel\1366552-July2021-ISPL EXCEL FORM.xlsx'
out_path = r'C:\Users\Administrator\excel to excel\Target.xlsx'
main(in_path, out_path)