A pdf report automatically converted to excel tool FOR GMZ

WhiTarff

已于 2024-07-08 16:22:39 修改

阅读量172

点赞数 2

分类专栏：自用代码文章标签： pdf excel python 正则表达式

于 2024-07-08 16:12:04 首次发布

本文链接：https://blog.csdn.net/WhiTarff/article/details/140271510

版权

自用代码专栏收录该内容

1 篇文章 0 订阅

订阅专栏

第一部分：Step1_pdfplumber

主要将pdf中表格&非表格文本提取到excel中，基于每行有效单元格数区分标题文字，以此将表格分区可视化。

# 这段代码是用来将 PDF 文档中的表格数据提取出来并转换为 Excel 文件的。
# pip install pdfplumber
# pip install openpyxl
from copy import copy
import os
import pdfplumber
import openpyxl
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Border, Side, Font, Alignment, PatternFill
import subprocess
from openpyxl.utils import get_column_letter

def adjust_excel_cells(excel_path, sheet_name):

    wb = openpyxl.load_workbook(excel_path)
    ws = wb[sheet_name]

    max_row = ws.max_row
    for r_idx in range(2, max_row + 1):  # 从第二行开始，因为第一行没有上一行
        cell = ws.cell(row=r_idx, column=1)
        if cell.value is not None and not cell.font.bold and cell.value.strip() != '':  # 如果单元格文本没有加粗且单元格内非空也不含换行符等空白字符
            r_idx_above = r_idx - 1
            while r_idx_above >= 1:  # 寻找同列上方距离自己最近的单元格
                cell_above = ws.cell(row=r_idx_above, column=1)
                if cell_above.font.bold:  # 如果上方的单元格内容为加粗
                    cell.value = f"{cell_above.value}.{cell.value}"  # 将自己单元格内的内容前面加上”文本加粗单元格内的文本.\\"
                    break
                r_idx_above -= 1
    # 自动调根据文本调整列宽
    for col in ws.columns:
        max_length = 0
        column = col[0].column_letter  # 获取列名
        for cell in col:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = max_length + 2
        ws.column_dimensions[column].width = adjusted_width

    # 保存修改后的 Excel 文件
    wb.save(excel_path)

def convert_pdf_to_excel(pdf_file, output_excel):
    # 创建 Excel 文件
    wb = Workbook()

    # 打开 PDF 文档并提取表格数据
    with pdfplumber.open(pdf_file) as pdf:
        total_pages = len(pdf.pages)
        all_data = []

        for page_number in range(total_pages):
            page = pdf.pages[page_number]
            
            # 添加表格提取的参数设置
            table = page.extract_table(table_settings={
                "vertical_strategy": "lines",       # 垂直线策略
                "horizontal_strategy": "lines",     # 水平线策略
                "intersection_tolerance": 15        # 交叉容差
            })
            
            all_data.extend(table)

    # 将数据转换为 DataFrame
    df = pd.DataFrame(all_data)

    # 将数据写入 Excel 文件并设置框线及文本居中
    ws = wb.active  # 获取活动工作表
    ws.title = "Table Text"  # 命名第一个sheet为Table Text
    for r_idx, row in enumerate(df.values.tolist(), start=1):
        c_idx = 1
        for value in row:
            # 检查左侧单元格是否为空
            while ws.cell(row=r_idx, column=c_idx).value is not None:
                c_idx += 1
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            
            # 设置边框为all borders
            border_style = Border(left=Side(style='thin'), 
                                  right=Side(style='thin'), 
                                  top=Side(style='thin'), 
                                  bottom=Side(style='thin'))
            cell.border = border_style
            
            # 设置文本居中对齐
            cell.alignment = Alignment(horizontal='center', vertical='center')

    # 获取有文本的最大行和最大列
    max_row = ws.max_row
    max_column = ws.max_column

    # 设置没有内容的单元格边框格式为无边框
    for row in ws.iter_rows(min_row=1, max_row=max_row, min_col=1, max_col=max_column):
        for cell in row:
            if cell.value is None or str(cell.value).strip() == '':
                cell.border = Border()

    # 对于工作区每行只有一列有数据时，将该单元格填充颜色变为浅蓝色，并将文本加粗
    for row in ws.iter_rows(min_row=1, max_row=max_row, min_col=1, max_col=max_column):
        row_values = [cell.value for cell in row if cell.value is not None and str(cell.value).strip() != '']
        if len(row_values) == 1:  # 如果该行只有一个非空单元格
            for cell in row:
                if cell.value is not None and str(cell.value).strip() != '':
                    cell.fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # 设置填充颜色为浅蓝色
                    cell.font = Font(bold=True)  # 设置文本为加粗

    # 特例：当一行内的有内容的单元格数大于2时，将连续出现此条件的区域定义为特殊区，将特殊区第一行的单元格填充颜色设置为浅蓝色，并将文本设置为加粗
    special_area_start = None
    for r_idx in range(1, max_row + 1):
        row = ws[r_idx]
        non_empty_cells = sum(1 for cell in row if cell.value is not None and str(cell.value).strip() != '')
        if non_empty_cells > 2 and special_area_start is None:
            special_area_start = r_idx
            for cell in row:
                cell.fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
                cell.font = Font(bold=True)
        elif non_empty_cells <= 2 and special_area_start is not None:
            special_area_start = None

    # 自动调整列宽
    for col in ws.columns:
        max_length = 0
        column = col[0].column_letter  # 获取列名
        for cell in col:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = max_length + 2
        ws.column_dimensions[column].width = adjusted_width

    non_tabular_ws = wb.create_sheet(title="Non-Tabular Text")
    page = pdf.pages[0]
    non_tabular_text = page.extract_text(x_tolerance=1, y_tolerance=1)
    if non_tabular_text:
        lines = non_tabular_text.split('\n')
        for r_idx, line in enumerate(lines, start=1):
            non_tabular_ws.cell(row=r_idx, column=1, value=line)

    # 自动调整列宽
    for col in non_tabular_ws.columns:
        max_length = 0
        column = col[0].column_letter  # 获取列名
        for cell in col:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = max_length + 2
        non_tabular_ws.column_dimensions[column].width = adjusted_width

    # 在第二个sheet“Non-Tabular Text”中，将第一列第五行的单元格变为浅蓝色，将文本加粗
    if "Non-Tabular Text" in wb.sheetnames:
        non_tabular_ws = wb["Non-Tabular Text"]
        cell = non_tabular_ws.cell(row=5, column=1)
        cell.fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # 设置填充颜色为浅蓝色
        cell.font = Font(bold=True)  # 设置文本为加粗

        # 将该单元格右侧的单元格写入Calculation name
        cell_right = non_tabular_ws.cell(row=5, column=2, value="Calculation name")
    
    # 在第二个sheet“Non-Tabular Text”中，将第一列最后一行的单元格变为浅蓝色，将文本加粗
    if "Non-Tabular Text" in wb.sheetnames:
        non_tabular_ws = wb["Non-Tabular Text"]
        cell = non_tabular_ws.cell(row=non_tabular_ws.max_row, column=1)
        cell.fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # 设置填充颜色为浅蓝色
        cell.font = Font(bold=True)  # 设置文本为加粗

        # 提取该单元格第三个单词，写入右侧单元格
        cell_right = non_tabular_ws.cell(row=non_tabular_ws.max_row, column=2, value=cell.value.split()[2])

    # 设置全局字体为Times New Roman
    global_font = Font(name='Times New Roman')

    # 应用全局字体到所有工作表
    for sheet in wb.sheetnames:
        ws = wb[sheet]
        ws.sheet_format.defaultRowHeight = 20  # 设置默认行高
        ws.sheet_format.defaultColWidth = 12  # 设置默认列宽

        for row in ws.iter_rows():
            for cell in row:
                # 检查原来的字体是否为粗体
                if cell.font.bold:
                    # 如果是，新字体也应为粗体
                    cell.font = Font(name='Times New Roman', bold=True)
                else:
                    cell.font = global_font

    # 保存 Excel 文件
    wb.save(filename=output_excel)

def add_sheet_with_table_text(excel_file, new_sheet_name, source_sheet_name):
    # 加载工作簿和源sheet
    workbook = openpyxl.load_workbook(excel_file)
    source_sheet = workbook[source_sheet_name]
    
    # 创建一个新的sheet
    if new_sheet_name in workbook.sheetnames:
        new_sheet = workbook[new_sheet_name]  # 使用现有的sheet
    else:
        new_sheet = workbook.create_sheet(new_sheet_name)  # 创建一个新的sheet

    # 复制源sheet中的单元格值和样式到新的sheet
    for row in source_sheet.iter_rows():
        for cell in row:
            new_cell = new_sheet.cell(row=cell.row, column=cell.column, value=cell.value)
            if cell.has_style:
                new_cell.font = copy(cell.font)
                new_cell.border = copy(cell.border)
                new_cell.fill = copy(cell.fill)
                new_cell.number_format = copy(cell.number_format)
                new_cell.protection = copy(cell.protection)
                new_cell.alignment = copy(cell.alignment)
    # 自动调整列宽
    for column_cells in new_sheet.columns:
        length = max(len(str(cell.value)) for cell in column_cells)
        new_sheet.column_dimensions[column_cells[0].column_letter].width = length


    # 保存工作簿
    workbook.save(excel_file)

def merge_cells_based_on_condition(excel_path, sheet_name):
    # 加载工作簿并选择指定的工作表
    workbook = openpyxl.load_workbook(excel_path)
    sheet = workbook[sheet_name]

    # 遍历工作表中的行
    for row in sheet.iter_rows(min_row=1, max_col=sheet.max_column, max_row=sheet.max_row):
        # 统计行中非空单元格的数量
        non_empty_cells = [cell for cell in row if cell.value is not None]

        # 如果该行只有一个非空单元格
        if len(non_empty_cells) == 1:
            # 查找下一行中非空单元格数量大于1的行
            next_row_index = row[0].row + 1
            while next_row_index <= sheet.max_row:
                next_row = sheet[next_row_index]
                next_row_non_empty_cells = [cell for cell in next_row if cell.value is not None]
                if len(next_row_non_empty_cells) > 1:
                    # 合并原来只有一个非空单元格的行的前n个单元格
                    first_cell = non_empty_cells[0]
                    last_cell = sheet.cell(row=first_cell.row, column=len(next_row_non_empty_cells))
                    sheet.merge_cells(start_row=first_cell.row, start_column=first_cell.column,
                                      end_row=last_cell.row, end_column=last_cell.column)
                    break
                next_row_index += 1
    # 保存工作簿
    workbook.save(excel_path)
    # 打开生成的 Excel 文件
    #subprocess.Popen(['start', '', output_excel], shell=True)

print("Step 1 开始工作")

# 设置工作目录和PDF文件路径
os.chdir("C:/Users/guanming/Desktop/work_env")

pdf_file = "ecat report.pdf"
output_excel = "Output_Excel_from_ecat_report.xlsx"

# 调用函数
convert_pdf_to_excel(pdf_file, output_excel)

# 调用函数以添加包含表格文本的新sheet
add_sheet_with_table_text('Output_Excel_from_ecat_report.xlsx', 'Table Text of Test', 'Table Text')

# 调用函数以调整单元格格式
adjust_excel_cells('Output_Excel_from_ecat_report.xlsx', 'Table Text of Test')

# 调用函数以合并单元格
merge_cells_based_on_condition('Output_Excel_from_ecat_report.xlsx', 'Table Text')

print("Step 1: 将 PDF 文档转换为 Excel 文件无误！")

第二步：Step2_Handling_multiple_columns

处理多行数据

因后续检索数据仅根据第一列索引，因此将多行对比表转化为两列普通表格。

import pandas as pd
import openpyxl
import re
import os
import win32com.client

print("Step 2 开始工作")
# 打开Excel文件
Excel = win32com.client.Dispatch("Excel.Application")
xlwb = Excel.Workbooks.Open(os.path.abspath("C:/Users/guanming/Desktop/work_env/Output_Excel_from_ecat_report.xlsx"))

# 选择名为'Table Text for Test'的工作表
sheet = xlwb.Sheets("Table Text of Test")

# 获取最大行数和列数
max_row = sheet.UsedRange.Rows.Count
max_col = sheet.UsedRange.Columns.Count

# 初始化特殊行和列的计数器
special_row = 0
special_col = 0

# 初始化特殊行总数的计数器
total_special = 0

# 遍历工作表中的每一行
# 遍历工作表中的每一行

i = 1
last_special_area = -1

l_s_start =0
l_s_end = 0

while i <= max_row:

    #print(f"正在处理第{i}行")
    # 计算行中非空单元格的数量
    non_empty_cells = sum([1 for j in range(1, max_col + 1) if sheet.Cells(i, j).Value is not None])
    
    
    # 如果上一个区域为特殊区域，自身为最后一个特殊区域
    if last_special_area == 0:
            print(f"最后一次特殊区，执行列命名，起始行：{l_s_start}，结束行：{l_s_end}，列数：{non_empty_cells}")
            first_cell_value = sheet.Cells(l_s_start, 2).Value
            for row in range(l_s_start, l_s_end + 1):
                current_cell_value = sheet.Cells(row, 1).Value
                sheet.Cells(row, 1).Value = f"{first_cell_value}.{current_cell_value}" if current_cell_value else first_cell_value
            last_special_area = -1
            l_s_start = 0
            l_s_end = 0
    
    # 如果非空单元格的数量大于2
    if non_empty_cells > 2:
        # 增加特殊列计数器
        special_col = non_empty_cells
        # 将出现non_empty_cells > 2时的行数赋值给special_row
        special_row = i
        # 增加特殊行总数的计数器
        total_special += 1
        # 标记为上一个区域为特殊区域
        last_special_area = special_col - 2
    else:
        # 遇到普通工作区域，如果特殊列计数器大于0
        if special_col > 0:
            print(f"发现特殊工作区域，起始行：{special_row - total_special + 1}，结束行：{special_row}，列数：{special_col}, 剩余处理次数为：{last_special_area}")
            # 在特殊工作区域下方复制插入一个特殊工作区
            sheet.Range(f"A{special_row - total_special + 1}:A{special_row}").EntireRow.Copy()
            sheet.Range(f"A{special_row + 1}:A{special_row + total_special}").EntireRow.Insert(Shift=-4121)
            # 将原始特殊工作区域第一列第二行的数据添加到第一列其他行中
            first_cell_value = sheet.Cells(special_row - total_special + 1, 2).Value
            for row in range(special_row - total_special+ 1, special_row + 1):
                current_cell_value = sheet.Cells(row, 1).Value
                sheet.Cells(row, 1).Value = f"{first_cell_value}.{current_cell_value}" if current_cell_value else first_cell_value
            # 将新复制的特殊工作区的第二列删除，如果被删除的第二列右侧仍有文本，将文本左移一个单元格
            for row in range(special_row + 1, special_row + total_special + 1):
                for col in range(2, special_col):
                    sheet.Cells(row, col).Value = sheet.Cells(row, col + 1).Value
                sheet.Cells(row, special_col).Value = None
                sheet.Cells(row, special_col).ClearFormats()
            # 删除原来特殊工作区域第二列右侧的文本，并清除格式
            for row in range(special_row - total_special + 1, special_row + 1):
                for col in range(3, max_col + 1):
                    sheet.Cells(row, col).Value = None
                    sheet.Cells(row, col).ClearFormats()
            last_special_area -=1

            # 更新max_row
            max_row = sheet.UsedRange.Rows.Count

            # 重置遍历起始点
            i = special_row

            # 重置计数器
            l_s_start = special_row + 1
            l_s_end = special_row + total_special


            special_row = 0
            special_col = 0
            total_special = 0

    # 遍历下一行
    i += 1

# 保存并打开工作簿
xlwb.Save()
#Excel.Visible = True

xlwb.Close(SaveChanges=True)

# 打印成功消息
print("Step 2: 脚本已成功处理Excel文件中的特殊工作区域。")

第三步：Step3_report_to_template

将导出的excel文件借助正则表达式等写入Et中

#pip install pywin32
#pip install openpyxl
import re
import win32com.client
import openpyxl

print("Step 3 开始工作")
# 函数:将Template中CONSTANT VALUE列中的内容复制到新增列中
def paste_constants(sheet_of_template, max_col, max_row):
    for col in range(1, max_col+1):
        if sheet_of_template.Cells(1, col).Value == 'CONSTANT VALUE':
            column_range = sheet_of_template.Range(sheet_of_template.Cells(1, col), sheet_of_template.Cells(max_row, col))
            column_range.Copy()
            return col
    return None

# 函数:查找"Calculation name"和"Model"并写入值
def search_calculation_name_and_model(sheet_of_template, sheet_of_report_nontabular, max_row, max_col):
    for row in range(2, max_row+1):
        if sheet_of_template.Cells(row, 1).Value == 'Calculation name':
            for row2 in range(2, sheet_of_report_nontabular.UsedRange.Rows.Count+1):
                if sheet_of_report_nontabular.Cells(row2, 2).Value == 'Calculation name':
                    sheet_of_template.Cells(row, max_col+1).Value = sheet_of_report_nontabular.Cells(row2, 1).Value
                    sheet_of_template.Cells(row, max_col+1).Interior.ColorIndex = 35
                    for row3 in range(2, max_row+1):
                        if sheet_of_template.Cells(row3, 1).Value == 'Model':
                            model_value = sheet_of_report_nontabular.Cells(row2, 1).Value.split('-')[0].strip()
                            sheet_of_template.Cells(row3, max_col+1).Value = model_value
                            sheet_of_template.Cells(row3, max_col+1).Interior.ColorIndex = 35
                            break
                    break

# 函数:查找"National flag"并写入值
def search_National_flag(sheet_of_template, sheet_of_report_nontabular, max_row, max_col):
    for row in range(2, max_row+1):
        if sheet_of_template.Cells(row, 1).Value == 'National flag':
            for row2 in range(2, sheet_of_report_nontabular.UsedRange.Rows.Count+1):
                if sheet_of_report_nontabular.Cells(row2, 2).Value == 'Asia':
                    sheet_of_template.Cells(row, max_col+1).Value = 0
                    sheet_of_template.Cells(row, max_col+1).Interior.ColorIndex = 35
                    break
                elif sheet_of_report_nontabular.Cells(row2, 2).Value == 'NAO':
                    sheet_of_template.Cells(row, max_col+1).Value = 1
                    sheet_of_template.Cells(row, max_col+1).Interior.ColorIndex = 35
                    break

# 函数:根据content_in_template和content_in_report_table查找并写入target_value
def search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, content_in_template, content_in_report):
    target_value = None
    for row2 in range(1, sheet_of_report_table.UsedRange.Rows.Count+1):
        if sheet_of_report_table.Cells(row2, 1).Value == content_in_report:
            target_value = sheet_of_report_table.Cells(row2, 2).Value
            break
    if target_value is None:
        print(f"'{content_in_report}' not found in sheet_of_report_table")
        return
    for row in range(1, max_row+1):
        if sheet_of_template.Cells(row, 1).Value == content_in_template:
            sheet_of_template.Cells(row, max_col+1).Value = target_value
            sheet_of_template.Cells(row, max_col+1).Interior.ColorIndex = 35
            break

# 函数:查找sheet中指定内容的所有行号
def find_all_rows(sheet, search_value):
    return [i for i in range(1, sheet.UsedRange.Rows.Count+1) if sheet.Cells(i, 1).Value == search_value]

# 打开Excel
Excel = win32com.client.Dispatch("Excel.Application")

# 打开Excel文件 Engineer tool_Template
wb = Excel.Workbooks.Open("C:/Users/guanming/Desktop/work_env/Engineer tool_Template.xlsm")
sheet_of_template = wb.Sheets('Input')

# 打开Excel文件 Output Excel from ecat report
wb2 = Excel.Workbooks.Open("C:/Users/guanming/Desktop/work_env/Output_Excel_from_ecat_report.xlsx")
sheet_of_report_nontabular = wb2.Sheets('Non-Tabular Text')
sheet_of_report_table = wb2.Sheets('Table Text of Test')

max_row = sheet_of_template.UsedRange.Rows.Count
max_col = sheet_of_template.UsedRange.Columns.Count

constant_value_col = paste_constants(sheet_of_template, max_col, max_row)
if constant_value_col:
    sheet_of_template.Cells(1, max_col+1).PasteSpecial(Paste=-4104)
    sheet_of_template.Cells(1, max_col+1).Value = None

search_calculation_name_and_model(sheet_of_template, sheet_of_report_nontabular, max_row, max_col)
search_National_flag(sheet_of_template, sheet_of_report_nontabular, max_row, max_col)

search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Target cooling capacity', 'Chiller.Capacity')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'VFD Size', 'Motor and Starter.VFD Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Hertz', 'Motor and Starter.Line Voltage-Phase-Hertz')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Voltage', 'Motor and Starter.Line Voltage-Phase-Hertz')

search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler waterboxes', 'Cooler.Configuration.Waterbox Type')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser waterboxes', 'Condenser.Configuration.Waterbox Type')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler nozzle arrangement', 'Cooler.Configuration.Nozzle Arrangement')

search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Liquid bypass used or not', 'Chiller.Liquid Bypass')




search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler size', 'Cooler.Configuration.Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler tubing', 'Cooler.Configuration.Tubing')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler passes', 'Cooler.Configuration.Passes')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser size', 'Condenser.Configuration.Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser tubing', 'Condenser.Configuration.Tubing')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser passes', 'Condenser.Configuration.Passes')

search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Isolation Valve Option', 'Chiller.Isolation Valve')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'economizer', 'Chiller.Economizer')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Compressor size', 'Compressor.Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Motor Type', 'Motor and Starter.Motor Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Evaporator fouling factor', 'Cooler.Configuration.Fouling Factor')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser fouling factor', 'Condenser.Configuration.Fouling Factor')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Cooler fluid type', 'Cooler.Configuration.Fluid Type')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser fluid type', 'Condenser.Configuration.Fluid Type')


search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'motorFrameSize', 'Motor and Starter.Motor Size')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Refrigerant type', 'Chiller.Refrigerant Type')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Evaporator entering brine temperature', 'Cooler.Entering Temperature')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Evaporator leaving brine temperature', 'Cooler.Leaving Temperature')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser entering fluid temperature', 'Condenser.Entering Temperature')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser leaving brine temperature', 'Condenser.Leaving Temperature')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser fluid flow rate', 'Condenser.Flow Rate')
search_target_value(sheet_of_template, sheet_of_report_table, max_row, max_col, 'Condenser max fluid pressure drop', 'Condenser.Pressure Drop')

rows = find_all_rows(sheet_of_template, 'compressorFrameSize')
for row in rows:
    sheet_of_template.Cells(row, max_col+1).Value = 'MW3'
    sheet_of_template.Cells(row, max_col+1).Interior.ColorIndex = 35

sheet_of_template.Columns(max_col+1).AutoFit()

# 保存并关闭文件
wb.Save()
wb.Close(SaveChanges=True)

# 保存并打开wb2
wb2.Save()
Excel.Visible = True

print('Step 3: 已将Output_Excel_from_ecat_report写入Engineer tool_Template')

第四步：Step4_Reformatting_cells

基于Et格式（下拉选项）限定，将写入的数据mapping，便于工程计算

import pandas as pd
import openpyxl
import re
import os
import win32com.client

print("Step 4 开始工作")
# 函数:将Template中CONSTANT VALUE列中的内容复制到新增列中
def paste_constants(sheet, max_col, max_row):
    constant_value_col = None
    for col in range(1, max_col+1):
        if sheet.Cells(1, col).Value == 'CONSTANT VALUE':
            constant_value_col = col
            break
    if constant_value_col is not None:
        column_range = sheet.Range(sheet.Cells(1, constant_value_col), sheet.Cells(max_row, constant_value_col))
        column_range.Copy()
    return constant_value_col

import re

# 该函数返回单元格的值中的数字
def extract_number(cell_value):
    # 检查单元格的值是否不为None且不是浮点数
    if cell_value is not None and not isinstance(cell_value, float):
        # 使用正则表达式查找所有包含小数点的数字
        numbers = re.findall(r'\d+\.\d+|\d+', cell_value)
        return numbers[0] if numbers else None
    return None

# 该函数返回单元格中每个单词的首字母
def extract_first_letter(cell_value):
    # 检查单元格的值是否不为None且不是浮点数
    if cell_value is not None and not isinstance(cell_value, float):
        # 使用正则表达式查找单元格中每个单词的首字母
        letters = re.findall(r'\b\w', cell_value)
        return ''.join(letters) if letters else None
    return None

# 该函数返回单元格的值中指定索引处的元素，如果没有则返回None
def extract_after_dash(cell_value, dash_index):
    # 检查单元格的值是否不为None且不是浮点数
    if cell_value is not None and not isinstance(cell_value, float):
        # 按照“-”分割单元格的值，并返回指定索引处的元素
        split_values = cell_value.split('-')
        return split_values[dash_index].strip() if len(split_values) > dash_index else None
    return None

#re.findall(r"[^-]+", s)
# 该函数返回单元格的值中除了-以外的字符，如果没有则返回None
def extract_refrigerant_type(cell_value):
    # 检查单元格的值是否不为None且不是浮点数
    if cell_value is not None and not isinstance(cell_value, float):
        # 使用正则表达式查找单元格中除了-以外的字符
        types = re.findall(r"[^-]+", cell_value)
        return ''.join(types) if types else None
    return None


# 该函数返回单元格的值中第一个字符
def extract_type(cell_value):
    # 检查单元格的值是否不为None且不是浮点数
    if cell_value is not None and not isinstance(cell_value, float):
        # 使用正则表达式查找单元格中第一个字符
        types = re.findall(r'\D+', cell_value)
        return types[0] if types else None
    return None


# 打开Excel文件
Excel = win32com.client.Dispatch("Excel.Application")
wb = Excel.Workbooks.Open("C:/Users/guanming/Desktop/work_env/Engineer tool_Template.xlsm")
sheet = wb.Sheets('Input')

# 获取最大行列数
max_row = sheet.UsedRange.Rows.Count
max_col = sheet.UsedRange.Columns.Count

# 调用函数
constant_value_col = paste_constants(sheet, max_col, max_row)

# 粘贴插入到有文本的最后一列即max_col右侧，并保留源格式
sheet.Cells(1, max_col+1).PasteSpecial(Paste=-4104)  # -4104 represents the paste operation that keeps source formatting

# 删除新增列第一行的内容
sheet.Cells(1, max_col+1).Value = None

# 遍历第一列并提取内容
for row in range(1, max_row+1):
    cell_value = sheet.Cells(row,1).value
    # 提取数字
    if cell_value in ["Target cooling capacity", "Evaporator entering brine temperature","Condenser max fluid pressure drop","Evaporator leaving brine temperature", "Condenser entering fluid temperature", "Condenser leaving brine temperature", "Condenser fluid flow rate","Evaporator fouling factor","Condenser fouling factor"]:
        sheet.Cells(row, max_col+1).value = extract_number(sheet.Cells(row, max_col).value)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4

    # 格式正确，不需要调整格式的值
    elif cell_value in ['National flag','Calculation name','Model','VFD Size','economizer','Compressor size','compressorFrameSize','Motor Type','Cooler size',  'Cooler passes', 'Condenser size', 'Condenser passes','Isolation Valve Option']:
        sheet.Cells(row, max_col+1).value = sheet.Cells(row,max_col).value
        #更改填充颜色为绿色
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4

    # 提取首字母
    elif cell_value in ["Condenser fluid type",'Cooler fluid type']:
        sheet.Cells(row, max_col+1).value = extract_first_letter(sheet.Cells(row, max_col).value)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4

    elif cell_value == "Hertz":
        # 提取第二个“-”后面的内容
        sheet.Cells(row, max_col+1).value = extract_after_dash(sheet.Cells(row, max_col).value, 2)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
    elif cell_value == "Voltage":
        # 提取第一个“-”前面的内容
        sheet.Cells(row, max_col+1).value = extract_after_dash(sheet.Cells(row, max_col).value, 0)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
    # 提取第一个字符
    elif cell_value == "motorFrameSize":
        sheet.Cells(row, max_col+1).value = extract_type(sheet.Cells(row, max_col).value)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
    # 提取除-以外的字符
    elif cell_value == "Refrigerant type":
        sheet.Cells(row, max_col+1).value = extract_refrigerant_type(sheet.Cells(row, max_col).value)
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
    # Liquids bypass used or not
    elif cell_value == "Liquid bypass used or not":
        if sheet.Cells(row, max_col).value == "Not Installed":
            sheet.Cells(row, max_col+1).value = "FALSE"
            sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
        else:
            sheet.Cells(row, max_col+1).value = "TRUE"
            sheet.Cells(row, max_col+1).Interior.ColorIndex = 4
    # Compressor Oil Free
    elif cell_value == "Compressor Oil Free":
        # 如果第四行单元格中第五个字符为“-”，则为“TRUE”，否则为“FALSE”
        if sheet.Cells(4, max_col + 1).value[4] == "-":
            sheet.Cells(row, max_col+1).value = "TRUE"
        else:
            sheet.Cells(row, max_col+1).value = "FALSE"
        sheet.Cells(row, max_col+1).Interior.ColorIndex = 4


    sheet.Columns(max_col+1).AutoFit()

# 保存并显示工作簿
wb.Save()
Excel.Visible = True

print("Step 4 已完成单元格格式转换!")