# Source: CSDN blog post "办公自动化：python操作电子表格汇总"

# Article link: https://blog.csdn.net/m0_58192074/article/details/136279415
import os

import shutil  #shutil模块可以复制文件

import pandas as pd

import re





'''1、给文件名删除固定字符'''



def remove_filenames(folder_path, fixed_string):
    """Remove a fixed substring from every file name below a folder.

    Walks ``folder_path`` recursively with ``os.walk`` and renames each file
    whose name contains ``fixed_string`` to the same name with every
    occurrence of that substring removed. Directory names are not changed.

    Args:
        folder_path: Root directory to scan.
        fixed_string: Substring to delete from the file names.

    Returns:
        None. A notice is printed once when no file name contained
        ``fixed_string``.
    """
    found = False  # set as soon as at least one file has been renamed

    for root, dirs, files in os.walk(folder_path):
        print(files)

        for file in files:
            if fixed_string in file:
                new_filename = os.path.join(root, file.replace(fixed_string, ''))
                os.rename(os.path.join(root, file), new_filename)
                found = True

    # Decide only after the whole tree has been visited; checking inside the
    # loop would print the notice for every file seen before the first match
    # and never for an empty folder.
    if not found:
        print("文件名中没有要删除的字符")



'''2、给文件名添加固定字符'''

def add_fixed_string_to_filenames(folder_path, fixed_string):
    """Prefix every file name below a folder with a fixed string.

    Walks ``folder_path`` recursively with ``os.walk`` and renames each file
    ``name`` to ``fixed_string + name`` inside its own directory. Directory
    names are not changed.

    Args:
        folder_path: Root directory to scan.
        fixed_string: Text to put in front of every file name.

    Returns:
        None. A confirmation line is printed for every renamed file.
    """
    # os.walk yields (root, dirs, files) triples; os.listdir only yields plain
    # names and cannot be unpacked that way.
    for root, dirs, files in os.walk(folder_path):
        print(files)

        for file in files:
            new_filename = os.path.join(root, fixed_string + file)
            os.rename(os.path.join(root, file), new_filename)
            print("添加成功！")



'''3、汇总文件'''        



def CopyFile(filepath, newPath):
    """Collect every ``.pdf`` file below ``filepath`` into one flat folder.

    The source tree is traversed recursively. Each regular file whose path
    ends with ``.pdf`` is copied into ``newPath`` under its bare file name,
    so files with the same name in different sub-folders overwrite each
    other. Other files are skipped.

    Args:
        filepath: Source directory to scan.
        newPath: Destination directory; created if it does not exist.

    Returns:
        None. The directory listing and each copied file name are printed.
    """
    # Create the destination that was actually passed in.
    os.makedirs(newPath, exist_ok=True)

    fileNames = os.listdir(filepath)
    print(fileNames)

    for file in fileNames:
        newDir = os.path.join(filepath, file)

        if os.path.isfile(newDir):
            # Drop the endswith() filter to copy every file.
            if newDir.endswith(".pdf"):
                print(file)
                shutil.copyfile(newDir, os.path.join(newPath, file))
        elif os.path.isdir(newDir):
            # When the destination lives inside the source tree, descending
            # into it would copy files onto themselves (SameFileError).
            if os.path.samefile(newDir, newPath):
                continue
            CopyFile(newDir, newPath)

             

'''4、读取文件名'''  



def get_filelist(path, output_dir=None):
    """List the ``.pdf`` file names below ``path`` and save them to Excel.

    The bare file names (no directory part) of all files ending in ``.pdf``
    are collected recursively. Unless ``newfile.xlsx`` already exists in the
    output directory, the list is written there as a one-column sheet.

    Args:
        path: Root directory to scan.
        output_dir: Directory that receives ``newfile.xlsx``. When omitted,
            the module-level ``new_folder_path`` set by the script entry
            point is used.

    Returns:
        The list of collected file names.
    """
    if output_dir is None:
        output_dir = new_folder_path

    Filelist = []
    for home, dirs, files in os.walk(path):
        for filename in files:
            if filename.endswith(".pdf"):
                # Use os.path.join(home, filename) here to keep full paths.
                Filelist.append(filename)

    # Print the summary once, not after every single append.
    print(len(Filelist))
    print(Filelist)

    target = os.path.join(output_dir, 'newfile.xlsx')
    if os.path.exists(target):
        print("文件(newfile.xlsx)已经存在！")
    else:
        # A separate name avoids shadowing the imported ``re`` module.
        table = pd.DataFrame(Filelist)
        table.to_excel(target)

    return Filelist

                       



'''5、数据分析：比对一个电子表格文件中的两列文件的包含关系   '''

def check_if_exists_in_d(file_ex):
    """Report rows whose name and unit are not both found in one PDF name.

    Reads sheet ``Sheet1`` of the workbook and uses three of its columns:
    ``xm`` (name), ``dw`` (unit) and ``pdf`` (PDF file names). For every row
    the name and unit are looked up as substrings of each entry in the
    ``pdf`` column; a row counts as matched when a single ``pdf`` entry
    contains both. Spaces are ignored on both sides of the comparison.

    Args:
        file_ex: Path of the Excel workbook to read.

    Returns:
        A list of ``(xm, dw)`` tuples for the rows without a matching PDF.
        Each of them is also printed.
    """
    df = pd.read_excel(file_ex, sheet_name='Sheet1')

    # Normalise the PDF names once instead of once per compared row.
    pdf_names = [str(value).replace(" ", "") for value in df['pdf']]

    missing = []
    for raw_xm, raw_dw in zip(df['xm'], df['dw']):
        value_xm = str(raw_xm).replace(" ", "")  # name without any spaces
        value_dw = str(raw_dw).strip()  # unit as shown in the report
        # The PDF names carry no spaces at all, so the unit must be compared
        # without its inner spaces too or it could never match.
        dw_key = value_dw.replace(" ", "")

        has_pdf = any(value_xm in name and dw_key in name for name in pdf_names)
        if not has_pdf:
            print("无PDF:姓名： " + value_xm + "    单位：" + value_dw)
            missing.append((value_xm, value_dw))

    return missing





'''6、比对两个excel文件中不同列的数据包含关系，并把结果写在一个电子表格里面'''  

def check_excel(excel1_path, excel2_path):
    """Flag each row of one workbook by whether a PDF name matches it.

    For every row of the first workbook, the values of its ``xm`` and ``dw``
    columns are searched as plain substrings in the ``pdf`` column of the
    second workbook. The row gets ``'yes'`` in its ``Result`` column when one
    single ``pdf`` entry contains both values, otherwise ``'no'``. The first
    workbook is then rewritten in place from the resulting table.

    Args:
        excel1_path (str): Workbook with the ``xm`` and ``dw`` columns; it is
            overwritten with the added ``Result`` column.
        excel2_path (str): Workbook with the ``pdf`` column.

    Returns:
        None
    """
    excel1 = pd.read_excel(excel1_path)
    excel2 = pd.read_excel(excel2_path)

    # Empty cells are dropped so they cannot match as the text 'nan'.
    pdf_names = [str(value) for value in excel2['pdf'].dropna()]

    results = []
    for xm, dw in zip(excel1['xm'], excel1['dw']):
        xm_text = str(xm)
        dw_text = str(dw)
        # Both values must occur in the SAME pdf entry; testing each value
        # against the whole column separately would accept a name found in
        # one row and a unit found in another.
        matched = any(xm_text in name and dw_text in name for name in pdf_names)
        results.append('yes' if matched else 'no')

    # Assigning the whole column also works when 'Result' does not exist yet.
    excel1['Result'] = results

    excel1.to_excel(excel1_path, index=False)



'''7、比对文件夹中的pdf文件，是否和excel文件中的数据相匹配，并把结果写在一个电子表格里面'''  

def get_pdf_name(path):
    """Return the bare names of all files ending in ``.pdf`` below ``path``.

    The directory tree is walked recursively; only the file name is kept,
    not the directory it was found in.
    """
    pdf_names = []
    for _home, _dirs, names in os.walk(path):
        pdf_names.extend(name for name in names if name.endswith(".pdf"))
    return pdf_names

def check_pdf_excel(excel1_path, pdf_folder=None):
    """Flag each workbook row by whether a PDF file on disk matches it.

    The PDF file names are collected with ``get_pdf_name``. For every row of
    sheet ``Sheet1`` the values of the ``姓名`` and ``dw`` columns are
    searched as plain substrings in those names. The row gets ``'yes'`` in
    its ``pdf`` column when one single file name contains both values,
    otherwise ``'pdf-no'``. The workbook is then rewritten in place from the
    resulting table.

    Args:
        excel1_path: Workbook to read and overwrite.
        pdf_folder: Folder scanned for PDF files. When omitted, the
            module-level ``folder_path`` set by the script entry point is
            used.

    Returns:
        None
    """
    if pdf_folder is None:
        pdf_folder = folder_path

    pdf_names = get_pdf_name(pdf_folder)

    excel1 = pd.read_excel(excel1_path, sheet_name="Sheet1")

    flags = []
    # NOTE(review): the name column is '姓名' here but 'xm' in the other
    # comparison functions of this file - confirm against the real workbook.
    for xm, dw in zip(excel1['姓名'], excel1['dw']):
        xm_text = str(xm)
        dw_text = str(dw)
        # Both values must occur in the SAME file name; testing each value
        # against all names separately would accept a name found in one file
        # and a unit found in another.
        matched = any(xm_text in name and dw_text in name for name in pdf_names)
        flags.append('yes' if matched else 'pdf-no')

    # Assigning the whole column also works when 'pdf' does not exist yet.
    excel1['pdf'] = flags

    excel1.to_excel(excel1_path, index=False)







# 测试代码            

# Script entry point: set the inputs below, then enable the step to run.
if __name__ == '__main__':
    '''数据初始化'''
    # Settings shared by the numbered steps.
    fixed_string = '高级编号-'  # text to add to or remove from file names (steps 1-2)
    folder_path = r'E:\2023中级pdf'  # folder that is scanned
    new_folder_path = r'E:\临时\2023职评\2023中高级编号'  # folder that receives results
    file1_excel = r'E:\临时\2023职评\2023中高级编号\2023zjpdf.xlsx'  # workbook to compare (step 5 onward)
    file2_excel = r'E:\临时\2023职评\2023中高级编号\高级编号2\newfile.xlsx'

    '''代码执行部分'''
    # Step 1: remove the fixed string from file names.
    # remove_filenames(folder_path, fixed_string)

    # Step 2: add the fixed string to file names.
    # add_fixed_string_to_filenames(folder_path, fixed_string)

    # Step 3: gather the files into the new folder.
    # CopyFile(folder_path, new_folder_path)

    # Step 4: read the file names with the chosen suffix below folder_path
    # and write them to the workbook "newfile.xlsx".
    # get_filelist(folder_path)

    # Step 5: check whether the 'xm' and 'dw' values both occur in the
    # 'pdf' column of the same workbook.
    # df = check_if_exists_in_d(file1_excel)

    # Step 6: compare columns across two workbooks.
    # check_excel(file1_excel, file2_excel)

    # Step 7: compare the PDF files on disk with the workbook summary.
    check_pdf_excel(file1_excel)