【Python】加载表格\文件遍历\字符串匹配-CSDN博客

本文链接：https://blog.csdn.net/qq_43265842/article/details/140275847
# -*- coding: utf-8 -*-

# ******************************************** #
# Cause i know you are my polaris.   - 星尘    #
# ******************************************** #

import os,re,shutil,time, configparser
import xml.dom.minidom
from typing import Dict, List
from fnmatch import fnmatch, fnmatchcase

# 第三方库
# ------------------------------------------ #
try:
    import xlrd
except:
    os.system('pip install xlrd')
    os.system('python -m pip install xlrd')
    import xlrd
    
try:
    from openpyxl import Workbook, load_workbook
    from openpyxl.styles import Font, PatternFill
except:
    os.system('pip install openpyxl')
    os.system('python -m pip install openpyxl')
    from openpyxl import Workbook, load_workbook
    from openpyxl.styles import Font, PatternFill
# ------------------------------------------ #   

# 处理表格文件
class MyTable:
    def __init__(self):
        self._isBold = False
        self._fontColor = "000000"
        self._bgColor = "FFFFFF"
    
    #加载xlsx文件
    def LoadXLSXFile(self, strPath: str, sheetName: str=""):
        '''
        参数:
            strPath: 文件路径
            sheetName: 表名 注:为空时默认取第一张表
        返回
            listData: 字典列表
        '''
        try:
            workBookObj = load_workbook(strPath)
            sheetObj = None
            if "" == sheetName:
                sheetObj = workBookObj.worksheets[0]
            elif sheetName in workBookObj.sheetnames:
                sheetObj = workBookObj[sheetName]
            else:
                print("sheetName is not exist")
                return []
        except:
            print("can not load file")
            return []
        
        # 取列名
        listTabColName = []
        iColNameIndex = sheetObj.min_row        # 第一行数据当作列名
        for cellItem in sheetObj[iColNameIndex]:
            listTabColName.append(str(cellItem.value))
        
        # 保存数据
        listData = []                           # 最终获取到的数据结果
    
        mergeArea = sheetObj.merged_cells
        
        iMinRow = sheetObj.min_row + 1          # 跳过首行列名
        iMinCol = sheetObj.min_column
        iMaxRow = sheetObj.max_row
        iMaxCol = sheetObj.max_column
        for iRow in range(iMinRow, iMaxRow+1):
            dictRowData = {}                    # 遍历行时保存的单行字典（key为列名）
            for iCol in range(iMinCol, iMaxCol+1):
                iRowTmp = iRow
                iColTmp = iCol
                # 判断是否处在合并区间内
                for mergeItem in mergeArea:
                    if mergeItem.min_row <= iRow <= mergeItem.max_row and mergeItem.min_col <= iCol <= mergeItem.max_col:
                        # 替换为合并区间的左上角坐标索引
                        iRowTmp = mergeItem.min_row
                        iColTmp = mergeItem.min_col
                        break
                
                # 保存单行数据
                if iCol > len(listTabColName):
                    continue
                
                content = sheetObj.cell(iRowTmp, iColTmp).value
                if content is None:
                    dictRowData[listTabColName[iCol-1]] = ''
                else:
                    dictRowData[listTabColName[iCol-1]] = str(content)
                    
            # 存储行字典
            listData.append(dictRowData)
        
        return listData
    
    #加载xls文件 (excel 2007以前)
    def LoadXLSFile(self, strPath: str, sheetName: str=""):
        '''
        参数:
            strPath: 文件路径
            sheetName: 表名 注:为空时默认取第一张表
        返回
            listData: 字典列表
        '''
        try:
            workBookObj = xlrd.open_workbook(strPath, formatting_info=True)
            sheetObj = None
            if "" == sheetName:
                sheetObj = workBookObj[0]
            elif sheetName in workBookObj._sheet_names:
                sheetObj = workBookObj.sheet_by_name(sheetName)
            else:
                return []
        except:
            return []
        
        # 取列名
        listTabColName = []
        iColNameIndex = 0                       # 第一行数据当作列名
        obj = sheetObj[0]
        for cellItem in sheetObj[iColNameIndex]:
            listTabColName.append(str(cellItem.value))
        
        # 保存数据
        listData = []                           # 最终获取到的数据结果
    
        mergeArea = sheetObj.merged_cells
        
        iMinRow = 1                             # 跳过首行列名
        iMinCol = 0
        iMaxRow = sheetObj.nrows
        iMaxCol = sheetObj.ncols
        for iRow in range(iMinRow, iMaxRow):
            dictRowData = {}                    # 遍历行时保存的单行字典（key为列名）
            for iCol in range(iMinCol, iMaxCol):
                iRowTmp = iRow
                iColTmp = iCol
                # 判断是否处在合并区间内
                for mergeItem in mergeArea:
                    if mergeItem[0] <= iRow < mergeItem[1] and mergeItem[2] <= iCol < mergeItem[3]:
                        # 替换为合并区间的左上角坐标索引
                        iRowTmp = mergeItem[0]
                        iColTmp = mergeItem[2]
                        break
                
                # 保存单行数据
                if iCol >= len(listTabColName):
                    continue
                
                content = sheetObj.cell(iRowTmp, iColTmp).value
                if content is None:
                    dictRowData[listTabColName[iCol]] = ''
                else:
                    dictRowData[listTabColName[iCol]] = str(content)
                    
            # 存储行字典
            listData.append(dictRowData)
        return listData
    
    def LoadExcelSheetName(self, strPath: str):
        try:
            workBookObj = xlrd.open_workbook(strPath, formatting_info=True)
            return workBookObj._sheet_names
        except:
            try:
                workBookObj = load_workbook(strPath)
                return workBookObj.sheetnames
            except:
                return []
    
    # 加载excel文件
    def LoadExcelFile(self, strPath: str, sheetName: str=""):
        '''
        参数:
            strPath: 文件路径
            sheetName: 表名 注:为空时默认取第一张表
        返回
            listData: 字典列表
        '''
        try:
            load_workbook(strPath)
            return self.LoadXLSXFile(strPath, sheetName)
        except:
            try:
                xlrd.open_workbook(strPath)
                return self.LoadXLSFile(strPath, sheetName)
            except:
                return []

    def LoadTxt(self, strTxt: str,  linkTag: str = "\t", isExistHead: bool = True):
        if "" == strTxt:
            return []

         # 保存列名
        listFirstLineByKeyName = []
        # 保存数据
        listData = []
        linesData = strTxt.split("\n")
        for lineItem in linesData:
            if "" == lineItem.strip():
                continue
                pass
            
            lineItem = lineItem.replace("\r", "")
            lineItem = lineItem.replace("\n", "")
            if 0 == len(listFirstLineByKeyName) and True == isExistHead:
                listFirstLineByKeyName = lineItem.split(linkTag)
                continue
            
            dictLineData = {}
            arrLineItem = lineItem.split(linkTag)
            if True == isExistHead:
                for iIndex in range(0, len(listFirstLineByKeyName)):
                    strVal = ""
                    if iIndex < len(arrLineItem):
                        strVal = arrLineItem[iIndex]
                    dictLineData[listFirstLineByKeyName[iIndex]] = strVal
            else:
                for iIndex in range(0, arrLineItem):
                    dictLineData[str(iIndex)] = arrLineItem[iIndex]
            
            listData.append(dictLineData)
        
        return listData
    
    # 加载文本文件
    def LoadTxtFile(self, strPath: str, linkTag: str = "\t", isExistHead: bool = True):
        '''
        参数:
            strPath: 文件路径
            linkTag: 数据连接符
            isExistHead: 是否将第一行当作表头处理
        返回
            listData: 字典列表
        '''
        try:
            fileObj = open(strPath, "r", encoding="utf-8")
            strContent = fileObj.read()
            fileObj.close()
        except:
            return []
        
        listData = self.LoadTxt(strContent, linkTag, isExistHead)

        return listData
    
    # 设置保存excel的表头样式
    def SetFontStyle(self, isBold=False, fontColor="000000"):
        self._isBold = isBold
        self._fontColor = fontColor
    def SetFillStyle(self, bgColor="FFFFFF"):
        self._bgColor = bgColor
    
    # 保存文件
    def SaveXLSXFile(self, listData: List[Dict[str, str]], strPath = "", sheetName = "", isReplaceSheet = True):
        '''
        参数:
            listData: 字典列表(注: 列表的key会加入表头)
            strPath: 文件路径
            sheetName: 表名
            isReplaceSheet: 是否覆盖已存在的表
        返回
            True : False 
        '''
        try:
            if os.path.exists(strPath):
                workbookObj = load_workbook(strPath)
            else:
                workbookObj = Workbook()
                workbookObj.remove(workbookObj["Sheet"])
        except:
            return False
        
        if "" == sheetName:
            sheetNewObj = workbookObj.create_sheet()
        elif sheetName not in workbookObj.sheetnames:
            sheetNewObj = workbookObj.create_sheet(sheetName)
        elif True == isReplaceSheet:
            workbookObj.remove(workbookObj[sheetName])
            sheetNewObj = workbookObj.create_sheet(sheetName)
        else:
            print("sheetName is alread exist")
            return False
        
        listKeyName = []
        for iColIndex in range(0, len(listData)):
            dataItem = listData[iColIndex]
            for keyName in dataItem:
                if keyName not in listKeyName:
                    listKeyName.append(keyName)
                
                iRowIndex = listKeyName.index(keyName)
                sheetNewObj.cell(iColIndex+2, iRowIndex+1).value = dataItem[keyName]
        
        # 表头样式
        headFontStyle = Font(bold=self._isBold, color=self._fontColor)
        headFillStyle = PatternFill(start_color=self._bgColor)
        # sheetNewObj.insert_rows(1)
        for iIndex in range(0, len(listKeyName)):
            sheetNewObj.cell(1, iIndex+1).font = headFontStyle
            sheetNewObj.cell(1, iIndex+1).fill = headFillStyle
            sheetNewObj.cell(1, iIndex+1).value = listKeyName[iIndex]
        
        while(True):
            try:
                workbookObj.save(strPath)
            except:
                print("-------------------------")
                print("Error:文件保存失败,请检查文件是否正在被excel打开("+strPath+")", flush=True)
                print("是否重试?(Y/N)", flush=True)
                inRes = input()
                if 'Y' == inRes:
                    continue
                else:
                    print("文件保存失败", flush=True)
                    break
        
        return True
    
    # 数据分组 相同值合并
    def GetGroupDataByTabName(self, listData: List[Dict[str, str]], colName: str = "", splitChar: str = ""):
        '''
        参数:
            listData: 字典列表(注: 列表的key会加入表头)
            colName: 字典索引
            strRefDefRefTxt: 切割字符(默认不切割)
        返回
            dictData: 列表字典
        '''
        dictData = {}
        for dataItem in listData:
            if 0 == len(dataItem) or colName not in dataItem:
                continue
            
            colNameItems = [colName]
            if "" != splitChar:
                colNameItems = dataItem[colName].split(splitChar)
            
            for colNameItem in colNameItems:
                if colNameItem not in dictData:
                    dictData[colNameItem] = []
            
                dictData[colNameItem].append(dataItem)
        
        return dictData
    
    # 数据过滤
    # 注：Condition可以设置一行判断逻辑语句，最终会当作python语句执行，值变量为%val% (字符串类型)
    # 例：查找listData中Name包含‘_’的项 =》 FilterDataByTabName(listData, "Name", "'_' in %val%")
    def FilterDataByTabName(self, listData: List[Dict[str, str]], colName: str = "", Condition: str="True"):
        '''
        参数:
            listData: 字典列表(注: 列表的key会加入表头)
            colName: 字典索引
            Condition: 过滤条件 注:值变量使用%val%参数
        返回
            listFilterData: 字典列表
        '''
        listFilterData = []
        
        for dataItem in listData:
            if 0 == len(dataItem) or colName not in dataItem:
                continue
            
            itemVal = "'" + dataItem[colName] +"'"
            ConditionTmp = Condition
            ConditionTmp = ConditionTmp.replace("%val%", itemVal)
            try:
                if True == eval(ConditionTmp):
                    listFilterData.append(dataItem)
            except:
                pass
        
        return listFilterData
# class Table

# 文件目录操作
class MyFile:
    def __init__(self):
        pass
    
    # 判断文件类型是否满足
    def IsTrueFileType(self, strFile: str, fileType: set={"*.*"}, isStrictCase: bool = False):
        '''
        参数:
            strFile: 文件路径
            fileType: 文件类型集合 注:支持通配符
            isStrictCase: 是否严格匹配大小写
        返回
            True : False
        '''
        
        if False == os.path.isfile(strFile):
            return False
        
        strFile = strFile.replace("/", "\\")
        strFileName = strFile[strFile.rfind("\\")+1:]
        
        funcMatch = None
        if True == isStrictCase:
            funcMatch = fnmatchcase
        else:
            funcMatch = fnmatch
        
        for typeItem in fileType:
            if True == funcMatch(strFileName, typeItem):
                return True
        return False
    
    # 遍历目录获取文件列表 
    def GetFileList(self, strRootDir: str, fileType: set={"*.*"}, isFindChildDir: bool = True, isStrictCase: bool = False):
        '''
        参数:
            strRootDir: 目录路径
            fileType: 文件类型集合 注:支持通配符
            isFindChildDir: 是否查找子目录
            isStrictCase: 是否严格匹配大小写
        返回
            fileList: 文件列表
        '''
        
        fileList = []
        strRootDir = strRootDir.replace("/", "\\")
        if True == os.path.isfile(strRootDir):
            if True == self.IsTrueFileType(strRootDir, fileType, isStrictCase):
                fileList.append(strRootDir)
                return fileList
    
        if os.path.isdir(strRootDir):
            for item in os.listdir(strRootDir):
                file = os.path.join(strRootDir, item)
                
                if True == os.path.isdir(file) and False == isFindChildDir:
                    continue
                
                fileList.extend(self.GetFileList(file, fileType, isFindChildDir, isStrictCase))
        
        self.strDirPath = strRootDir
        self.fileList = fileList
        return fileList
    
    # 批量拷贝文件到指定目录
    def CopyFileList2Dir(self, fileList: List[str], strTargetDir: str):
        '''
        参数:
            fileList: 文件列表
            strTargetDir: 目标目录
        返回
            void
        '''
        strTargetDir = os.path.join(strTargetDir, ".\\")
        
        if False == os.path.isdir(strTargetDir):
            return
        
        for file in fileList:
            try:   
                shutil.copy(file, os.path.dirname(strTargetDir))
            except:
                print("copy fail: '%s' to '%s'"%(file, strTargetDir))
        pass

    def CopyFileList2Makedirs(self, fileList: List[str], strTargetDir: str, strRelativeDir: str = ""):
        '''
        参数:
            fileList: 文件列表
            strTargetDir: 目标目录
            strRelativeDir: 文件列表相对根目录
        返回
            void
        '''
        strTargetDir = os.path.join(strTargetDir, ".\\")
        strRelativeDir = os.path.join(strRelativeDir, ".\\")
        
        if False == os.path.isdir(strTargetDir) or False == os.path.isdir(strRelativeDir):
            return

        for file in fileList:
            relFile = os.path.relpath(file, strRelativeDir)
            newFile = os.path.join(strTargetDir, relFile)
            newDir = os.path.dirname(newFile)

            if False == os.path.isdir(newDir):
                os.makedirs(newDir)
            
            try:   
                shutil.copy(file, newDir)
            except:
                print("copy fail: '%s' to '%s'"%(file, newDir))
              
    # 删除目录及子目录和文件
    def RemoveDir(self, strDir: str):
        '''
        参数:
            strDir: 目标目录
        返回
            void
        '''
        if os.path.isfile(strDir):
            os.remove(strDir)
            return
        
        if os.path.isdir(strDir):
            for item in os.listdir(strDir):
                file = os.path.join(strDir, item)
                if os.path.isfile(file):
                    os.remove(file)
                    continue
                if 0 == len(os.listdir(file)):
                    os.rmdir(file)
                else:
                    self.RemoveDir(file)
            
            if 0 == len(os.listdir(strDir)):
                os.rmdir(strDir)
        pass      
    
    @staticmethod
    def Log(strLogContent: str="", strLogFile: str="log.log"):
        strLogContent = time.strftime("[%Y/%m/%d %H:%M:%S]\t", time.localtime()) + strLogContent + "\n"
        
        fileObj = open(strLogFile, "a", encoding="utf-8")
        fileObj.write(strLogContent)
        fileObj.close()
        pass   
# class MyFile:

class MyXML:
    def __init__(self) -> None:
        self.xmldoc = None
        pass

    def PrettyXML(self, strText = "", strIndent:str = "  ") -> str:
        isAddRoot = False
        strXMLHead = '<?xml version="1.0" ?>'
        strKeyTag = '<?xml'
        strRootTmp = "ROOT_TO_PRETTY"

        listLines = strText.split("\n")
        strTextTmp = ""
        for lineItem in listLines:
            strTextTmp += lineItem.strip()
        strText = strTextTmp

        strText = strText.replace("<?xml", "<!-- <?xml")
        strText = strText.replace("?>", "?> -->")

        if 0 != strText.find(strKeyTag):
            isAddRoot = True
            strText = "<" + strRootTmp + ">" + strText + "</" + strRootTmp + ">" 

        strText = strXMLHead + strText
        xmlText = xml.dom.minidom.parseString(strText)
        strText = xmlText.toprettyxml(indent=strIndent)

        strText = strText.replace("<" + strRootTmp + ">" , "")
        strText = strText.replace("</" + strRootTmp + ">" , "")
        strText = strText.replace("<" + strRootTmp + "/>" , "")
        strText = strText.replace(strXMLHead, "")

        if True == isAddRoot:
            strText = strText.replace("\n"+strIndent, "\n")

        strText = strText.replace("<!-- <?xml", "<?xml")
        strText = strText.replace("?> -->", "?>")
        strText = strText.strip()

        return strText

    def PrettyXMLFile(self, strFile = "") -> None:
        if False is os.path.isfile(strFile):
            return

        fileObj = open(strFile, "r", encoding="utf-8")
        strText = fileObj.read()
        fileObj.close()

        strText = self.PrettyXML(strText)

        fileObj = open(strFile, "w", encoding="utf-8")
        fileObj.write(strText)
        fileObj.close()

# 字符串处理
class MyStr:
    def __init__(self):
        pass
    
    # 按有效空白符切割，并返回列表
    def MySpiltByBlank(self, strTarget: str=""):
        '''
        参数:
            strTarget: 目标字符串
        返回
            listData: 切割后的列表数组
        '''
        strTmp = strTarget.strip()
        
        listData = []
        strTmp = strTmp.replace("\t", " ")
        while(" " in strTmp):
            iPos = strTmp.find(" ")
            strItem = strTmp[0:iPos]
            strTmp = strTmp[iPos:]
            strTmp = strTmp.strip()
            listData.append(strItem)
        listData.append(strTmp)
        
        return listData
    
    # 删除开头为strRemove的字符串
    def RemoveBeginStr(self, strTarget: str="", strRemove: str=""):
        if 0 == strTarget.find(strRemove):
            strTarget = strTarget[len(strRemove):]
        return strTarget

    # 删除结尾为strRemove的字符串
    def RemoveEndStr(self, strTarget: str="", strRemove: str=""):
        if strTarget.endswith(strRemove):
            strTarget = strTarget[0:0-len(strRemove):]
        return strTarget
    
    # 正则表达式提取指定开头和结尾包含的内容，返回列表
    def GetContentBetweenStartAndEnd(self, strTarget: str="", strStart: str="", strEnd: str="", isLine: bool=False):
        '''
        参数:
            strTarget: 目标字符串
            strStart : 起始字符串
            strEnd   : 结尾字符串
        返回
            listData: 匹配到的内容列表
        备注:
            使用正则表达式匹配
        '''
        listData = []
        if "" == strStart or "" == strEnd:
            listData.append(strTarget)
            return listData
        
        strTmp = "(?<={start})(.*?)(?={end})".format(start=strStart, end=strEnd)
        if False is isLine:
            strTmp = "(?<={start})((.|\r|\n)*?)(?={end})".format(start=strStart, end=strEnd)
        pattern  = re.compile(strTmp)
        listData = pattern.findall(strTarget)

        return listData
# class MyStr:

# ini文件
class MyConfigParser(configparser.ConfigParser):
    def __init__(self):
        super().__init__()

    def optionxform(self, optionstr):
        # 重写optionxform方法, 保留参数大小写
        return optionstr
    
    def LoadConfigIni(self, strFile:str = "", encoding: str = "utf-8")->dict[str, dict[str, str]]:
        '''
        参数:
            strFile  : 配置文件ini
            encoding : 配置文件编码(默认utf-8)
        返回
            dictdata : 配置文件字典
        备注:
        '''
        dictdata = {}
        
        if not os.path.exists(strFile):
            return dictdata
        
        self.read(strFile, encoding)
        for sectionItem in self.sections():
            dictSubdata = {}
            for itemset in self.items(sectionItem):
                itemKey = itemset[0]
                itemVal = itemset[1]
                dictSubdata[itemKey] = itemVal
            dictdata[sectionItem] = dictSubdata
        return dictdata
# class MyConfigParser(configparser.ConfigParser):