[Python] 提取若干XML文件相同的元素生成csv或xlsx表格

最新推荐文章于 2024-07-23 17:30:44 发布

小公鸡卡哇伊呀~

最新推荐文章于 2024-07-23 17:30:44 发布

阅读量3.2k

点赞数 2

分类专栏： Python

本文链接：https://blog.csdn.net/ftell/article/details/80554089

版权

Python 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

有若干 XML 文件，它们是 Android 的各语种翻译文件。通过提取其中的翻译并汇总成一张表格，可以看出哪些语言的翻译有缺失。这是文件列表：

这是其中一个文件的内容，只是部分内容，文件很长。
文件内容

这是生成csv文件的python脚本:

import os
import csv
import xml.etree.ElementTree as ET

# Union of the string names found in every file parsed so far:
# parseOneFile() merges each file's names into it, and processFiles() reads it
# once all files have been parsed.
title = set()

def getKeyList(file_name=r'.\values_new\string_ar.xml'):
    """Return the ``name`` attribute of every ``<string>`` element, in document order.

    Args:
        file_name: Path of the XML file to read. Defaults to the
            ``string_ar.xml`` file this script has always used as its
            reference, so existing ``getKeyList()`` calls behave as before.

    Returns:
        A list with one entry per ``<string>`` element; an entry is ``None``
        when the element has no ``name`` attribute.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(file_name).getroot()
    return [element.get('name') for element in root.iter('string')]

# Gets a dictionary for each single file
def parseOneFile(file_name):
    """Parse one resource XML file into a ``{string name: text}`` dictionary.

    Besides one entry per ``<string>`` element, the result carries the
    reserved key ``'file_name_'`` holding ``file_name``. Every string name
    found is also merged into the module-level ``title`` set, which
    accumulates the union of names across all parsed files.

    Args:
        file_name: Path of the XML file to parse.

    Returns:
        The dictionary described above. Text that is ``None`` (an element
        without text) is stored as ``"#None#"`` and an empty string as
        ``"#Empty#"`` so that both stay visible in the generated table.
    """
    global title
    root = ET.parse(file_name).getroot()
    file_dict = {'file_name_': file_name}
    key_set = set()
    for element in root.iter('string'):
        key = element.get('name')
        value = element.text
        if value is None:
            value = "#None#"
        elif value == "":
            value = "#Empty#"
        file_dict[key] = value
        key_set.add(key)
    # Rebind rather than mutate in place, as the original code did.
    title = title | key_set
    return file_dict

def processFiles():
    """Build ``translation_reverse.csv`` from the XML files in ``values_new``.

    The table has one column per XML file and one row per string name. The
    first row is the header (``'N/A'`` followed by the file names); every
    other row starts with the string name, followed by that string's text in
    each file, or ``"## N/A ##"`` where the file does not define it.

    Rows follow the order of ``getKeyList()``, then any names that only occur
    in other files. Directory entries and those extra names are sorted so
    that repeated runs produce the same table.
    """
    rootdir = r".\values_new"  # Folder holding the string_xx.xml files

    head = ['N/A']  # Header row: corner cell, then one file name per column
    dictionary_list = []  # One parsed dictionary per column, same order as head[1:]
    for entry in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, entry)
        # Skip sub-directories *before* touching the header; otherwise the
        # header gains a column with no parsed dictionary behind it and the
        # columns after it are misaligned (or indexing runs off the list).
        if not os.path.isfile(path):
            continue
        head.append(entry)
        dictionary_list.append(parseOneFile(path))

    # Parse the reference file once; its keys lead, the remaining ones follow.
    reference_keys = getKeyList()
    other_item = title - set(reference_keys)  # Usually a small set
    all_keys = reference_keys + sorted(other_item, key=str)

    csv_lines = [head]
    for key in all_keys:
        print("log read key: ", key)
        csv_row = [key]
        for file_dict in dictionary_list:
            csv_row.append(file_dict.get(key, "## N/A ##"))
        csv_lines.append(csv_row)

    # Open the output only once all rows exist, so a parsing error does not
    # leave a truncated file behind, and let ``with`` close it in every case.
    with open("translation_reverse.csv", 'w', newline='', encoding='utf-8') as excel:
        csv.writer(excel).writerows(csv_lines)

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    processFiles()
    print("************* All Done! *************")

然后改成生成 XLSX。XLSX 格式的优点是列表每一项的宽度会自动调整，而 csv 的列宽都一样。为了生成 xlsx，必须先安装第三方库 openpyxl。
使用 pip 安装 openpyxl：pip install openpyxl

import os
import csv
import xml.etree.ElementTree as ET
from openpyxl import Workbook

# Union of the string names found in every file parsed so far:
# parseOneFile() merges each file's names into it, and processFiles() reads it
# once all files have been parsed.
title = set()
def getKeyList(file_name=r'.\values_new\string_ar.xml'):
    """Return the ``name`` attribute of every ``<string>`` element, in document order.

    Args:
        file_name: Path of the XML file to read. Defaults to the
            ``string_ar.xml`` file this script has always used as its
            reference, so existing ``getKeyList()`` calls behave as before.

    Returns:
        A list with one entry per ``<string>`` element; an entry is ``None``
        when the element has no ``name`` attribute.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(file_name).getroot()
    return [element.get('name') for element in root.iter('string')]

# Gets a dictionary for each single file
def parseOneFile(file_name):
    """Parse one resource XML file into a ``{string name: text}`` dictionary.

    Besides one entry per ``<string>`` element, the result carries the
    reserved key ``'file_name_'`` holding ``file_name``. Every string name
    found is also merged into the module-level ``title`` set, which
    accumulates the union of names across all parsed files.

    Args:
        file_name: Path of the XML file to parse.

    Returns:
        The dictionary described above. Text that is ``None`` (an element
        without text) is stored as ``"#None#"`` and an empty string as
        ``"#Empty#"`` so that both stay visible in the generated table.
    """
    global title
    root = ET.parse(file_name).getroot()
    file_dict = {'file_name_': file_name}
    key_set = set()
    for element in root.iter('string'):
        key = element.get('name')
        value = element.text
        if value is None:
            value = "#None#"
        elif value == "":
            value = "#Empty#"
        file_dict[key] = value
        key_set.add(key)
    # Rebind rather than mutate in place, as the original code did.
    title = title | key_set
    return file_dict

def processFiles():
    """Build ``translation.xlsx`` from the XML files in ``values_new``.

    The sheet has one column per XML file and one row per string name. The
    first row is the header (``'N/A'`` followed by the file names); every
    other row starts with the string name, followed by that string's text in
    each file, or ``"## N/A ##"`` where the file does not define it.

    Rows follow the order of ``getKeyList()``, then any names that only occur
    in other files. Directory entries and those extra names are sorted so
    that repeated runs produce the same sheet.
    """
    rootdir = r".\values_new"  # Folder holding the string_xx.xml files

    head = ['N/A']  # Header row: corner cell, then one file name per column
    dictionary_list = []  # One parsed dictionary per column, same order as head[1:]
    # Walk the folder and parse every file in it.
    for entry in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, entry)
        # Skip sub-directories *before* touching the header; otherwise the
        # header gains a column with no parsed dictionary behind it and the
        # columns after it are misaligned (or indexing runs off the list).
        if not os.path.isfile(path):
            continue
        head.append(entry)
        dictionary_list.append(parseOneFile(path))

    # Parse the reference file once; its keys lead, the remaining ones follow.
    reference_keys = getKeyList()
    other_item = title - set(reference_keys)  # Usually a small set
    all_keys = reference_keys + sorted(other_item, key=str)

    wb = Workbook()
    ws = wb.active  # Rows are appended to the workbook's active worksheet
    ws.append(head)

    for key in all_keys:
        print("log read key: ", key)
        xls_row = [key]
        for file_dict in dictionary_list:
            xls_row.append(file_dict.get(key, "## N/A ##"))
        ws.append(xls_row)

    wb.save("translation.xlsx")

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    processFiles()
    print("************* All Done! *************")