[Python] 提取若干XML文件相同的元素生成csv或xlsx表格

有若干 XML 文件,它们是 Android 各语种的翻译文件(string_xx.xml)。把每个文件里的翻译提取出来并排放进一张表格,就能看出哪些语言的翻译有缺失。这是文件列表:
文件列表

这是其中一个文件的内容,只是部分内容,文件很长。
文件内容

这是生成csv文件的python脚本:

import os
import csv
import xml.etree.ElementTree as ET

# Union of every <string> name seen so far; parseOneFile() merges each
# parsed file's names into this module-level set.
title = set()

def getKeyList(file_name=os.path.join('.', 'values_new', 'string_ar.xml')):
    """Return the ``name`` attribute of every ``<string>`` element, in document order.

    Args:
        file_name: Path of the XML file to read. Defaults to
            ``string_ar.xml`` inside ``values_new`` under the current
            directory. The path is built with ``os.path.join`` so it also
            resolves on systems that do not use ``\\`` as separator.

    Returns:
        A list of names. An entry is ``None`` for a ``<string>`` element
        that has no ``name`` attribute.
    """
    root = ET.parse(file_name).getroot()
    return [element.get('name') for element in root.iter('string')]

# Gets a dictionary for each single file
def parseOneFile(file_name):
    """Parse one XML file into a ``{string name: text}`` dictionary.

    Side effect: every string name found in the file is merged into the
    module-level ``title`` set.

    Args:
        file_name: Path of the XML file to parse.

    Returns:
        A dict holding ``'file_name_'`` (the path that was passed in) plus
        one entry per ``<string>`` element. The text of an element is the
        concatenation of all text nested inside it. A completely empty
        element is stored as ``"#None#"``; an element whose content exists
        but contains no characters is stored as ``"#Empty#"``.
    """
    global title
    root = ET.parse(file_name).getroot()
    file_dict = {'file_name_': file_name}
    key_set = set()
    for element in root.iter('string'):
        key = element.get('name')
        # element.text stops at the first child element, so a value with
        # inline markup (<xliff:g>, <b>, ...) would be truncated or look
        # missing. itertext() walks the text of the whole subtree.
        value = "".join(element.itertext())
        if not value:
            if element.text is None and len(element) == 0:
                value = "#None#"
            else:
                value = "#Empty#"
        file_dict[key] = value
        key_set.add(key)
    title = title | key_set
    return file_dict

def processFiles():
    """Write ``translation_reverse.csv``: one row per string name, one column per file.

    Every regular file in the ``values_new`` folder is parsed with
    parseOneFile(). The header row is ``'N/A'`` followed by the file names;
    each following row starts with a string name and then holds that
    string's text in each file, or ``"## N/A ##"`` where the file does not
    define it. Rows follow the order returned by getKeyList(), followed by
    the names that only appear in other files.
    """
    rootdir = os.path.join(".", "values_new")  # folder holding the string_xx.xml files

    head = []             # header row: file names, one per column
    dictionary_list = []  # parseOneFile() results, kept parallel to ``head``
    # os.listdir() returns entries in arbitrary order; sort for stable columns.
    for entry in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, entry)
        # Only regular files get a column. Adding a header for a
        # sub-directory would leave ``head`` longer than
        # ``dictionary_list`` and shift every following column.
        if not os.path.isfile(path):
            continue
        head.append(entry)
        dictionary_list.append(parseOneFile(path))

    reference_keys = getKeyList()  # parse the reference file only once
    other_item = title - set(reference_keys)  # names missing from the reference file
    # Sets have no stable order; sort so the extra rows are reproducible.
    # key=str keeps a possible None name (a <string> without ``name``) sortable.
    all_keys = reference_keys + sorted(other_item, key=str)

    head.insert(0, 'N/A')  # corner cell above the string-name column
    csv_lines = [head]
    for key in all_keys:
        print("log read key: ", key)
        csv_row = [key]
        for file_dict in dictionary_list:
            csv_row.append(file_dict.get(key, "## N/A ##"))
        csv_lines.append(csv_row)

    # Open the output only after all parsing succeeded, and let ``with``
    # close it even if writing fails.
    with open("translation_reverse.csv", 'w', newline='', encoding='utf-8') as excel:
        csv.writer(excel).writerows(csv_lines)

# Script entry point: build the CSV table, then report completion.
if __name__ == "__main__":
    processFiles()
    print("************* All Done! *************")

然后改成生成 XLSX。XLSX 格式的优点是,表格每一列的宽度会自动调整,而 csv 的列宽都一样。为了生成 xlsx,必须先安装第三方库 openpyxl(pip install openpyxl)。
使用pip安装openpyxl

import os
import csv
import xml.etree.ElementTree as ET
from openpyxl import Workbook

# Union of every <string> name seen so far; parseOneFile() merges each
# parsed file's names into this module-level set.
title = set()
def getKeyList(file_name=os.path.join('.', 'values_new', 'string_ar.xml')):
    """Return the ``name`` attribute of every ``<string>`` element, in document order.

    Args:
        file_name: Path of the XML file to read. Defaults to
            ``string_ar.xml`` inside ``values_new`` under the current
            directory. The path is built with ``os.path.join`` so it also
            resolves on systems that do not use ``\\`` as separator.

    Returns:
        A list of names. An entry is ``None`` for a ``<string>`` element
        that has no ``name`` attribute.
    """
    root = ET.parse(file_name).getroot()
    return [element.get('name') for element in root.iter('string')]

# Gets a dictionary for each single file
def parseOneFile(file_name):
    """Parse one XML file into a ``{string name: text}`` dictionary.

    Side effect: every string name found in the file is merged into the
    module-level ``title`` set.

    Args:
        file_name: Path of the XML file to parse.

    Returns:
        A dict holding ``'file_name_'`` (the path that was passed in) plus
        one entry per ``<string>`` element. The text of an element is the
        concatenation of all text nested inside it. A completely empty
        element is stored as ``"#None#"``; an element whose content exists
        but contains no characters is stored as ``"#Empty#"``.
    """
    global title
    root = ET.parse(file_name).getroot()
    file_dict = {'file_name_': file_name}
    key_set = set()
    for element in root.iter('string'):
        key = element.get('name')
        # element.text stops at the first child element, so a value with
        # inline markup (<xliff:g>, <b>, ...) would be truncated or look
        # missing. itertext() walks the text of the whole subtree.
        value = "".join(element.itertext())
        if not value:
            if element.text is None and len(element) == 0:
                value = "#None#"
            else:
                value = "#Empty#"
        file_dict[key] = value
        key_set.add(key)
    title = title | key_set
    return file_dict

def processFiles():
    """Write ``translation.xlsx``: one row per string name, one column per file.

    Every regular file in the ``values_new`` folder is parsed with
    parseOneFile(). The header row is ``'N/A'`` followed by the file names;
    each following row starts with a string name and then holds that
    string's text in each file, or ``"## N/A ##"`` where the file does not
    define it. Rows follow the order returned by getKeyList(), followed by
    the names that only appear in other files.
    """
    rootdir = os.path.join(".", "values_new")  # folder holding the string_xx.xml files

    head = []             # header row: file names, one per column
    dictionary_list = []  # parseOneFile() results, kept parallel to ``head``
    # Walk the folder. os.listdir() returns entries in arbitrary order;
    # sort for stable columns.
    for entry in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, entry)
        # Only regular files get a column. Adding a header for a
        # sub-directory would leave ``head`` longer than
        # ``dictionary_list`` and shift every following column.
        if not os.path.isfile(path):
            continue
        head.append(entry)
        dictionary_list.append(parseOneFile(path))

    reference_keys = getKeyList()  # parse the reference file only once
    other_item = title - set(reference_keys)  # names missing from the reference file
    # Sets have no stable order; sort so the extra rows are reproducible.
    # key=str keeps a possible None name (a <string> without ``name``) sortable.
    all_keys = reference_keys + sorted(other_item, key=str)

    wb = Workbook()
    ws = wb.active  # grab the active worksheet

    head.insert(0, 'N/A')  # corner cell above the string-name column
    ws.append(head)
    for key in all_keys:
        print("log read key: ", key)
        xls_row = [key]
        for file_dict in dictionary_list:
            xls_row.append(file_dict.get(key, "## N/A ##"))
        ws.append(xls_row)
    wb.save("translation.xlsx")

# Script entry point: build the XLSX table, then report completion.
if __name__ == "__main__":
    processFiles()
    print("************* All Done! *************")

缺点是,没有检查文件元素是否有重复,理论上应该是没有重复的,先假定没有重复。

生成的表格大约是这个样子:
生成的表格

其中关键的两点是:遍历文件夹和提取xml文件元素,然后使用了一些集合运算,以得到完整的元素集合。

  • 2
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值