# We need to read data out of a large batch of big XML files. Excel can open
# them easily but is very slow, so this script uses Python with pandas,
# xml.dom.minidom and multiprocessing to read them quickly.
# xml2xlsx
import xml.dom.minidom
import pandas as pd
import os
from multiprocessing import Pool
# Read XML file and convert it to XLSX
# Use multiprocess
def xml2excel(filename, input_dir=None, save_dir='/home/pc/xadf',
              labels_per_block=512 * 128):
    """Convert one XML report into an XLSX file.

    Parameters
    ----------
    filename : str
        Name of the XML file, looked up inside ``input_dir``.
    input_dir : str, optional
        Directory containing the XML file. Defaults to the module-level
        ``output_path`` so existing callers keep working.
    save_dir : str, optional
        Directory the resulting ``.xlsx`` file is written to.
    labels_per_block : int, optional
        Number of consecutive ``z`` values that share one ``l`` (label) tag.
        The original code hard-coded 512*128; kept as the default.
    """
    print(filename + '...')
    if input_dir is None:
        # Backward-compatible fallback to the module-level global.
        input_dir = output_path
    xml_report = os.path.join(input_dir, filename)
    # splitext is robust against dots inside the file name,
    # unlike filename.split('.')[0].
    outputname = os.path.splitext(filename)[0]
    xls_report = os.path.join(save_dir, outputname + '.xlsx')

    # Read the XML file.
    dom = xml.dom.minidom.parse(xml_report)
    z_tag = dom.getElementsByTagName('z')
    n = len(z_tag)
    z = [tag.firstChild.data for tag in z_tag]
    # The first <s> tag holds the surface name; repeat it once per row.
    surface_name_tag = dom.getElementsByTagName('s')
    surface_name = [surface_name_tag[0].firstChild.data] * n
    # Each <l> tag labels a contiguous block of `labels_per_block` z-values.
    label_tag = dom.getElementsByTagName('l')
    label = [label_tag[i // labels_per_block].firstChild.data
             for i in range(n)]

    # Write the XLSX file. sheet_name must be passed by keyword:
    # the positional form was deprecated and removed in pandas 2.0.
    df = pd.DataFrame({'surface_name': surface_name, 'label': label, 'z': z})
    with pd.ExcelWriter(xls_report) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False, header=False)
# Directory containing the source XML files. Kept at module level (not inside
# the __main__ guard) so worker processes started with the `spawn` method can
# still resolve it when they import this module.
output_path = '/home/pc/akjhfkd'

if __name__ == '__main__':
    # The guard is required: without it, `spawn`-based platforms
    # (Windows, macOS default) re-execute the pool creation in every
    # worker process and the program fails.
    # Only pass .xml entries to the workers; os.listdir may also return
    # other files or directories, which would crash minidom.parse.
    filenames = [f for f in os.listdir(output_path)
                 if f.lower().endswith('.xml')]
    # Parallel processing; keep the pool small, otherwise the machine
    # can easily lock up. The context manager guarantees the pool is
    # torn down even if a worker raises. map() blocks until all files
    # are converted.
    with Pool(6) as pool:
        pool.map(xml2excel, filenames)