#!/home/wubin/miniconda3/bin/python
# -*- coding: utf-8 -*-
# https://blog.csdn.net/weixin_39274753/article/details/82221859 优先使用xml.etree.ElementTree模块
# https://blog.csdn.net/weixin_36279318/article/details/79176475 这个讲得也不错
# https://blog.csdn.net/yiluochenwu/article/details/23515923 使用xml.etree.ElementTree模块
import xml.etree.ElementTree as ET
# Globally unique identifier; incremented once for every row that walkData emits.
unique_id = 1


def walkData(root_node, level, tag_string, value_string, result_list):
    """Walk an element tree depth-first, appending one row per element.

    Every visited element appends
    ``[unique_id, level, tag, tag_path, value_path, attrib]`` to
    ``result_list``. ``tag_path`` and ``value_path`` are the '/'-joined tags
    and values of the element's ancestors followed by the element itself.

    ``Availability`` elements are skipped together with their whole subtree
    and do not consume an identifier.

    Args:
        root_node: the ``xml.etree.ElementTree.Element`` to start from.
        level: depth of ``root_node`` (the caller decides the base value).
        tag_string: '/'-joined tag path of the ancestors ('' at the top).
        value_string: '/'-joined value path of the ancestors ('' at the top).
        result_list: list that rows are appended to, in document order.

    Raises:
        KeyError: if a ``DisorderList``/``PrevalenceList`` element has no
            ``count`` attribute, or a ``Disorder``/``DisorderType``/
            ``DisorderGroup``/``Prevalence``/``PrevalenceType`` element has
            no ``id`` attribute.
    """
    global unique_id

    tag = root_node.tag
    if tag == 'Availability':
        # The content of this tag is not wanted.
        return

    tag_string = tag_string + '/' + tag

    if tag == 'JDBOR':
        value = 'JDBOR'
    elif tag in ('DisorderList', 'PrevalenceList'):
        value = root_node.attrib['count']
    elif tag in ('OrphaCode', 'ExpertLink', 'Name', 'Source', 'ValMoy'):
        # These carry their payload as text even when attributes are present.
        value = root_node.text
    elif tag in ('Disorder', 'DisorderType', 'DisorderGroup',
                 'Prevalence', 'PrevalenceType'):
        value = root_node.attrib['id']
    elif 'id' in root_node.attrib:
        value = root_node.attrib['id']
    else:
        value = root_node.text

    if not value:
        # Missing or empty text/attribute.
        value = 'NA'
    else:
        # Keep the value as text: on Python 3, str(value.encode('utf8'))
        # would produce the literal "b'...'" representation instead.
        value = value.strip('\n')

    value_string = value_string + '/' + str(value)
    result_list.append(
        [unique_id, level, tag, tag_string, value_string, root_node.attrib]
    )
    unique_id += 1

    # Iterating an Element yields only its direct children (unlike iter(),
    # which descends the whole subtree). Element.getchildren() was removed
    # in Python 3.9.
    for child in root_node:
        walkData(child, level + 1, tag_string, value_string, result_list)
def getXmlData(file_name):
    """Parse the XML file ``file_name`` and return one row per element.

    The rows are produced by ``walkData`` starting from the document root,
    in document order. Each row has the form
    ``[unique_id, level, tag, tag_path, value_path, attrib]``, for example::

        [
            [1, 1, 'JDBOR', '/JDBOR', '/JDBOR', {...}],
            [2, 2, 'DisorderList', '/JDBOR/DisorderList', '/JDBOR/1', {...}],
        ]
    """
    rows = []
    # Same root as ET.ElementTree(file=file_name).getroot().
    document_root = ET.parse(file_name).getroot()
    # Node depth starts at 1; the tag and value paths start out empty.
    walkData(document_root, 1, '', '', rows)
    return rows
# file_name = 'test3.xml'
# root = ET.parse(file_name).getroot()
# print(root.tag)
# print(help(root))
# print('======{}========'.format(root.attrib['date']))
# tree = ET.ElementTree(file_name)
# root2 = tree.getroot()
# print('======{}========'.format(root.tag))
#
from optparse import OptionParser

# Command-line interface: -i/--in_file is the XML file to read and
# -o/--out_file is the tab-separated file to write.
# NOTE: the arguments are parsed at module level so that ``parser``,
# ``options``, ``args``, ``in_file`` and ``out_file`` remain available as
# module attributes.
parser = OptionParser()
parser.add_option("-i", "--in_file", action="store", dest="in_file", help="input file")
parser.add_option("-o", "--out_file", action="store", dest="out_file", help="output file")
(options, args) = parser.parse_args()
in_file = options.in_file
out_file = options.out_file

if __name__ == '__main__':
    # optparse has no notion of required options, so check explicitly instead
    # of failing later with an obscure error on a None file name.
    if not in_file or not out_file:
        parser.error('both -i/--in_file and -o/--out_file are required')

    rows = getXmlData(in_file)
    # The context manager closes the file even if a write fails; the explicit
    # encoding keeps non-ASCII values independent of the locale.
    with open(out_file, 'w', encoding='utf-8') as out_handle:
        for row in rows:
            # The last field is the raw attribute dict; it is not written out.
            fields = [str(field) for field in row[:-1]]
            out_handle.write('\t'.join(fields) + '\n')