xml解析

最新推荐文章于 2024-07-18 14:43:20 发布

风语者666

最新推荐文章于 2024-07-18 14:43:20 发布

阅读量89

点赞数

分类专栏： python 生信

本文链接：https://blog.csdn.net/u014210048/article/details/113558352

版权

python 同时被 2 个专栏收录

32 篇文章 2 订阅

订阅专栏

生信

31 篇文章 3 订阅

订阅专栏

#!/home/wubin/miniconda3/bin/python
# -*- coding: utf-8 -*-
# https://blog.csdn.net/weixin_39274753/article/details/82221859 优先使用xml.etree.ElementTree模块
# https://blog.csdn.net/weixin_36279318/article/details/79176475 这个讲得也不错
# https://blog.csdn.net/yiluochenwu/article/details/23515923  使用xml.etree.ElementTree模块
import xml.etree.ElementTree as ET
# Globally unique row identifier. walkData stamps the current value on each
# row it emits and then increments it, so ids keep growing across calls.
unique_id = 1

# Tags whose whole subtree is left out of the result.
_SKIPPED_TAGS = frozenset(('Availability',))
# Tags whose value is read from the "count" attribute.
_COUNT_TAGS = frozenset(('DisorderList', 'PrevalenceList'))
# Tags whose value is the element text, even when an "id" attribute exists.
_TEXT_TAGS = frozenset(('OrphaCode', 'ExpertLink', 'Name', 'Source', 'ValMoy'))
# Tags whose value is read from the "id" attribute.
_ID_TAGS = frozenset((
    'Disorder', 'DisorderType', 'DisorderGroup', 'Prevalence', 'PrevalenceType',
))


def _node_value(node):
    """Return the display value of *node* as text, or 'NA' when it has none."""
    tag = node.tag
    attrib = node.attrib
    if tag == 'JDBOR':
        value = 'JDBOR'
    elif tag in _COUNT_TAGS:
        # .get(): a missing attribute becomes 'NA' instead of a KeyError.
        value = attrib.get('count')
    elif tag in _TEXT_TAGS:
        value = node.text
    elif tag in _ID_TAGS or 'id' in attrib:
        value = attrib.get('id')
    else:
        value = node.text

    if not value:
        # None (no text / missing attribute) or an empty string.
        return 'NA'
    # Element text and attributes are already str on Python 3; encoding them
    # to bytes and calling str() on the result would produce "b'...'".
    return value.strip('\n')


# Walk every node of the tree, depth first.
def walkData(root_node, level, tag_string, value_string, result_list):
    """Append one row per element under *root_node* to *result_list*.

    Each row is the list
    ``[unique_id, level, tag, tag_path, value_path, attrib]`` where
    ``tag_path`` and ``value_path`` are the '/'-joined tags and values of
    the element and all of its ancestors, and ``attrib`` is the element's
    own attribute dict (not a copy).

    Elements tagged 'Availability' are skipped together with their subtree.

    Args:
        root_node: the element to start from.
        level: depth of *root_node*; children are recorded at ``level + 1``.
        tag_string: tag path of the ancestors ('' for the top element).
        value_string: value path of the ancestors ('' for the top element).
        result_list: list that receives the rows; mutated in place.

    Returns:
        None.
    """
    global unique_id

    tag = root_node.tag
    if tag in _SKIPPED_TAGS:
        return

    tag_string = tag_string + '/' + tag
    value_string = value_string + '/' + _node_value(root_node)
    result_list.append(
        [unique_id, level, tag, tag_string, value_string, root_node.attrib]
    )
    unique_id += 1

    # Iterating an element yields only its direct children (unlike iter(),
    # which descends the whole subtree). Element.getchildren() was removed
    # in Python 3.9, so plain iteration is used instead.
    for child in root_node:
        walkData(child, level + 1, tag_string, value_string, result_list)


# Load an XML file and flatten it into a list of rows.
# out (one row per visited element, as built by walkData):
# [
#    # ID, Level, Tag, Tag path, Value path, Attr map
#    [1, 1, 'JDBOR', '/JDBOR', '/JDBOR', {...}],
#    [2, 2, 'DisorderList', '/JDBOR/DisorderList', '/JDBOR/<count>', {...}],
# ]
def getXmlData(file_name):
    """Parse *file_name* and return the rows collected by walkData.

    The walk starts at the document's root element with depth 1 and empty
    tag/value paths. Row ids come from the module-level counter used by
    walkData, so they are not reset between calls.
    """
    rows = []
    # ET.parse returns an ElementTree; getroot() gives its top element.
    document = ET.parse(file_name)
    walkData(document.getroot(), 1, '', '', rows)
    return rows

# file_name = 'test3.xml'
# root = ET.parse(file_name).getroot()
# print(root.tag)
# print(help(root))
# print('======{}========'.format(root.attrib['date']))
# tree = ET.ElementTree(file_name)
# root2 = tree.getroot()
# print('======{}========'.format(root.tag))
#

from optparse import OptionParser
# Command-line options: -i/--in_file is the XML input, -o/--out_file the
# tab-separated output. Parsing happens at module level, as soon as the
# file is executed or imported.
parser = OptionParser()
parser.add_option(
    "-i", "--in_file",
    dest="in_file",
    action="store",
    help="input file",
)
parser.add_option(
    "-o", "--out_file",
    dest="out_file",
    action="store",
    help="output file",
)
options, args = parser.parse_args()
# Either path is None when its option was not given on the command line.
in_file, out_file = options.in_file, options.out_file

if __name__ == '__main__':
    # Both paths are required; fail with the usage message instead of an
    # opaque error from ET.parse(None) / open(None).
    if not in_file or not out_file:
        parser.error("both -i/--in_file and -o/--out_file are required")

    rows = getXmlData(in_file)
    # Write UTF-8 explicitly so non-ASCII values do not depend on the
    # locale's default encoding; 'with' closes the file even on error.
    with open(out_file, 'w', encoding='utf-8') as out_handle:
        for row in rows:
            # The last field is the attribute dict, which is not written out.
            fields = [str(field) for field in row[:-1]]
            out_handle.write('\t'.join(fields) + '\n')