xml内容如下:
<?xml version="1.0" encoding="utf8"?>
<dblp>
<article mdate="2002-01-03" key="中国">
<author>E. F. Codd</author>
<title>: A cloud data storage system for supporting both OLTP and OLAP.</title>
<journal>IBM Research Report, San Jose, California</journal>
<volume>RJ909</volume>
<month>August</month>
<year>1971</year>
</article>
<article mdate="2002-01-03" key="美国">
<author>E. F. Codd</author>
<title><i>Entropy</i> Best Paper Award 2013.</title>
<journal>IBM Research Report, San Jose, California</journal>
<volume>RJ909</volume>
<month>August</month>
<year>1971</year>
<cdrom>ibmTR/rj909.pdf</cdrom>
<ee>db/labs/ibm/RJ909.html</ee>
</article>
</dblp>
解析程序如下:
from lxml import etree
def getxml_content(path="xml.txt"):
    """Print the attributes and child elements of every record in an XML file.

    Args:
        path: File to parse. Defaults to "xml.txt", so calling the function
            with no arguments reads the same file as before.
    """
    tree = etree.parse(path)  # build the element tree from the file
    root = tree.getroot()  # top-level element of the document
    for record in root:  # each direct child of the root element
        # Approach 1: walk the attribute mapping when the names are not
        # known in advance.
        for key, value in record.attrib.items():
            print(key, ":", value)
        # Approach 2: look an attribute up by its known name.
        print("mdate:", record.get("mdate"))
        for child in record:
            # .tag is the element name. .text is only the text that precedes
            # the first nested element, so it is None when the content
            # starts with markup.
            print(child.tag, ':', child.text)
    print(root)
xml内容如下:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<template type="授权委托书" version="1.0">
<root>
<element allowTimes="" code="00" controlName="一般情况" dataFormat="" dataType="" displayName="一般情况" id="2" nodeSource="1" showType="ElementTable" sortNo="14242" sourceId="00">
<text>住院号:0000385632。病区名称:221W介入2。科室名称:介入与血管外科二。病床号:51。患者姓名:李XXX。性别:女。年龄:84岁。受委托人身份证号:XXXX</text>
</element>
</root>
</template>
def xml_parse_two(VISITSQNO, LOCALID, DOCNAME, FILENAME, XML_TWO):
    """Flatten the fourth-level elements of an XML document into row tuples.

    Args:
        VISITSQNO: Value copied unchanged into position 0 of every row.
        LOCALID: Value copied unchanged into position 1 of every row.
        DOCNAME: Value copied unchanged into position 2 of every row.
        FILENAME: Value copied unchanged into position 3 of every row.
        XML_TWO: The XML document, as ``str`` or as already-encoded ``bytes``.

    Returns:
        A list with one tuple per element located three levels below the
        document root. Each tuple holds the four leading values, then the
        value of every attribute whose name contains ``controlName`` or
        ``displayName`` (in attribute order), then the text of the element's
        first child when it has at least one child.
    """
    # lxml rejects str input that carries an encoding declaration, so always
    # hand the parser bytes; input that is already bytes is passed through.
    payload = XML_TWO.encode("utf-8") if isinstance(XML_TWO, str) else XML_TWO
    root = etree.XML(payload)  # first level: the document root element

    wanted = ("controlName", "displayName")
    rows = []
    for second in root:  # second level
        for third in second:  # third level
            for node in third:  # fourth level
                row = [VISITSQNO, LOCALID, DOCNAME, FILENAME]
                row.extend(
                    value
                    for name, value in node.attrib.items()
                    if any(marker in name for marker in wanted)
                )
                if len(node) > 0:
                    # Text content of the fourth-level element's first child.
                    row.append(node[0].text)
                rows.append(tuple(row))
    return rows