最近遇到需要使用脚本语言解析XML文档的问题,在shell和Python之间纠结半天,发现还是Python中现成的类库好用。
如下是需要解析的xml文档Guest.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<customer name="useX" id="111111111" mail="userX@cultraview.com">
<orderID id="0123456789">2015-05-22</orderID>
<platform id="0001">MSD638</platform>
<modules>
<module id="1000001">
<modulename>Tuner</modulename>
<type>23</type>
</module>
<module id="1000002">
<modulename>Ircode</modulename>
<type>22</type>
</module>
<module id="1000003">
<modulename>Logo</modulename>
<type>27</type>
</module>
<module id="1000004">
<modulename>Backlight</modulename>
<type>25</type>
</module>
<module id="1000005">
<modulename>Panel</modulename>
<type>20</type>
</module>
<module id="1000006">
<modulename>Board</modulename>
<type>23</type>
</module>
</modules>
</customer>
下面是解析使用的Python脚本parseGuestXML.py:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
Build image output_image_file from input_directory and properties_file.
Usage: build_image input_directory properties_file output_image_file
"""
from xml.dom import minidom
def get_attrvalue(node, attrname):
    """Return the value of attribute ``attrname`` on ``node``.

    An empty string is returned when ``node`` is falsy (for example
    ``None``).
    """
    if not node:
        return ''
    return node.getAttribute(attrname)
def get_nodevalue(node, index=0):
    """Return the ``nodeValue`` of the child of ``node`` at ``index``.

    An empty string is returned when ``node`` is falsy, and also when
    ``node`` has no child at ``index`` (for example an empty element such
    as ``<type/>``), so callers get one consistent "no value" result
    instead of an ``IndexError``.
    """
    if not node:
        return ''
    try:
        return node.childNodes[index].nodeValue
    except IndexError:
        # No child at that position: treat it like a missing node.
        return ''
def get_xmlnode(node, name):
    """Return the elements under ``node`` whose tag is ``name``.

    The lookup is delegated to ``getElementsByTagName``; an empty list is
    returned when ``node`` is falsy.
    """
    if not node:
        return []
    return node.getElementsByTagName(name)
def xml_to_string(filename='Guest.xml'):
    """Parse the XML file ``filename`` and return it serialized as UTF-8."""
    return minidom.parse(filename).toxml('UTF-8')
def get_xml_data(filename='Guest.xml'):
    """Print the customer summary of ``filename`` and return its modules.

    The root node's attributes, the ``orderID`` element and the
    ``platform`` element are printed to stdout. Every ``module`` element
    is then collected into a dict.

    Returns a list of dicts with the keys ``id`` (int), ``modulename`` and
    ``type`` (both encoded to UTF-8).
    """
    root = minidom.parse(filename).documentElement

    # Attributes carried by the root node itself.
    print('The customer name: %s' % (root.getAttribute("name")))
    print('The customer id: %d' % (int(root.getAttribute("id"))))
    print('The customer mail: %s' % (root.getAttribute("mail")))

    # The orderID element: its id attribute and its text content.
    order = root.getElementsByTagName("orderID")[0]
    print('The orderID of customer : %d' % (int(order.getAttribute("id"))))
    print('The time of orderID : %s' % (order.childNodes[0].data))

    # The platform element: its id attribute and its text content.
    platform = root.getElementsByTagName("platform")[0]
    print('The platform id:%d' % (int(platform.getAttribute("id"))))
    print('The platform des:%s' % (platform.childNodes[0].data))

    # One dict per module element found under the root.
    modules = []
    for element in get_xmlnode(root, 'module'):
        raw_id = get_attrvalue(element, 'id')
        name_nodes = get_xmlnode(element, 'modulename')
        type_nodes = get_xmlnode(element, 'type')
        name_text = get_nodevalue(name_nodes[0]).encode('utf-8', 'ignore')
        type_text = get_nodevalue(type_nodes[0]).encode('utf-8', 'ignore')
        modules.append({
            'id': int(raw_id),
            'modulename': name_text,
            'type': type_text,
        })
    return modules
def test_xmltostring():
    """Print the serialized form of the default XML file."""
    serialized = xml_to_string()
    print(serialized)
def test_laod_xml():
    """Print every module returned by ``get_xml_data`` in readable form."""
    for module in get_xml_data():
        print('-----------------------------------------------------')
        if not module:
            continue
        print('module id:%d\nModuleName:%s\nType:%s' % (
            int(module['id']), module['modulename'], module['type']))
        # NOTE(review): the source lost its indentation; the closing
        # separator is assumed to be printed once per module record.
        print('=====================================================\n')
if __name__ == "__main__":
    # Running the script prints the parsed module list. Call
    # test_xmltostring() here instead to dump the whole document.
    test_laod_xml()
编写完成后直接执行:python parseGuestXML.py即可完成xml里面各个tag的解析。
有几个注意的点:
(1)getElementsByTagName会返回当前节点下所有指定tag名的后代元素(不仅限于直接子节点); // root.getElementsByTagName("module")就是获取root下所有tag为module的节点
(2)getElementsByTagName返回的是节点列表;如果对应tag的节点只有一个且其下没有子元素,可以通过[0]来获取这个唯一的节点;
// node=root.getElementsByTagName("platform") ; node[0].getAttribute("id") ; node[0].childNodes[0].data
(3)存在多个子节点的就直接用for循环遍历。