前言
XML处理是日常工作中经常遇到的任务(例如读写配置文件),下面介绍使用Python快速处理XML的方法。
Python处理XML既可以使用标准库自带的模块,也可以使用第三方库。如果只是进行简单的读写,推荐使用ElementTree和minidom:前者是Python特有的XML处理方式,后者是DOM的一个简单实现。
其他的python中xml处理的库还包括,
xml.dom, DOM API;
xml.dom.pulldom, 支持建立部分dom tree;
xml.sax, 需要自己提供接口函数功能;
xml.parsers.expat, 快速的XML解析,但对恶意构造的数据不安全,不要用它解析不可信的输入。
1. ElementTree
1.1 ElementTree 写XML文件
from xml.etree import ElementTree
def xml_writer():
    """Build the sample country document and write it to ``samplexml.xml``.

    The document has a ``<Data>`` root holding one ``<country name="...">``
    element per entry, each with ``<rank>``, ``<year>`` and ``<gdppc>``
    children in that order. The file is written to the current working
    directory and overwrites any existing file of the same name.
    """
    # (name, rank, year, gdppc) for each <country>, in document order.
    countries = [
        ("Liechtenstein", "1", "2011", "141100"),
        ("Singapore", "4", "2011", "59900"),
        ("Panama", "68", "2011", "13600"),
    ]

    root = ElementTree.Element("Data")
    for name, rank, year, gdppc in countries:
        country = ElementTree.SubElement(root, "country")
        country.set("name", name)
        # Element text must be a string, so the numbers are kept as text.
        for tag, text in (("rank", rank), ("year", year), ("gdppc", gdppc)):
            ElementTree.SubElement(country, tag).text = text

    ElementTree.ElementTree(root).write("samplexml.xml")
xml_writer()
|
生成结果
1.2 ElementTree顺序依次读取XML文件
def xml_reader():
    """Print the tag hierarchy of ``samplexml.xml`` in document order.

    Prints the root tag, then each child tag, then each grandchild as
    ``tag:text``. The file is read from the current working directory.
    """
    tree = ElementTree.parse("samplexml.xml")
    root = tree.getroot()
    print(root.tag)
    for child in root:
        print(" " + child.tag)
        for gran_child in child:
            # .text is None for an empty element such as <year/>; print an
            # empty value instead of failing on str + None.
            print(" " + gran_child.tag + ":" + (gran_child.text or ""))
xml_reader()
|
结果输出
Data
country
rank:1
year:2011
gdppc:141100
country
rank:4
year:2011
gdppc:59900
country
rank:68
year:2011
gdppc:13600
|
1.3 ElementTree查找读XML文件
def xml_finder():
    """Print the text of every ``<gdppc>`` element in ``samplexml.xml``.

    ``Element.iter`` walks the whole tree, so matches at any depth are
    printed, one per line, in document order.
    """
    tree = ElementTree.parse("samplexml.xml")
    root = tree.getroot()
    for item in root.iter("gdppc"):
        # print() with a single argument behaves the same on Python 2 and 3.
        print(item.text)
xml_finder()
|
输出结果
141100
59900
13600
|
1.4 ElementTree修改XML文件
def xml_modifier():
    """Add 99 to every ``<gdppc>`` value in ``samplexml.xml`` and save it.

    The file is rewritten in place. ``<gdppc>`` elements with missing or
    blank text are left untouched; text that is present but not an integer
    raises ``ValueError`` from ``int()``.
    """
    tree = ElementTree.parse("samplexml.xml")
    root = tree.getroot()
    for item in root.iter("gdppc"):
        text = item.text
        if text is None or not text.strip():
            # Nothing to increment on an empty <gdppc/>.
            continue
        item.text = str(int(text) + 99)
    # Write once, after every element has been updated.
    tree.write("samplexml.xml")
xml_modifier()
xml_finder()
|
输出结果
141199
59999
13699
|
1.5 ElementTree删除某个节点
def xml_deleter():
    """Remove every ``<country>`` whose rank is above 50 from ``samplexml.xml``.

    The file is rewritten in place. Countries with no ``<rank>`` child, or
    with a blank one, are kept; a non-integer rank raises ``ValueError``
    from ``int()``.
    """
    tree = ElementTree.parse("samplexml.xml")
    root = tree.getroot()
    # findall() returns a list, so removing children from root while
    # looping over it is safe.
    for country in root.findall("country"):
        rank_text = country.findtext("rank")
        if rank_text is None or not rank_text.strip():
            # No rank to compare against; leave this country alone.
            continue
        if int(rank_text) > 50:
            root.remove(country)
    tree.write("samplexml.xml")
# Demo driver: delete the high-rank countries, then show what is left.
# print() with a single argument behaves the same on Python 2 and 3.
print("calling xml_deleter...")
xml_deleter()
print("after delete, the file content will be:")
xml_reader()
|
输出结果
Data
country
rank:1
year:2011
gdppc:141100
country
rank:4
year:2011
gdppc:59900
country
rank:68
year:2011
gdppc:13600
calling xml_deleter...
after delete, the file content will be:
Data
country
rank:1
year:2011
gdppc:141100
country
rank:4
year:2011
gdppc:59900
|
1.6 ElementTree从字符串load数据
# Parse an XML document held in a string instead of reading it from a file.
xmlstr = '''<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
</data>
'''
# fromstring() returns the root Element directly.
root = ElementTree.fromstring(xmlstr)
for child in root:
    # print() with a single argument behaves the same on Python 2 and 3.
    print(child.tag)
|
1.7 ElementTree通过XPath读取XML
使用前需要先了解XPath的语法规则,下面是几个简单的例子:
def xml_xpath():
    """Demonstrate ElementTree's XPath support on an in-memory document.

    Prints the source XML, then for each sample expression prints
    ``finding <expression>`` followed by the tag of every element that
    ``root.findall`` returns for it.
    """
    xmlstr = '''<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
</data>
'''
    print("xmlstr is :")
    print(xmlstr)
    root = ElementTree.fromstring(xmlstr)

    queries = [
        # The context node itself, i.e. the root element.
        ".",
        # All 'neighbor' grand-children of 'country' children of the root.
        "./country/neighbor",
        # Nodes with name='Singapore' that have a 'year' child.
        ".//year/..[@name='Singapore']",
        # 'year' nodes that are children of nodes with name='Singapore'.
        ".//*[@name='Singapore']/year",
        # 'neighbor' nodes that are the second 'neighbor' child of their
        # parent (the position counts only siblings with that tag).
        ".//neighbor[2]",
    ]
    for xpath in queries:
        # print() with a single argument behaves the same on Python 2 and 3.
        print("finding " + xpath)
        for item in root.findall(xpath):
            print(item.tag)
xml_xpath()
|
结果输出
xmlstr is :
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
</data>
finding .
data
finding ./country/neighbor
neighbor
neighbor
neighbor
finding .//year/..[@name='Singapore']
country
finding .//*[@name='Singapore']/year
year
finding .//neighbor[2]
neighbor
|
2. Minidom
Why Minidom instead of DOM?
xml.dom.minidom is a minimal implementation of the Document Object
Model interface, with an API similar to that in other languages. It is intended
to be simpler than the full DOM and also significantly smaller. Users who are
not already proficient with the DOM should consider using the
xml.etree.ElementTree module for their XML processing instead.
大意是:minidom是DOM的一个精简实现,使用者需要熟悉DOM规范;不熟悉DOM的用户建议改用xml.etree.ElementTree。
|
2.1 minidom写文件
def xml_minidom_writer():
    """Build a small country list with minidom and write it to ``sample.xml``.

    The document has a ``<countrylist>`` root holding one
    ``<country name="...">`` per entry, each with a
    ``<rank updated="yes">`` and a ``<year>`` child. The file is written to
    the current working directory and overwrites any existing file.
    """
    # Imported here because the file's top-level import only brings in
    # xml.etree.ElementTree, which leaves the name ``xml`` unbound.
    import xml.dom.minidom

    # (name, rank, year) for each <country>, in document order.
    countries = [
        ("Liechtenstein", "2", "2008"),
        ("Singapore", "5", "2011"),
    ]

    impl = xml.dom.minidom.getDOMImplementation()
    dom = impl.createDocument(None, "countrylist", None)
    root = dom.documentElement

    for name, rank_text, year_text in countries:
        country = dom.createElement('country')
        country.setAttribute('name', name)
        root.appendChild(country)

        rank = dom.createElement('rank')
        # In DOM, element text is a separate text node child.
        rank.appendChild(dom.createTextNode(rank_text))
        rank.setAttribute('updated', 'yes')
        country.appendChild(rank)

        year = dom.createElement('year')
        year.appendChild(dom.createTextNode(year_text))
        country.appendChild(year)

    # The with-block closes the file even if writexml() raises.
    with open('sample.xml', 'w') as f:
        dom.writexml(f, indent="", addindent=' ', newl='\n')
xml_minidom_writer()
|
运行结果
<?xml version="1.0" ?>
<countrylist>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<year>2008</year>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<year>2011</year>
</country>
</countrylist>
|
2.2 minidom读文件
def xml_minidom_reader():
    """Print the countries in ``sample.xml`` and the rank of each one.

    First prints every ``<country>`` element as XML, then for each country
    prints its node name followed by ``rank:<value>`` for its first
    ``<rank>`` element. A country with no ``<rank>`` prints only its node
    name; an empty ``<rank/>`` prints an empty value.
    """
    # Imported here because the file's top-level import only brings in
    # xml.etree.ElementTree, which leaves the name ``xml`` unbound.
    import xml.dom.minidom

    doc = xml.dom.minidom.parse("sample.xml")
    root = doc.documentElement
    country_list = root.getElementsByTagName('country')

    print('list country in xml file:')
    print('\n')
    for country in country_list:
        print(country.toxml())
    print('\n')

    print('list rank in each country')
    print('\n')
    for country in country_list:
        print(country.nodeName)
        ranks = country.getElementsByTagName('rank')
        if not ranks:
            # No <rank> under this country; nothing more to report.
            continue
        rank_node = ranks[0]
        # The rank value is the first child text node; firstChild is None
        # for an empty <rank/>.
        first = rank_node.firstChild
        value = (first.nodeValue or '') if first is not None else ''
        print(rank_node.nodeName + ':' + value)
xml_minidom_reader()
|
输出结果
list country in xml file:
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<year>2008</year>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<year>2011</year>
</country>
list rank in each country
country
rank:2
country
rank:5
|
Wish you happy coding with Python :)