1.解析XML文件
使用Python单纯解析XML的最基本的方法就是直接使用Python自带的minidom库。
比如有如下XML文件:
<?xml version="1.0"?>
<root>
<singlenode>This is a single node</singlenode>
<multinode>This is first one</multinode>
<multinode>This is second one</multinode>
<multinode>This is third one</multinode>
<complexNode name="ComplexNode">
<childnode>This is a child node</childnode>
</complexNode>
</root>
使用如下代码获取里面每一项内容:
from xml.dom import minidom
def print_xml_contents(xml_source):
    """Parse an XML document with minidom and print selected contents.

    Prints the text of the first <singlenode>, the text of every
    <multinode>, the "name" attribute of the first <complexNode> and the
    text of its first <childnode>.

    Args:
        xml_source: A file name or an open file object, passed straight to
            ``minidom.parse``.

    Raises:
        Whatever ``minidom.parse`` raises for an unreadable or malformed
        document; IndexError when an expected element is missing;
        AttributeError when an expected element has no child node.
    """
    xmldoc = minidom.parse(xml_source)

    singleNode = xmldoc.getElementsByTagName("singlenode")[0]
    print("Data of singleNode: %s" % (singleNode.firstChild.data))

    for multiNode in xmldoc.getElementsByTagName("multinode"):
        print("Data of multiNode: %s" % (multiNode.firstChild.data))

    complexNode = xmldoc.getElementsByTagName("complexNode")[0]
    print("The name attribute of complexNode: %s" % (complexNode.getAttribute("name")))

    # Search below complexNode only, not the whole document.
    childNode = complexNode.getElementsByTagName("childnode")[0]
    print("Data of childNode: %s" % (childNode.firstChild.data))


if __name__ == '__main__':
    try:
        print_xml_contents("test.xml")
    except Exception:
        # Top-level boundary: report any parse/lookup failure, but let
        # KeyboardInterrupt and SystemExit propagate (a bare ``except:``
        # would swallow those too).
        print("Error: Parse XML file failed.")
输出结果如下:
Data of singleNode: This is a single node
Data of multiNode: This is first one
Data of multiNode: This is second one
Data of multiNode: This is third one
The name attribute of complexNode: ComplexNode
Data of childNode: This is a child node
2.生成与保存XML文件
同样可以直接使用Python自带的库来生成和保存XML文件。比如要生成上例使用的XML文件:
from xml.dom import minidom
import codecs
def build_document():
    """Build the sample XML document in memory and return it.

    The document has a <root> element containing one <singlenode>, three
    <multinode> elements and a <complexNode name="ComplexNode"> that holds
    a single <childnode>.

    Returns:
        The newly created minidom Document.
    """
    impl = minidom.getDOMImplementation()
    newdoc = impl.createDocument(None, "root", None)
    rootNode = newdoc.documentElement

    singleNode = newdoc.createElement("singlenode")
    singleNode.appendChild(newdoc.createTextNode("This is a single node"))
    rootNode.appendChild(singleNode)

    multipleNodeTexts = ["This is first one", "This is second one", "This is third one"]
    for multipleNodeText in multipleNodeTexts:
        multipleNode = newdoc.createElement("multinode")
        multipleNode.appendChild(newdoc.createTextNode(multipleNodeText))
        rootNode.appendChild(multipleNode)

    complexNode = newdoc.createElement("complexNode")
    complexNode.setAttribute("name", "ComplexNode")
    rootNode.appendChild(complexNode)

    childNode = newdoc.createElement("childnode")
    childNode.appendChild(newdoc.createTextNode("This is a child node"))
    complexNode.appendChild(childNode)

    return newdoc


def write_document(doc, binary_stream):
    """Serialize *doc* as UTF-8 XML onto an already-open binary stream.

    Args:
        doc: The minidom Document to serialize.
        binary_stream: A writable binary file object; it is not closed here.
    """
    # writexml() emits text, so wrap the binary stream in a UTF-8 encoder.
    # The encoding argument only sets the encoding named in the XML
    # declaration; the actual encoding is done by the writer.
    writer = codecs.getwriter('utf-8')(binary_stream)
    doc.writexml(writer, encoding='utf-8')


if __name__ == '__main__':
    # generate new XMLDoc
    newdoc = build_document()
    # Save this XML to file; the with-block closes the file even when
    # serialization fails part-way.
    with open("result.xml", 'wb') as result_file:
        write_document(newdoc, result_file)
3.运用XSL
运用XSL和后面要介绍的通过XMLSchema来进行校验都需要使用第三方库。这里介绍的是一个比较流行的开源库libxml2(XSL转换还需要与之配套的libxslt),它们都是C库,我们使用的是它们的Python binding(当然也可以通过Python直接调用C接口)。
使用如下代码可以使用test.xsl为test.xml文件作转换,并将结果保存到test.html文件里面:
import libxml2
import libxslt
def main():
    """Transform test.xml with the stylesheet test.xsl and save test.html."""
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile("test.xsl"))
    source_doc = libxml2.parseFile("test.xml")

    # No stylesheet parameters are passed (None).
    transformed = stylesheet.applyStylesheet(source_doc, None)
    stylesheet.saveResultToFilename("test.html", transformed, 0)

    # The stylesheet and both documents are released explicitly through
    # the bindings' free methods.
    stylesheet.freeStylesheet()
    source_doc.freeDoc()
    transformed.freeDoc()


if __name__ == '__main__':
    main()
4. 通过XMLSchema来对XML文件进行校验
如下代码使用test.xsd来对test.xml文件进行校验:
import sys
import libxml2
def main():
    """Validate test.xml against the schema test.xsd and print the verdict.

    Returns:
        The process exit status: 1 when the XML file or the schema could
        not be loaded, 0 once validation has run (whether or not the
        document turned out to be valid).
    """
    # Parse xml file
    # NOTE(review): XML_PARSE_HUGE presumably relaxes the parser's built-in
    # size limits - confirm this is wanted before feeding untrusted input.
    try:
        doc = libxml2.readFile("test.xml", None, libxml2.XML_PARSE_HUGE)
    except Exception:
        # Not a bare ``except:`` so that Ctrl-C is not reported as a bad file.
        print("The xml file is invalid")
        return 1

    try:
        # Parse schema
        try:
            ctxt = libxml2.schemaNewParserCtxt("test.xsd")
            schema = ctxt.schemaParse()
            del ctxt
        except Exception:
            print("Error: Parse schema failed")
            return 1

        validationCtxt = schema.schemaNewValidCtxt()
        instance_Err = validationCtxt.schemaValidateDoc(doc)
        # Drop the validation context and schema before the document is freed.
        del validationCtxt
        del schema
    finally:
        # Runs on every path out of the block above, including the early
        # return for a schema that failed to parse.
        doc.freeDoc()

    if instance_Err != 0:
        print("The XML file is not valid")
    else:
        print("The XML file is valid")
    return 0


if __name__ == '__main__':
    # A non-zero status now signals that the script aborted before it
    # could validate anything (previously sys.exit() reported success).
    sys.exit(main())
附录:
Libxml2的python binding
http://www.xmlsoft.org/python.html