首先,sax解析最直观,当然,也可以容许xml文件出些错。
先给定一个xml文件book.xml,
<catalog>
  <book isbn="0-596-00128-2">
    <title>Python &amp; XML</title>
    <author>Jones, Drake</author>
  </book>
  <book isbn="0-596-00085-5">
    <title>Programming Python</title>
    <author>Lutz</author>
  </book>
  <book isbn="0-596-00281-5">
    <title>Learning Python</title>
    <author>Lutz, Ascher</author>
  </book>
  <book isbn="0-596-00797-3">
    <title>Python Cookbook</title>
    <author>Martelli, Ravenscroft, Ascher</author>
  </book>
  <!-- imagine more entries here -->
</catalog>
<catalog>
  <book isbn="0-596-00128-2">
    <title>Python &amp; XML</title>
    <author>Jones, Drake</author>
  </book>
  <book isbn="0-596-00085-5">
    <title>Programming Python</title>
    <author>Lutz</author>
  </book>
  <book isbn="0-596-00281-5">
    <title>Learning Python</title>
    <author>Lutz, Ascher</author>
  </book>
  <book isbn="0-596-00797-3">
    <title>Python Cookbook</title>
    <author>Martelli, Ravenscroft, Ascher</author>
  </book>
  <!-- imagine more entries here -->
</catalog>
写一个BookHandler, 如下:
# -*- coding: utf-8 -*-
import xml.sax.handler
class BookHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that builds an ISBN -> title dictionary.

    The handler is a small state machine driven by parser events:

    * a ``book`` start tag records the ``isbn`` attribute and clears the
      text buffer;
    * a ``title`` start tag switches text collection on;
    * character data is appended to the buffer while collection is on;
    * a ``title`` end tag switches collection off and stores the buffered
      text in ``mapping`` under the current ISBN.

    It assumes every ``title`` element is nested inside a ``book`` element
    that carries an ``isbn`` attribute; a ``book`` without that attribute
    raises ``KeyError`` from the attribute lookup.

    Attributes:
        inTitle: 1 while the parser is inside a ``title`` element, else 0.
        mapping: dict of ISBN string -> title text collected so far.
        buffer: text accumulated for the title currently being read.
        isbn: ISBN of the most recently opened ``book`` (None before any).
    """

    def __init__(self):
        # Call the base initialiser explicitly; this spelling works on both
        # Python 2 and Python 3.
        xml.sax.handler.ContentHandler.__init__(self)
        self.inTitle = 0
        self.mapping = {}
        # Defined up front so characters()/endElement() never hit a missing
        # attribute, whatever order the events arrive in.
        self.buffer = ""
        self.isbn = None

    def startElement(self, name, attributes):
        """Handle a start tag: remember the book's ISBN or enter a title."""
        if name == "book":
            # New book: start with an empty buffer and keep its ISBN to use
            # as the dictionary key later.
            self.buffer = ""
            self.isbn = attributes["isbn"]
        elif name == "title":
            # Text that follows belongs to the title.
            self.inTitle = 1

    def characters(self, data):
        """Accumulate character data while inside a title element.

        The parser may deliver one run of text in several chunks, so the
        data is appended rather than assigned.
        """
        if self.inTitle:
            self.buffer += data

    def endElement(self, name):
        """Handle an end tag: on ``</title>`` store the collected text."""
        if name == "title":
            self.inTitle = 0
            self.mapping[self.isbn] = self.buffer
import xml.sax
import pprint
# Build a SAX parser, attach the handler, and stream the catalog through it.
parser = xml.sax.make_parser()
handler = BookHandler()
parser.setContentHandler(handler)
# The path is relative to the current working directory.
parser.parse('book.xml')
# Show the collected ISBN -> title dictionary.
pprint.pprint(handler.mapping)
import xml.sax.handler
class BookHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that builds an ISBN -> title dictionary.

    The handler is a small state machine driven by parser events:

    * a ``book`` start tag records the ``isbn`` attribute and clears the
      text buffer;
    * a ``title`` start tag switches text collection on;
    * character data is appended to the buffer while collection is on;
    * a ``title`` end tag switches collection off and stores the buffered
      text in ``mapping`` under the current ISBN.

    It assumes every ``title`` element is nested inside a ``book`` element
    that carries an ``isbn`` attribute; a ``book`` without that attribute
    raises ``KeyError`` from the attribute lookup.

    Attributes:
        inTitle: 1 while the parser is inside a ``title`` element, else 0.
        mapping: dict of ISBN string -> title text collected so far.
        buffer: text accumulated for the title currently being read.
        isbn: ISBN of the most recently opened ``book`` (None before any).
    """

    def __init__(self):
        # Call the base initialiser explicitly; this spelling works on both
        # Python 2 and Python 3.
        xml.sax.handler.ContentHandler.__init__(self)
        self.inTitle = 0
        self.mapping = {}
        # Defined up front so characters()/endElement() never hit a missing
        # attribute, whatever order the events arrive in.
        self.buffer = ""
        self.isbn = None

    def startElement(self, name, attributes):
        """Handle a start tag: remember the book's ISBN or enter a title."""
        if name == "book":
            # New book: start with an empty buffer and keep its ISBN to use
            # as the dictionary key later.
            self.buffer = ""
            self.isbn = attributes["isbn"]
        elif name == "title":
            # Text that follows belongs to the title.
            self.inTitle = 1

    def characters(self, data):
        """Accumulate character data while inside a title element.

        The parser may deliver one run of text in several chunks, so the
        data is appended rather than assigned.
        """
        if self.inTitle:
            self.buffer += data

    def endElement(self, name):
        """Handle an end tag: on ``</title>`` store the collected text."""
        if name == "title":
            self.inTitle = 0
            self.mapping[self.isbn] = self.buffer
import xml.sax
import pprint
# Build a SAX parser, attach the handler, and stream the catalog through it.
parser = xml.sax.make_parser()
handler = BookHandler()
parser.setContentHandler(handler)
# The path is relative to the current working directory.
parser.parse('book.xml')
# Show the collected ISBN -> title dictionary.
pprint.pprint(handler.mapping)
结果如下:
Process started >>>
{u'0-596-00085-5': u'Programming Python',
u'0-596-00128-2': u'Python & XML',
u'0-596-00281-5': u'Learning Python',
u'0-596-00797-3': u'Python Cookbook'}
<<< Process finished.
================ READY ================
不过,这是比较简单的情况了。而且我们可以看到,结果全是以unicode串输出的。