解析html
from HTMLParser import HTMLParser
import sys
class TestParser(HTMLParser):
    """Collect the text found inside the <title> and <body> elements.

    Feed markup through ``feed()``; afterwards ``gettitle()`` and
    ``getbody()`` return the text accumulated for each element.  Tags nested
    inside <body> are dropped and only their character data is kept.
    """

    def __init__(self):
        self.title = ''
        # True while the parser is between <title> and </title>.
        self.readingtitle = False
        self.body = ''
        # True while the parser is between <body> and </body>.
        self.readingbody = False
        # Call the base initializer explicitly, as the original code did.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a <title> or <body> tag opens."""
        if tag == 'title':
            self.readingtitle = True
        elif tag == 'body':
            self.readingbody = True

    def handle_data(self, data):
        """Append character data to whichever element is being read.

        The title takes precedence when both flags are set; data seen
        outside both elements is discarded.
        """
        if self.readingtitle:
            self.title += data
        elif self.readingbody:
            self.body += data

    def handle_endtag(self, tag):
        """Stop collecting text when </title> or </body> is seen."""
        if tag == 'title':
            self.readingtitle = False
        elif tag == 'body':
            self.readingbody = False

    def gettitle(self):
        """Return the text collected from the <title> element."""
        return self.title

    def getbody(self):
        """Return the text collected from the <body> element."""
        return self.body
# Usage: testparser.py [test.html]
# Parse the file named on the command line, or "test.html" when none is given.
html_path = sys.argv[1] if len(sys.argv) > 1 else "test.html"
tp = TestParser()
# The with-block closes the file even if read() or feed() raises.
with open(html_path) as fd:
    tp.feed(fd.read())
# Flush any text the parser is still buffering at end of input.
tp.close()
# Single-argument print() emits the same text on Python 2 and Python 3.
print("Title is: %s" % tp.gettitle())
print("Body is: %s" % tp.getbody())
对于不严格的 HTML(比如缺少关闭标签),可以先使用 TidyLib 进行修正,再交给解析器处理。
test.html
<html>
<head>
<title>Document Title</title>
</head>
<body>
This is a text
</body>
</html>
输出
>>>
Title is: Document Title
Body is:
This is a text
解析xml
from xml.dom import minidom, Node
import sys
def scanNode(node, level=0):
    """Print a DOM node and all of its descendants, one line per node.

    Each line holds the nesting depth followed by the node's class name;
    element nodes additionally show their tag name.

    Args:
        node: the DOM node to start from (e.g. a parsed minidom document).
        level: depth printed for ``node``; children are printed at level + 1.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ", tag: " + node.tagName
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("%s %s" % (level, msg))
    # The original guarded this loop with `node.hasChildNodes` (no call
    # parentheses), which is always true; iterating childNodes directly is
    # equivalent because the loop simply does nothing for childless nodes.
    for child in node.childNodes:
        scanNode(child, level + 1)
# Usage: testparser.py [test.xml]
# Parse the file named on the command line, or "test.xml" when none is given.
xml_path = sys.argv[1] if len(sys.argv) > 1 else "test.xml"
doc = minidom.parse(xml_path)
scanNode(doc)
test.xml
<books>
  <book>
    <name>fengrufeitun</name>
    <price>12</price>
  </book>
</books>
输出
>>>
0 Document
1 Element, tag: books
2 Text
2 Element, tag: book
3 Text
3 Element, tag: name
4 Text
3 Text
3 Element, tag: price
4 Text
3 Text
2 Text