#-*-coding:utf-8-*-
from HTMLParser import HTMLParser
import sys
from htmlentitydefs import name2codepoint
from htmlentitydefs import entitydefs
class TitleParser(HTMLParser):
    """Collect the text inside the ``<title>`` element of an HTML document.

    Feed markup with ``feed()`` and read the accumulated text with
    ``getTitle()``.  Progress messages are printed to stdout while the
    title is being read.
    """

    def __init__(self):
        """Start with an empty title and title capture switched off."""
        HTMLParser.__init__(self)
        self.titlereading = 0  # non-zero while inside <title> ... </title>
        self.title = ''        # title text accumulated so far

    def handle_starttag(self, tag, attrs):
        """Switch title capture on when a ``<title>`` tag opens."""
        if tag == 'title':
            self.titlereading = 1
            # Single-argument print(...) emits the same text whether print
            # is a statement or a function.
            print('start reading tag')

    def handle_data(self, data):
        """Append ``data`` to the title while title capture is on."""
        if self.titlereading:
            self.title += data
            print('reading tag data')

    def handle_endtag(self, tag):
        """Switch title capture off when ``</title>`` is reached.

        Other closing tags are ignored, so they neither cut the title
        short nor print a misleading "end reading tag" message.
        """
        if tag == 'title':
            self.titlereading = 0
            print('end reading tag')

    def handle_entityref(self, name):
        """Handle a named reference such as ``&amp;``.

        A known entity contributes its replacement text from
        ``entitydefs`` (not its bare name); an unknown one is kept
        verbatim as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Handle a numeric reference such as ``&#65;`` or ``&#x41;``.

        ``name`` is the text between ``&#`` and ``;``.  Decimal and
        hexadecimal forms are decoded.  Code points in 1..255 are added
        as a single character; anything else (malformed or out of that
        range) is kept verbatim as ``&#name;`` so no input is silently
        lost.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            charnum = None
        if charnum is not None and 1 <= charnum <= 255:
            self.handle_data(chr(charnum))
        else:
            self.handle_data('&#' + name + ';')

    def getTitle(self):
        """Return the title text collected so far."""
        return self.title
# Parse the sample document ./d.html and report its title.
myParser = TitleParser()
# The with-block closes the file even if reading or parsing raises.
with open('./d.html') as fileHandle:
    myParser.feed(fileHandle.read())
# One pre-formatted argument keeps the output "title is : <title>" the same
# whether print is a statement or a function.
print('title is : %s' % myParser.getTitle())
d.html
<html>
<head>
<title>Document tile & Intros®</title>
</head>
<body>
this is my text.
</body>
</html>