用HTMLParser模块解析HTML文件时,一般要定义一个继承自HTMLParser.HTMLParser的子类,并在其中添加用来处理不同标签的方法。下面这个程序用来提取HTML文档的标题(title)。
示例文件 myTitle.html(下面的代码以 ./a.html 为路径读取该文件,请按此文件名保存或相应修改代码中的路径):
<html>
<head>
<title>my first html analyze program</title>
</head>
<body>
this is my text
</body>
</html>
下面是使用Python编写的HTML文件解析代码:
#-*-coding:utf-8-*-
import sys

try:
    from html.parser import HTMLParser  # Python 3 location of the module
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 location of the module
class TitleParser(HTMLParser):
    """HTML parser that collects the text found inside <title> elements.

    Feed markup with ``feed()``; the accumulated title text is then
    available through ``gettitle()`` (or the ``title`` attribute).
    """

    def __init__(self):
        """Initialise the parser with an empty title and no open <title>."""
        # Text collected so far from inside <title>...</title>.
        self.title = ''
        # Truthy while the parser is positioned between <title> and </title>.
        self.readingtitle = False
        # Explicit base-class call rather than super(): it works both with
        # Python 2's old-style HTMLParser class and with Python 3.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start capturing text when a <title> start tag is seen.

        tag   -- tag name as reported by the base parser.
        attrs -- attributes of the tag (unused).
        """
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append *data* to the title while inside a <title> element."""
        # The base parser may deliver the text in several pieces, so the
        # pieces are concatenated rather than assigned.
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop capturing text when the </title> end tag is seen."""
        if tag == 'title':
            self.readingtitle = False

    def gettitle(self):
        """Return the title text collected so far ('' if none was seen)."""
        return self.title
# The sample document is read from ./a.html, relative to the current working
# directory; substitute an absolute path here if the file lives elsewhere.
with open('./a.html') as fileHandle:
    # The with-block closes the file even if reading raises.
    htmlText = fileHandle.read()

titleParser = TitleParser()
titleParser.feed(htmlText)

# A single pre-formatted argument prints the same text whether ``print`` is
# the Python 2 statement or the Python 3 function.
print("title is : %s" % titleParser.gettitle())
程序运行后显示的输出结果:
title is : my first html analyze program