HTMLParser 是 Python 自带的网页解析库,使用起来很简单:只需继承基类 HTMLParser,
然后按需重载 handle_starttag、handle_data、handle_endtag 这三个方法即可。
下面给出一个抽取网页链接的示例:
#!/usr/bin/env python
#coding=utf-8
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect the ``href`` target of every ``<a>`` start tag fed to the parser.

    After ``feed()`` has been called, the collected values are available in
    ``self.links`` in the order they were encountered.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): this file imports the
        # Python 2 ``HTMLParser`` module, whose base is an old-style class
        # that super() cannot handle. This form works on Python 3 as well.
        HTMLParser.__init__(self)
        # href values gathered so far, in document order.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the href value(s) of an opening ``<a>`` tag.

        Args:
            tag: Name of the start tag. HTMLParser passes it lower-cased,
                so ``<A ...>`` is matched as well.
            attrs: List of ``(name, value)`` tuples for the tag's
                attributes; names are lower-cased by HTMLParser and may be
                an empty list.
        """
        if tag != 'a':
            return
        # Looping over an empty attrs list is a no-op, so no emptiness check
        # is needed.
        for name, value in attrs:
            # A valueless attribute (``<a href>``) is reported with value
            # None; skip it so ``links`` only ever contains strings.
            if name == "href" and value is not None:
                self.links.append(value)
if __name__ == "__main__":
html_code = """
<a href="www.google.com"> google.com</a>
<A Href="www.pythonclub.org"> PythonClub </a>
<A