'''
Created on 2013-12-2
http://cloudaice.com/yong-pythonde-htmlparserfen-xi-htmlye-mian/
@author: Administrator
'''
from HTMLParser import HTMLParser
import urllib
import sys
class ParseLinks(HTMLParser):
    '''Collect the text of every ``<a href=...>`` link fed to the parser.

    After HTML has been fed, ``self.data`` holds one entry per link whose
    text is non-empty, in document order, with all whitespace removed
    from the text.
    '''

    def __init__(self):
        # Initialise the base parser state before adding our own fields.
        HTMLParser.__init__(self)
        self.data = []        # link texts collected so far
        self.href = 0         # 1 while inside an <a> tag that carries an href
        self.linkname = ''    # text accumulated for the link being read

    def handle_starttag(self, tag, attrs):
        '''Start capturing text when an ``<a>`` tag with an ``href`` opens.'''
        if tag != 'a':
            return
        for name, _value in attrs:
            if name == 'href':
                self.href = 1
                break  # one href is enough; no need to scan the rest

    def handle_data(self, data):
        '''Append a text chunk to the current link text while inside a link.'''
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        '''Finish the current link on ``</a>`` and store its text if non-empty.'''
        if tag != 'a':
            return
        # split() + join drops every whitespace run, including leading and
        # trailing ones, so no separate strip() is needed afterwards.
        name = ''.join(self.linkname.split())
        if name:
            self.data.append(name)
        # Reset the capture state so the next link starts clean.
        self.linkname = ''
        self.href = 0

    def getresult(self):
        '''Print each collected link text on its own line.'''
        for value in self.data:
            # Parenthesised form behaves the same under Python 2 and Python 3.
            print(value)
if __name__ == "__main__":
    # Fetch the page, collect the text of every <a href=...> link, print it.
    MyParser = ParseLinks()
    page = urllib.urlopen("http://www.python.org/index.html")
    try:
        MyParser.feed(page.read())
    finally:
        # Release the connection even if reading or parsing raises.
        page.close()
    # close() makes the parser process any input it is still buffering,
    # so it is called before the results are printed.
    MyParser.close()
    MyParser.getresult()
'''
Created on 2013-12-2
http://cloudaice.com/yong-pythonde-htmlparserfen-xi-htmlye-mian/
@author: Administrator
'''
from HTMLParser import HTMLParser
import urllib
import sys
class ParseLinks(HTMLParser):
    '''Gather the text found inside ``<a>`` tags that have an ``href``.

    Each finished link contributes its text, with every whitespace
    character removed, to ``self.data``; links whose text is empty after
    that clean-up are skipped.
    '''

    def __init__(self):
        # Set up the base parser first, then our own capture state.
        HTMLParser.__init__(self)
        self.data = []        # cleaned link texts, in document order
        self.href = 0         # non-zero while an <a href=...> is open
        self.linkname = ''    # raw text gathered for the open link

    def handle_starttag(self, tag, attrs):
        '''Switch capturing on when the opening tag is ``<a>`` with ``href``.'''
        if tag != 'a':
            return
        for name, _value in attrs:
            if name == 'href':
                self.href = 1
                break  # found it; remaining attributes are irrelevant

    def handle_data(self, data):
        '''Accumulate text only while a link with ``href`` is open.'''
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        '''On ``</a>``, record the cleaned link text and reset the state.'''
        if tag != 'a':
            return
        # Joining the split() pieces with '' removes all whitespace, so the
        # result can never have leading or trailing blanks left to strip.
        name = ''.join(self.linkname.split())
        if name:
            self.data.append(name)
        # Clear the buffer and flag ready for the next link.
        self.linkname = ''
        self.href = 0

    def getresult(self):
        '''Write every collected link text to standard output, one per line.'''
        for value in self.data:
            # print(...) with one argument works on Python 2 and Python 3.
            print(value)
if __name__ == "__main__":
    # Download the page and print the text of each <a href=...> link in it.
    MyParser = ParseLinks()
    response = urllib.urlopen("http://www.python.org/index.html")
    try:
        MyParser.feed(response.read())
    finally:
        # Always close the URL handle, even when feed() or read() fails.
        response.close()
    # Flush whatever the parser is still buffering before printing, so the
    # printed results reflect the whole document.
    MyParser.close()
    MyParser.getresult()
# The parser above does the same job as the snippet below; a regular expression is still more convenient.
# Regex alternative to ParseLinks: capture the raw text between <a ...> and </a>.
# NOTE(review): `html` is not defined anywhere in the visible file; it is
# presumably the page source as a string -- confirm before running this.
import re

# re.I makes the tag match case-insensitive. re.M only changes how ^ and $
# behave, and this pattern uses neither.
# NOTE(review): '.' does not match newlines without re.S, so a link whose
# text spans several lines is not found -- confirm whether that is intended.
p = re.compile(r'<a.*?>(.*?)</a>', re.I| re.M)
match = p.findall(html)
# Parenthesised form prints the list the same way on Python 2 and Python 3.
print(match)