#-*- encoding: utf-8 -*-
import htmllib,urllib,formatter,string
class GetLinks(htmllib.HTMLParser,str):
def __init__(self,str):
self.str=str
self.links = {}
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f)
def anchor_bgn(self, href, name, type):
self.save_bgn()
self.link = href
def anchor_end(self):
text = string.strip(self.save_end())
if text.find(self.str)!=-1 :
if self.link and text:
self.links[text] = self.link
def findall(str1, strfront, i, strlat):
    """Fetch one page and print every anchor whose text contains ``str1``.

    The page URL is built as ``strfront + str(i) + strlat``. Each matching
    anchor is printed on its own line as ``<anchor text> => <href>``.
    """
    fp = urllib.urlopen(strfront + str(i) + strlat)
    try:
        data = fp.read()
    finally:
        # Release the connection even when the read fails part-way.
        fp.close()
    linkdemo = GetLinks(str1)
    linkdemo.feed(data)
    linkdemo.close()
    # links is keyed by anchor text, with the href as the value.
    for text, link in linkdemo.links.items():
        print("%s => %s" % (text, link))
# Driver: scan exercise pages ex1.html .. ex19.html of the online book and
# print every link whose text contains the search string.
search = 'ex'
strfront = 'http://readthedocs.org/docs/learn-python-the-hard-way-zh_cn-translation/en/latest/ex'
strlat = '.html'
for i in range(1, 20):
    findall(search, strfront, i, strlat)
# A counting while-loop would finish with i == 20; keep that module-level
# value in case later code reads it.
i = 20