标签:html.parser HTMLParser
urllib.request urlopen
urllib parse
LinkParser(HTMLParser):
handle_starttag(, tag, attrs):
tag == :
(key, value) attrs:
key == :
newUrl = parse.urljoin(.baseUrl, value)
.links = .links + [newUrl]
getLinks(, url):
.links = []
.baseUrl = url
response = urlopen(url)
response.getheader()==:
htmlBytes = response.read()
htmlString = htmlBytes.decode()
.feed(htmlString)
htmlString, .links
:
,[]
spider(url, word, maxPages):
pagesToVisit = [url]
numberVisited = foundWord = numberVisited < maxPages pagesToVisit != [] foundWord:
numberVisited = numberVisited +url = pagesToVisit[]
pagesToVisit = pagesToVisit[:]
:
(numberVisited, , url)
parser = LinkParser()
data, links = parser.getLinks(url)
pagesToVisit = pagesToVisit + links
data.find(word)>-:
foundWord = pagesToVisit = pagesToVisit + links
()
:
()
foundWord:
(, word, , url)
:
()
spider(,,)
标签:
原文地址:http://my.oschina.net/u/215677/blog/523343