import urllib2  # Used to fetch page source (Python 2 only); on Python 3 use urllib.request.urlopen(url).read() instead
def get_page(url):
    """Fetch the content of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by the response's ``read()``.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
def get_next_target(page):
    """Find the first link target in ``page``.

    Looks for the first ``<a href=`` and returns the text between the next
    pair of double quotes that follows it.

    Args:
        page: HTML text to scan.

    Returns:
        A ``(url, end_quote)`` tuple, where ``end_quote`` is the index in
        ``page`` of the closing double quote. Returns ``(None, 0)`` when
        there is no ``<a href=`` or no complete double-quoted value after it.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # No quoted value at all; searching on from -1 would restart at 0.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated value: slicing up to -1 would return truncated text.
        return None, 0
    return page[start_quote + 1:end_quote], end_quote
def print_all_links(page):
    """Print every link target found in ``page``, one per line.

    Repeatedly asks ``get_next_target`` for the next link and continues
    scanning from the position it reports, until it returns ``None``.

    Args:
        page: HTML text to scan.
    """
    while True:
        url, endpos = get_next_target(page)
        # Compare against None rather than truthiness: an empty href=""
        # is still a link and must not end the scan early.
        if url is None:
            break
        print(url)
        page = page[endpos:]
if __name__ == '__main__':
    # Try the crawler out on Baidu's home page; http://xkcd.com/353 is
    # another page that can be used for testing.
    print_all_links(get_page('http://www.baidu.com'))
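# NOTE: This is not finished yet: JavaScript events triggered by <a> tags are
# not handled. Some of these functions do not work in Python 3, because the
# course I have been following recently uses Python 2.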