# coding=utf-8
import time
import urllib.request
from bs4 import BeautifulSoup
t = time.time()
def scanpage(url, suburl):
    # Fetch `url`, collect every link whose href contains `suburl`,
    # then request each collected link and report its HTTP status code and response time.
    websiteurl = url
    t = time.time()
    n = 0
    html = urllib.request.urlopen(websiteurl).read()
    soup = BeautifulSoup(html, "lxml")
    Upageurls = {}
    # Gather all <a href="..."> tags and keep the matching hrefs, deduplicated.
    pageurls = soup.find_all("a", href=True)
    for links in pageurls:
        if suburl in links.get("href") and links.get("href") not in Upageurls:
            Upageurls[links.get("href")] = 0
    # Check each collected link once and time the request.
    for links in Upageurls.keys():
        print(n, links, end=' ')
        try:
            t2 = time.time()
            code = urllib.request.urlopen(links).getcode()
        except Exception:
            print("connect failed")
        else:
            t1 = time.time()
            print(code, ' elapsed time:', round(t1 - t2, 2))
        n += 1
    print("total is " + repr(n) + " links, total execution time: ", round(time.time() - t, 2), 's')


scanpage("http://news.baidu.com", "baidu.com")
Result: