from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
# Set of pages already visited (shared crawl state for this module).
pages = set()
# Seed the RNG with the current time so each run takes a different path.
# BUGFIX: random.seed() no longer accepts a datetime object on Python 3.11+
# (TypeError); pass the POSIX timestamp (a float) instead.
random.seed(datetime.datetime.now().timestamp())
def getInternalLinks(bs, includeUrl):
    """Return the unique internal links found on a page.

    Parameters
    ----------
    bs : BeautifulSoup
        Parsed page to scan for ``<a>`` tags.
    includeUrl : str
        Any URL on the target site; only its scheme and netloc are used
        to define what counts as "internal".

    Returns
    -------
    list[str]
        Absolute same-site URLs, in first-seen order, without duplicates.
    """
    includeUrl = "{}://{}".format(urlparse(includeUrl).scheme,
                                  urlparse(includeUrl).netloc)
    internalLinks = []
    # Match links that start with "/" or that contain the site root.
    # re.escape keeps dots in the domain from acting as regex wildcards.
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    for link in bs.find_all("a", href=pattern):
        href = link.attrs["href"]
        if href is not None:
            # BUGFIX: original called the nonexistent str.startwith().
            if href.startswith("/"):
                # Relative link: prepend scheme://netloc to make it absolute.
                href = includeUrl + href
            # BUGFIX: dedupe after normalization — the original compared the
            # raw href against stored absolute URLs, so relative duplicates
            # were never filtered out.
            if href not in internalLinks:
                internalLinks.append(href)
    return internalLinks
def getExternalLinks(bs, excludeUrl):
    """Return the unique external links found on a page.

    Parameters
    ----------
    bs : BeautifulSoup
        Parsed page to scan for ``<a>`` tags.
    excludeUrl : str
        Domain (netloc) treated as internal; any link containing it is
        skipped.

    Returns
    -------
    list[str]
        URLs starting with "http" or "www" that never mention
        *excludeUrl*, in first-seen order, without duplicates.
    """
    externalLinks = []
    # Tempered negative lookahead: accept the link only if excludeUrl
    # appears nowhere in it.  BUGFIX: re.escape keeps dots in the domain
    # from matching arbitrary characters (and keeps other regex
    # metacharacters in the netloc from breaking the pattern).
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    for link in bs.find_all("a", href=pattern):
        href = link.attrs["href"]
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks
def getRandomExternalLink(startingPage):
    """Fetch *startingPage* and return one random external link from it.

    If the page has no external links, print a notice, pick a random
    internal link, and recurse into it to keep searching.

    Parameters
    ----------
    startingPage : str
        Absolute URL of the page to fetch.

    Returns
    -------
    str
        A randomly chosen external URL.
    """
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if not externalLinks:
        print("No external links , looking around the site for one")
        domain = "{}://{}".format(urlparse(startingPage).scheme,
                                  urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        # random.choice is the idiomatic form of randint-indexing; it also
        # raises a clear IndexError if the page has no links at all,
        # instead of randint's ValueError on an empty range.
        return getRandomExternalLink(random.choice(internalLinks))
    return random.choice(externalLinks)
def followExternalOnly(startingSite):
    """Random-walk the web by repeatedly hopping to an external link.

    Starts at *startingSite* and loops indefinitely (until interrupted or
    a fetch fails), printing each external link it visits.

    BUGFIX: implemented iteratively — the original tail-recursive version
    exhausted CPython's recursion limit (~1000 hops) with RecursionError.

    Parameters
    ----------
    startingSite : str
        Absolute URL to start the walk from.
    """
    while True:
        externalLink = getRandomExternalLink(startingSite)
        print("random external link is :{}".format(externalLink))
        startingSite = externalLink
if __name__ == "__main__":
    # Start the crawl only when run as a script, not when imported.
    followExternalOnly("http://oreilly.com")
# Source article published 2020-11-27; latest recommended revision 2024-09-27 10:11:28.