写一个爬虫，实现外链间的随机跳转。
若某页面没有外链，则随机跳转到一个内链，然后继续收集外链。
代码如下：
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
# Module-level set; presumably holds page URLs already seen by the crawler
# (it is not used in the visible part of the file -- confirm against callers).
pages = set()
# Seed the PRNG from the current wall-clock time. random.seed() only accepts
# None, int, float, str, bytes or bytearray; passing a datetime object raises
# TypeError on Python 3.11+, so convert it to a float POSIX timestamp first.
random.seed(datetime.datetime.now().timestamp())
def getInternalLinks(bsObj, includeurl):
includeurl = urlparse(includeurl).scheme+"://"+urlparse(includeurl).netloc
internalLinks = []
for link in bsObj.findAll("a",href = re.compile("^(/|.*" + includeurl + ")")):
if link.attrs["href"] is not None:
if link.attrs["href"] not in internalLinks:
if(link.attrs["href"].startswith("/")):
internalLinks.append(includeurl + link.attrs["href"])
else:
internalLinks.append(link.attrs[