Collecting data across an entire site
Loading the required modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import ssl
import datetime
import random
Writing code should be a joyful thing, so rather than wrestling with PyCharm's certificate setup, we simply disable SSL certificate verification (fine for a demo, but not safe for production):
ssl._create_default_https_context = ssl._create_unverified_context
Seed the random number generator from the current time:
# Python 3.11+ only accepts int/float/str/bytes seeds, so pass a timestamp
random.seed(datetime.datetime.now().timestamp())
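A small aside, not part of the original code: seeding from the current time just makes every run different. When debugging the crawler, a fixed seed is handy because it makes the random walk reproducible:
# Hypothetical debugging variant: with a fixed seed, random.randint
# returns the same sequence every run, so the crawl path repeats.
random.seed(42)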
Getting a list of all internal links on a page
def get_internal_links(bsObj, internal_url):
    internal_links = []
    # Find all links that begin with '/' or contain the current site's URL
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + internal_url + ')')):
        # If the link has an href attribute
        if link.attrs['href'] is not None:
            # If it is a link we have not seen before
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links
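To sanity-check the function in isolation, here is a minimal usage sketch (it assumes network access and the imports above; the [:10] slice just keeps the output short):
html = urlopen('http://baike.baidu.com')
bsObj = BeautifulSoup(html, 'html.parser')
for href in get_internal_links(bsObj, 'baike.baidu.com')[:10]:
    print(href)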
Getting a list of all external links on a page
def get_external_links(bsObj, external_url):
    external_links = []
    # Find links that start with 'http' or 'www' and do not contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(http|www)((?!' + external_url + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links
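The ((?!...).)* construct applies a negative lookahead at every character position: the pattern matches only if the excluded domain appears nowhere in the link. A small standalone check (the example URLs are made up):
import re

pattern = re.compile('^(http|www)((?!baike.baidu.com).)*$')
print(bool(pattern.match('http://www.example.com')))         # True: external
print(bool(pattern.match('http://baike.baidu.com/item/1')))  # False: contains the domain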
Splitting the address
# Split the address so later functions can extract the current page's domain
def split_address(address):
    address_parts = address.replace('http://', '').split('/')
    return address_parts
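A quick check of what it returns. Note that the function only strips 'http://', so for 'https://' pages the standard library's urlparse (shown here as an alternative, not part of the original code) is more robust:
from urllib.parse import urlparse

print(split_address('http://baike.baidu.com/item/Python'))
# ['baike.baidu.com', 'item', 'Python']

# Standard-library alternative; handles https:// as well:
print(urlparse('https://baike.baidu.com/item/Python').netloc)
# 'baike.baidu.com'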
Getting the next random external link
def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bsObj = BeautifulSoup(html, 'html.parser')
    # Get the list of external links, i.e. links not containing the current page's domain
    external_links = get_external_links(bsObj, split_address(starting_page)[0])
    # If the current page has no such external link
    if len(external_links) == 0:
        # Fall back to a random internal link from the current page
        internal_links = get_internal_links(bsObj, starting_page)
        internal_link = internal_links[random.randint(0, len(internal_links)-1)]
        # Relative links ('/...') must be made absolute before they can be opened
        if internal_link.startswith('/'):
            internal_link = 'http://' + split_address(starting_page)[0] + internal_link
        return internal_link
    else:
        return external_links[random.randint(0, len(external_links)-1)]
Following each random external link we obtain
def follow_external_link_only(starting_site):
    # Use the starting_site argument rather than a hard-coded URL
    external_link_only = get_random_external_link(starting_site)
    print('Found external link:', external_link_only)
    # Recurse to keep following external links
    follow_external_link_only(external_link_only)
Calling the function
# 'starting_site' here is a placeholder; pass a real start URL
follow_external_link_only('starting_site')
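One caveat: each hop adds a stack frame, and CPython's default recursion limit is about 1000, so a long enough walk will eventually raise RecursionError. A minimal iterative sketch that avoids this (max_hops is an added parameter, not part of the original code):
def follow_external_links_iter(starting_site, max_hops=100):
    # Loop instead of recursing, so the walk length is bounded
    # by max_hops rather than by the interpreter's stack depth.
    current = starting_site
    for _ in range(max_hops):
        current = get_random_external_link(current)
        print('Found external link:', current)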
Taking Baidu Baike (http://baike.baidu.com) as an example, the complete code is as follows:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import ssl
import datetime
import random

ssl._create_default_https_context = ssl._create_unverified_context
random.seed(datetime.datetime.now().timestamp())

def get_internal_links(bsObj, internal_url):
    internal_links = []
    # Find all links that begin with '/' or contain the current site's URL
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + internal_url + ')')):
        # If the link has an href attribute
        if link.attrs['href'] is not None:
            # If it is a link we have not seen before
            if link.attrs['href'] not in internal_links:
                internal_links.append(link.attrs['href'])
    return internal_links

def get_external_links(bsObj, external_url):
    external_links = []
    # Find links that start with 'http' or 'www' and do not contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(http|www)((?!' + external_url + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in external_links:
                external_links.append(link.attrs['href'])
    return external_links

# Split the address so later functions can extract the current page's domain
def split_address(address):
    address_parts = address.replace('http://', '').split('/')
    return address_parts

def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    bsObj = BeautifulSoup(html, 'html.parser')
    # Get the list of external links, i.e. links not containing the current page's domain
    external_links = get_external_links(bsObj, split_address(starting_page)[0])
    # If the current page has no such external link
    if len(external_links) == 0:
        # Fall back to a random internal link from the current page
        internal_links = get_internal_links(bsObj, starting_page)
        internal_link = internal_links[random.randint(0, len(internal_links)-1)]
        # Relative links ('/...') must be made absolute before they can be opened
        if internal_link.startswith('/'):
            internal_link = 'http://' + split_address(starting_page)[0] + internal_link
        return internal_link
    else:
        return external_links[random.randint(0, len(external_links)-1)]

def follow_external_link_only(starting_site):
    # Use the starting_site argument rather than a hard-coded URL
    external_link_only = get_random_external_link(starting_site)
    print('Found external link:', external_link_only)
    # Recurse to keep following external links
    follow_external_link_only(external_link_only)

follow_external_link_only('http://baike.baidu.com')