今天首先把昨天的练习3重复做了一遍,还是有些不熟练,不过好多了。
今天的主题是采集整个网站,主要涉及根据链接一步步地把整个网站全部采集,形成网站地图。
昨天的练习3
# Exercise 3 (repeat): random walk over Wikipedia article links.
# Starting from one article, repeatedly pick a random in-article link
# and follow it, printing each visited /wiki/ path.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
# import datetime
# import random
#
# def randomUrl(articleUrl):
#     """Fetch a Wikipedia article and return its body /wiki/ links.
#
#     articleUrl: path beginning with '/wiki/...'.
#     Returns a list of <a> tags whose href matches /wiki/... without ':'.
#     """
#     url = "http://en.wikipedia.org" + articleUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsObj = BeautifulSoup(html, 'html.parser')
#     # (?!:) excludes namespaced pages such as /wiki/Category:... .
#     links = bsObj.find('div', {'id': 'bodyContent'}).findAll(
#         'a', href=re.compile('^(/wiki/)((?!:).)*$'))
#     return links
#
# random.seed(datetime.datetime.now())
# newlinks = randomUrl('/wiki/Kevin_Bacon')
# while len(newlinks) > 0:
#     # random.choice is the idiomatic way to pick one random element.
#     link = random.choice(newlinks).attrs['href']
#     print(link)
#     newlinks = randomUrl(link)
练习1 网络数据采集示例(逐个采集维基百科的每一个链接)
# Exercise 1: crawl every /wiki/ link reachable from the main page,
# recording each newly seen link in a global set to avoid revisits.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
#
# datas = set()   # all /wiki/ paths seen so far (global visited-set)
#
# def getlinks(linkUrl):
#     """Recursively collect every unseen /wiki/ link under linkUrl.
#
#     linkUrl: path beginning with '/wiki/...' ('' for the main page).
#     Side effect: prints each new link and adds it to the global `datas`.
#     NOTE: a real crawl has far more pages than Python's default
#     recursion limit (~1000); an iterative version with an explicit
#     stack/queue would be needed for a full site.
#     """
#     global datas
#     url = 'http://en.wikipedia.org' + linkUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsObj = BeautifulSoup(html, 'html.parser')
#     for link in bsObj.findAll('a', href=re.compile("^(/wiki/)")):
#         if 'href' in link.attrs:
#             if link.attrs['href'] not in datas:
#                 newlink = link.attrs['href']
#                 print(newlink)
#                 datas.add(newlink)
#                 getlinks(newlink)
#             else:
#                 print('这个页面重复啦!')
#
# getlinks('')
练习2 收集整个网站数据
# Exercise 2: crawl the whole site, printing the title, first paragraph
# and edit link of every page before descending into its /wiki/ links.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
#
# datas = set()   # all /wiki/ paths seen so far (global visited-set)
#
# def getLinks(pageUrl):
#     """Print key data of pageUrl, then recurse into its unseen links.
#
#     pageUrl: path beginning with '/wiki/...' ('' for the main page).
#     Missing fields (h1 / first <p> / edit link) only trigger a notice,
#     never an abort, hence the narrow AttributeError handler.
#     NOTE: like exercise 1, unbounded recursion will exceed Python's
#     default recursion limit on a real site.
#     """
#     global datas
#     url = 'http://en.wikipedia.org' + pageUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsobj = BeautifulSoup(html, 'html.parser')
#     try:
#         print(bsobj.h1.get_text())
#         print(bsobj.find(id='mw-content-text').findAll('p')[0])
#         print(bsobj.find(id="ca-edit").find('span').find('a').attrs['href'])
#     except AttributeError:
#         print('这页面里没有我们要的某些信息')
#     for link in bsobj.findAll('a', href=re.compile("^(/wiki/)")):
#         if 'href' in link.attrs:
#             if link.attrs['href'] not in datas:
#                 newlink = link.attrs['href']
#                 print(newlink)
#                 print('---------------------')
#                 datas.add(newlink)
#                 getLinks(newlink)
#             else:
#                 print('我们来过这儿喽')
#
# getLinks('')
今天的练习就这两个,虽然数量少但是不看书自己敲出来比前面的练习挑战性高的多,完整做完感觉自己还有很多不足,加油!