今天首先把昨天的练习3重复做了一遍,还是有些不熟练,不过好多了。
今天的主题是采集整个网站,主要涉及根据链接一步步地把整个网站全部采集,形成网站地图。
昨天的练习3
# Exercise 3 (repeat): random walk over Wikipedia article links.
# Starting from one article, repeatedly pick a random in-article link
# and follow it, printing each visited /wiki/ path.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
# import datetime
# import random
#
# def randomUrl(articleUrl):
#     """Fetch a Wikipedia article and return its body /wiki/ links.
#
#     articleUrl: path beginning with '/wiki/...'.
#     Returns a list of <a> tags whose href matches /wiki/... without ':'.
#     """
#     url = "http://en.wikipedia.org" + articleUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsObj = BeautifulSoup(html, 'html.parser')
#     # (?!:) excludes namespaced pages such as /wiki/Category:... .
#     links = bsObj.find('div', {'id': 'bodyContent'}).findAll(
#         'a', href=re.compile('^(/wiki/)((?!:).)*$'))
#     return links
#
# random.seed(datetime.datetime.now())
# newlinks = randomUrl('/wiki/Kevin_Bacon')
# while len(newlinks) > 0:
#     # random.choice is the idiomatic way to pick one random element.
#     link = random.choice(newlinks).attrs['href']
#     print(link)
#     newlinks = randomUrl(link)
练习1 网络数据采集示例(逐个采集维基百科的每一个链接)
# Exercise 1: crawl every /wiki/ link reachable from the main page,
# recording each newly seen link in a global set to avoid revisits.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
#
# datas = set()   # all /wiki/ paths seen so far (global visited-set)
#
# def getlinks(linkUrl):
#     """Recursively collect every unseen /wiki/ link under linkUrl.
#
#     linkUrl: path beginning with '/wiki/...' ('' for the main page).
#     Side effect: prints each new link and adds it to the global `datas`.
#     NOTE: a real crawl has far more pages than Python's default
#     recursion limit (~1000); an iterative version with an explicit
#     stack/queue would be needed for a full site.
#     """
#     global datas
#     url = 'http://en.wikipedia.org' + linkUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsObj = BeautifulSoup(html, 'html.parser')
#     for link in bsObj.findAll('a', href=re.compile("^(/wiki/)")):
#         if 'href' in link.attrs:
#             if link.attrs['href'] not in datas:
#                 newlink = link.attrs['href']
#                 print(newlink)
#                 datas.add(newlink)
#                 getlinks(newlink)
#             else:
#                 print('这个页面重复啦!')
#
# getlinks('')
练习2 收集整个网站数据
# Exercise 2: crawl the whole site, printing the title, first paragraph
# and edit link of every page before descending into its /wiki/ links.
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
#
# datas = set()   # all /wiki/ paths seen so far (global visited-set)
#
# def getLinks(pageUrl):
#     """Print key data of pageUrl, then recurse into its unseen links.
#
#     pageUrl: path beginning with '/wiki/...' ('' for the main page).
#     Missing fields (h1 / first <p> / edit link) only trigger a notice,
#     never an abort, hence the narrow AttributeError handler.
#     NOTE: like exercise 1, unbounded recursion will exceed Python's
#     default recursion limit on a real site.
#     """
#     global datas
#     url = 'http://en.wikipedia.org' + pageUrl
#     html = urlopen(url)
#     # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
#     bsobj = BeautifulSoup(html, 'html.parser')
#     try:
#         print(bsobj.h1.get_text())
#         print(bsobj.find(id='mw-content-text').findAll('p')[0])
#         print(bsobj.find(id="ca-edit").find('span').find('a').attrs['href'])
#     except AttributeError:
#         print('这页面里没有我们要的某些信息')
#     for link in bsobj.findAll('a', href=re.compile("^(/wiki/)")):
#         if 'href' in link.attrs:
#             if link.attrs['href'] not in datas:
#                 newlink = link.attrs['href']
#                 print(newlink)
#                 print('---------------------')
#                 datas.add(newlink)
#                 getLinks(newlink)
#             else:
#                 print('我们来过这儿喽')
#
# getLinks('')
今天的练习就这两个,虽然数量少但是不看书自己敲出来比前面的练习挑战性高的多,完整做完感觉自己还有很多不足,加油!