BeautifulSoup库是解析、遍历、维护标签树的功能库(安装:pip install beautifulsoup4)
from bs4 import BeautifulSoup
# Build a parse tree from an in-memory HTML string.
soup = BeautifulSoup('<html>data</html>', 'html.parser')
# Build a parse tree from a file on disk. The context manager closes the
# file handle as soon as parsing is done instead of leaking it.
with open('D://data.html') as html_file:
    soup_ = BeautifulSoup(html_file, 'html.parser')
BeautifulSoup类的基本元素
标签树的遍历
下行遍历
上行遍历
平行遍历
平行遍历发生在同一个父节点下的各节点间
注:迭代类型只能用于循环遍历
关系
修饰方法:soup.prettify()
查找
<>.find_all(),返回列表类型
def find_all(self, name = None, #对标签名称的检索字符串,亦可为列表、正则表达式、bool值
attrs = {}, #对标签属性值的检索字符串
recursive = True, #是否对子孙节点全部检索
text = None, #<>...</>中字符串区域的检索字符串,亦可为列表、正则表达式、bool值(自 Beautiful Soup 4.4.0 起该参数名为 string,text 为旧名称)
limit = None, #限制得到的搜索结果数量
**kwargs): #关键字参数,如果不为内置参数,搜索时会把其当成attrs形式来搜索
扩展方法
实例
'''
爬取最好大学排名
'''
import requests
from bs4 import BeautifulSoup
import bs4
class College:
    """Download a ranking web page, extract its table rows and print them.

    Typical use is three calls in order: ``get_html_text()`` stores the page
    source in ``text``, ``get_sort_info()`` parses it into ``sort_info``, and
    ``print_sort_info()`` prints the parsed rows as an aligned table.

    Attributes:
        url: Address of the page to download.
        text: Page source, or the placeholder ``'nothing'`` until a download
            succeeds.
        sort_info: List of ``[rank, school, location]`` rows, or the
            placeholder ``'nothing'`` until ``get_sort_info()`` has run.
    """

    # Three centred columns of width 10. Argument 3 is the fill character of
    # the middle (school) column.
    _ROW_FORMAT = '{0:^10}\t{1:{3}^10}\t{2:^10}'
    # chr(12288) is U+3000 IDEOGRAPHIC SPACE (a full-width space), presumably
    # chosen so that CJK school names are padded with a same-width character.
    _FULL_WIDTH_SPACE = chr(12288)

    def __init__(self, url):
        """Remember *url*; nothing is downloaded until ``get_html_text()``."""
        self.url = url
        self.text = 'nothing'
        self.sort_info = 'nothing'

    def get_html_text(self, timeout=10):
        """Download ``self.url`` and store the decoded body in ``self.text``.

        Args:
            timeout: Seconds to wait for the server before giving up, so that
                an unresponsive host cannot block the program forever.

        Returns:
            The page source on success. On any ``requests`` error (connection
            failure, timeout, HTTP error status) a message is printed,
            ``self.text`` is left unchanged and ``None`` is returned.
        """
        try:
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are reported here; programming
            # errors and KeyboardInterrupt are no longer swallowed.
            print('获取HTML异常')
            return None
        # Use the encoding guessed from the body rather than the header value.
        response.encoding = response.apparent_encoding
        self.text = response.text
        return self.text

    @staticmethod
    def _cell_text(cell):
        """Return the text of one table cell, never ``None``."""
        text = cell.string
        # ``.string`` is None when the cell holds several child nodes; fall
        # back to the concatenated text in that case.
        return text if text is not None else cell.get_text(strip=True)

    def get_sort_info(self):
        """Parse ``self.text`` and store the table rows in ``self.sort_info``.

        Each direct child tag of the first ``<tbody>`` contributes one
        ``[rank, school, location]`` row taken from its first three ``<td>``
        cells. Rows with fewer than three cells are skipped. If the document
        has no ``<tbody>`` the result is an empty list.
        """
        soup = BeautifulSoup(self.text, 'html.parser')
        table_body = soup.tbody
        rows = []
        if table_body is not None:
            for row_tag in table_body.children:
                # Children also include whitespace strings between the rows.
                if not isinstance(row_tag, bs4.element.Tag):
                    continue
                cells = row_tag('td')
                if len(cells) < 3:
                    continue
                rows.append([self._cell_text(cell) for cell in cells[:3]])
        self.sort_info = rows

    def print_sort_info(self):
        """Print a header line followed by one aligned line per parsed row.

        Only the header is printed while ``sort_info`` still holds its
        unparsed placeholder. ``None`` values are printed as empty cells.
        """
        pad = self._FULL_WIDTH_SPACE
        print(self._ROW_FORMAT.format('名次', '学校', '地址', pad))
        # The placeholder is a plain string; treat it as "no rows yet".
        rows = [] if isinstance(self.sort_info, str) else self.sort_info
        for row in rows:
            rank, school, location = (
                '' if value is None else str(value) for value in row[:3]
            )
            print(self._ROW_FORMAT.format(rank, school, location, pad))
if __name__ == '__main__':
    # Script entry point: fetch the ranking page, parse it, print the table.
    college = College('http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html')
    # get_html_text() returns None after reporting a failed download. Parsing
    # the leftover placeholder text would only end in a traceback, so stop.
    if college.get_html_text() is not None:
        college.get_sort_info()
        college.print_sort_info()
学习地址:https://www.icourse163.org/learn/BIT-1001870001?tid=1206093223#/learn/content