实例目录:
1.BeautifulSoup库的基本用法
2.BeautifulSoup库的基本元素
3.基于bs4库的HTML内容遍历
4.基于bs4库的HTML格式化和编码
5.中国大学定向排名
1.BeautifulSoup库的基本用法
# --- 1. Basic usage of the BeautifulSoup library ---
import requests

# Fetch the demo page and show its raw HTML first.
resp = requests.get("http://python123.io/ws/demo.html")
demo = resp.text
print(demo)

from bs4 import BeautifulSoup  # import the BeautifulSoup class

# Parse the document ("make the soup") with the built-in HTML parser,
# then print a nicely indented version of it.
soup = BeautifulSoup(demo, "html.parser")
print(soup.prettify())
2.BeautifulSoup库的基本元素
# --- 2. Basic elements of the BeautifulSoup library ---
import requests
from bs4 import BeautifulSoup

page = requests.get("http://python123.io/ws/demo.html")
demo = page.text

# Parsers BeautifulSoup can use:
#   BeautifulSoup(demo, "html.parser")  - built-in HTML parser
#   BeautifulSoup(demo, "lxml")         - lxml HTML parser
#   BeautifulSoup(demo, "xml")          - lxml XML parser
#   BeautifulSoup(demo, "html5lib")     - html5lib parser
#
# Basic element classes: Tag, Name, Attribute, NavigableString, Comment.

# ----- 1. Tag -----
soup = BeautifulSoup(demo, "html.parser")
print(soup.title)     # <title>This is a python demo page</title>
tag = soup.a          # first <a> tag in the document
print(tag)            # <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
print(type(tag))      # <class 'bs4.element.Tag'>

# ----- 2. Name -----
print(soup.a.name)                # a
print(soup.a.parent.name)         # p      (parent of <a>)
print(soup.a.parent.parent.name)  # body   (grandparent of <a>)

# ----- 3. Attributes -----
print(tag.attrs)            # all attributes as a dict
print(tag.attrs["class"])   # ['py1']
print(tag.attrs["href"])    # http://www.icourse163.org/course/BIT-268001
print(type(tag.attrs))      # <class 'dict'>

# ----- 4. NavigableString -----
print(tag.string)           # Basic Python
print(soup.p.string)        # The demo python introduces several python courses.
print(type(soup.p.string))  # <class 'bs4.element.NavigableString'>

# ----- 5. Comment -----
# A comment's .string looks like a normal string; only its type reveals it.
newSoup = BeautifulSoup("<b><!--this is a comment--></b><p>this is not a comment</p>","html.parser")
print(newSoup.b.string)        # this is a comment
print(type(newSoup.b.string))  # <class 'bs4.element.Comment'>
print(newSoup.p.string)        # this is not a comment
print(type(newSoup.p.string))  # <class 'bs4.element.NavigableString'>
3.基于bs4库的HTML内容遍历
# --- 3. Traversing HTML content with bs4 ---
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# ================ 1. Downward traversal ================
# .contents: list of a tag's direct children.
print(soup.head)                # <head><title>This is a python demo page</title></head>
print(soup.head.contents)       # [<title>This is a python demo page</title>]
print(soup.body.contents)
print(len(soup.body.contents))  # 5
print(soup.body.contents[1])

# .children: iterator over direct children; same nodes as .contents,
# intended for looping.
for child in soup.body.children:
    print(child)

# .descendants: iterator over ALL descendant nodes, not just direct children.
for child in soup.body.descendants:
    print(child)

# ================ 2. Upward traversal ================
# .parent  : the node's parent node.
# .parents : iterator over all ancestor nodes.
# <html>'s parent is the document itself; the soup object's parent is None.

# ================ 3. Sideways traversal ================
# .next_sibling: the next node at the same level in document order.
print(soup.a.next_sibling)  # and
print(soup.a.next_sibling.next_sibling)
# .previous_sibling: the previous node at the same level.
print(soup.a.previous_sibling)
# .next_siblings: iterator over all following siblings.
for sibling in soup.a.next_siblings:
    print(sibling)
# .previous_siblings: iterator over all preceding siblings.
# (fixed: original read "soup.a..previous_siblings" — a syntax error)
for sibling in soup.a.previous_siblings:
    print(sibling)
4.基于bs4库的HTML格式化和编码
# --- 4. HTML formatting and encoding with bs4 ---
# prettify() re-indents the parse tree into readable, line-broken HTML.
import requests
from bs4 import BeautifulSoup

response = requests.get("http://python123.io/ws/demo.html")
html_text = response.text
soup = BeautifulSoup(html_text, "html.parser")
print(soup.prettify())    # the whole document, formatted
print(soup.a.prettify())  # a single tag, formatted
# Encoding: bs4 converts any HTML file or string it reads to UTF-8,
# so on Python 3 no extra encoding handling is needed.
5.中国大学定向排名
import requests
from bs4 import BeautifulSoup
import bs4
#从网络上获取大学排名网页内容
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or " " on any failure.

    Sets r.encoding from apparent_encoding so Chinese pages decode
    correctly instead of defaulting to ISO-8859-1.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Network / HTTP errors collapse to a sentinel string; a bare
        # `except:` would also swallow KeyboardInterrupt and SystemExit.
        return " "
#提取网页内容中信息到合适的数据结构
def fillUnivList(ulist, html):
    """Parse the ranking page *html* and append [rank, name, score] rows to *ulist*.

    Columns 0 (rank), 1 (school name) and 3 (total score) of each table
    row are kept, matching the 2019 ranking page layout — confirm if the
    page layout changes.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find("tbody").children:
        # .children yields NavigableString whitespace nodes between <tr>
        # tags as well; keep only real Tag elements.
        if isinstance(tr, bs4.element.Tag):
            tds = tr("td")  # calling a tag is shorthand for tr.find_all("td")
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
#利用数据结构展示并输出结果
def printUnivList(ulist, num):
    """Print the first *num* rows of *ulist* as an aligned three-column table.

    chr(12288) is the CJK fullwidth space; using it as the fill character
    for the school-name column keeps mixed Chinese/ASCII output aligned.
    Printing at most len(ulist) rows avoids an IndexError when *num*
    exceeds the number of parsed rows.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    for row in ulist[:num]:
        print(tplt.format(row[0], row[1], row[2], chr(12288)))
def main():
    """Fetch the 2019 Chinese university ranking page, parse it, and print the top 30 rows."""
    uinfo = []
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 30)


main()