Beautiful Soup类的基本元素
例子
# Fetch the demo page and pretty-print its parsed HTML tree.
from bs4 import BeautifulSoup
import requests  # required by requests.get() below; it was missing from this snippet

r = requests.get('https://python123.io/ws/demo.html')
demo = r.text
soup = BeautifulSoup(demo, 'html.parser')  # 'html.parser' is the parser to use
print(soup.prettify())
结果
Beautiful Soup库的理解
基于bs4库的HTML内容遍历方法
下行遍历
# Downward traversal: walk from a tag to its children and descendants.
from bs4 import BeautifulSoup
import requests  # required by requests.get() below; it was missing from this snippet

r = requests.get('https://python123.io/ws/demo.html')
demo = r.text
demo
soup = BeautifulSoup(demo, 'html.parser')
soup.head  # the <head> tag; any HTML document can be divided into head and body tags
soup.head.contents  # child nodes of <head>
soup.body.contents  # child nodes of <body>
len(soup.body.contents)  # number of child nodes of <body>
soup.body.contents[1]  # the second child node of <body>

# Iterate over the direct children of <body>.
# The loop body had been commented out, which left the loop without a body
# (a syntax error); it is restored here so the loop is valid.
for child in soup.body.children:
    print(child)

# Iterate over every descendant of <body> (children, grandchildren, ...).
for child in soup.body.descendants:
    print(child)
上行遍历
# Upward traversal: walk from a tag up through all of its ancestors.
from bs4 import BeautifulSoup
import requests

r = requests.get('https://python123.io/ws/demo.html')
demo = r.text
demo
soup = BeautifulSoup(demo, 'html.parser')

# Print the name of every ancestor of the first <a> tag.
# .parents stops once it runs out of parents and never yields None, so no
# None check is needed; the last item is the BeautifulSoup object itself,
# whose name is '[document]'.
for parent in soup.a.parents:
    print(parent.name)
平行遍历(只能在同一个父节点下的各节点之间进行)
# Sibling traversal: move between nodes that share the same parent.
from bs4 import BeautifulSoup
import requests

r = requests.get('https://python123.io/ws/demo.html')
demo = r.text
demo
soup = BeautifulSoup(demo, 'html.parser')
soup.a.next_sibling  # the node right after the first <a> tag
# The next node is not necessarily a tag; it may also be a NavigableString.
soup.a.next_sibling.next_sibling  # the node after the next node of <a>
soup.a.previous_sibling  # the node right before the first <a> tag

# Iterate over all nodes that follow the first <a> tag under the same parent.
for sibling in soup.a.next_siblings:
    print(sibling)
基于bs4库的HTML格式化和编码
# HTML formatting: download the demo page, parse it, and show the
# pretty-printed markup.
from bs4 import BeautifulSoup
import requests

r = requests.get(url='https://python123.io/ws/demo.html')
demo = r.text
soup = BeautifulSoup(markup=demo, features='html.parser')

# .prettify() returns the markup with a newline added after each tag.
soup.prettify()

# print() renders those "\n" characters as actual line breaks.
print(soup.prettify())