from bs4 import BeautifulSoup
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<g><!--zz,hhh--></g>
'''

# Demo of the four kinds of objects Beautiful Soup exposes.
#
# 'lxml' selects the parser; it also completes the incomplete markup above
# (the sample never closes <body> or <html>). Other parsers are available.
# print(soup) shows the parsed document; print(soup.prettify()) shows it
# indented, which is easier to read.
soup = BeautifulSoup(html, 'lxml')

# 1. Tag: the whole element, markup included.
# soup.<tag> returns the first matching tag when there are several.
print(soup.title)
# .name is the tag's name. If no such tag exists, soup.<tag> is None and the
# .name access raises AttributeError.
print(soup.p.name)
# .attrs returns the tag's attributes as a dict.
print(soup.p.attrs)
# Both forms below read the value of the `class` attribute.
print(soup.p['class'])
print(soup.p.get('class'))

# Item assignment can change an attribute. The .get() form cannot be used for
# this: `soup.p.get('class') = 'new'` is not valid Python.
soup.p['class'] = 'new'

# 2. NavigableString: the text inside a tag.
print(soup.p.string)

# 3. BeautifulSoup: represents the whole document and behaves much like a Tag.

# 4. Comment: .string also extracts the content of an HTML comment.
print(soup.g.string)
# Traversing the document tree.
# BeautifulSoup is already imported at the top of the file, so it is not
# imported again here.
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<g><!--zz,hhh--></g>
'''
soup = BeautifulSoup(html, 'lxml')
head = soup.head

# .contents returns all direct children of <head> as a list.
print(head.contents)

# .children yields the same direct children through an iterator, so printing
# it shows the iterator object; loop over it to see the nodes themselves.
print(head.children)
for child in head.children:
    print(child)

# .strings yields every string in the document, including whitespace-only
# ones. Use print(repr(text)) instead to make the newlines visible.
for text in soup.strings:
    print(text)

# .stripped_strings yields the same strings with the surrounding whitespace
# removed.
for text in soup.stripped_strings:
    print(text)