1.BeautifulSoup基本语法
只要传入的字符串是 HTML 或 XML 文档格式,BeautifulSoup 就可以识别并解析它
from bs4 import BeautifulSoup  # BeautifulSoup parses any HTML/XML string

# Sample document. NOTE(review): the markup is deliberately imperfect
# (missing space before id="link2", unclosed </body></html>); lxml's
# recovering parser handles it.
html = '''<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister"id="link2">Lacie</a>and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
'''
print(type(html))  # <class 'str'>
doc = BeautifulSoup(html, 'lxml')
print(type(doc))  # <class 'bs4.BeautifulSoup'>
# print(html)
# print(doc.prettify())  # pretty-prints the HTML with readable indentation
title_tag = doc.title
print(title_tag, type(title_tag))  # <title>The Dormouse's story</title> <class 'bs4.element.Tag'>
print(title_tag.name, type(title_tag.name))  # 'title' -- the tag's name, a plain str
print(doc.p['class'])  # ['title']
print(doc.find('a'))       # first <a> tag in the document
print(doc.find_all('a'))   # every <a> tag in the document
2.Xpath与lxml包
xpath='//*[@id="content"]/h1'
3. 网页标签解析
prettify() 可以让输出的 HTML 缩进层次更加清晰
from bs4 import BeautifulSoup  # BeautifulSoup parses any HTML/XML string

# Same sample document as above (markup left imperfect on purpose;
# lxml's recovering parser tolerates it).
html = '''<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister"id="link2">Lacie</a>and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
'''
print(type(html))  # <class 'str'>
tree = BeautifulSoup(html, 'lxml')
print(type(tree))  # <class 'bs4.BeautifulSoup'>
# print(html)
# print(tree.prettify())  # pretty-print with clear indentation
heading = tree.title
print(heading, type(heading))  # <title>The Dormouse's story</title> <class 'bs4.element.Tag'>
print(heading.name, type(heading.name))  # 'title' -- the tag's name, a plain str
print(tree.p['class'])  # ['title']
print(tree.find('a'))      # first <a> tag
print(tree.find_all('a'))  # all <a> tags
sample_xpath = '//*[@id="content"]/h1'  # an id is unique within a page
print(heading.text, type(heading.text))      # The Dormouse's story <class 'str'>
print(heading.string, type(heading.string))  # The Dormouse's story <class 'bs4.element.NavigableString'>
print('-------------------------------------------------')
# Implicitly-concatenated adjacent string literals form one string.
html2 = (
    "<html>"
    "<td>some text</td>"
    "<td></td>"
    "<td><p>more text</p></td>"
    "<td>even <p>more text</p></td> </html>"
)
table_doc = BeautifulSoup(html2, 'lxml')
# print(table_doc.prettify())
cells = table_doc.find_all('td')
print(cells)
for cell in cells:
    print(cell.text)    # .text joins all nested text content
print('-------------------------')
for cell in cells:
    print(cell.string)  # .string is None when a tag has mixed/multiple children
print(table_doc.p.text)  # more text
4.搜索文档树
当html中有多个标签时(例如a标签),可以通过属性去定位
一般情况下传入的是网页内容的字符串;既可以像上面那样直接写字符串,也可以用 requests 抓取后传入,推荐前一种方式做练习
from bs4 import BeautifulSoup
import requests

# Same sample document; when there are several tags of one kind (e.g. <a>),
# attributes such as id/class locate the specific one.
html = '''<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister"id="link2">Lacie</a>and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
'''
'''
r=requests.get('https://book.douban.com/latest?subcat=%E5%B0%8F%E8%AF%B4')
print(type(r.text)) # <class 'str'>
'''
page = BeautifulSoup(html, 'lxml')
print(page.find('title'))  # <title>The Dormouse's story</title>
print(page.title)          # <title>The Dormouse's story</title> -- same via attribute access
print('++++++++++++++++++++++++++')
print(page.find('a'))
print(page.find('a', id='link3'))
print(page.find('p', class_="story"))       # class is a Python keyword, so bs4 uses class_
print(page.find('p', class_="story").text)  # the text inside the tag
print(page.find('a', id='link3')['href'])   # http://example.com/tillie
anchors = page.find_all('a')
print(anchors, type(anchors))  # <class 'bs4.element.ResultSet'> -- a list subclass
print(anchors[0])       # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(anchors[2])       # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print(anchors[0].text)  # Elsie
print(page.find_all(['p','a']))  # match both <p> and <a> tags
5.遍历文档树
from bs4 import BeautifulSoup
import requests

# Same sample document, used here to walk the parse tree.
html = '''<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister"id="link2">Lacie</a>and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
'''
dom = BeautifulSoup(html, 'lxml')
print(dom.prettify())
'''
ul = soup.find('p', class_='story')
print(ul, type(ul)) # <class 'bs4.element.Tag'>
a=ul.find_all('a')
print(a)
'''
# Child nodes as a list -- .contents
print(type(dom.body.contents))  # <class 'list'>
print(dom.body.contents)
print('------------------------------')
print(dom.find('body').contents)
print('------------------------------')
for node in dom.body.contents:  # .contents is a plain list, so it can be looped over directly
    print(node)
print('*****************')
# Child nodes as an iterator -- .children
child_iter = dom.body.children
print(type(child_iter))  # <class 'list_iterator'>
print(list(child_iter))
# Descendant nodes -- .descendants (nested content appears repeatedly, once per enclosing tag)
descendant_gen = dom.body.descendants
print(type(descendant_gen))  # <class 'generator'>
print(list(descendant_gen))
# Parent node -- .parent
title_tag = dom.title
print(title_tag)
parent_tag = title_tag.parent
print(parent_tag)  # the enclosing <head> tag
# Ancestors -- .parents (walks up through every enclosing tag)
ancestors = title_tag.parents
print(type(ancestors))  # <class 'generator'>
for ancestor in ancestors:
    print(ancestor)
# Sibling nodes -- next_sibling / next_siblings
html2 = "<a><b>text1</b><c>text2</c></b></a>"
doc2 = BeautifulSoup(html2, 'lxml')
print(doc2.prettify())
print(doc2.b.next_sibling)  # <c>text2</c>
sibling_iter = doc2.b.next_siblings  # iterator over all following siblings
print(list(sibling_iter))