使用CSS选择器
CSS选择器:http://www.w3school.com.cn/cssref/css_selectors.asp
Beautiful Soup文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
基础用法
# Sample HTML used by the examples: a panel containing a heading and two
# <ul> lists (ids "list-1" and "list-2") whose <li> items have class "element".
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
# Parse the sample markup with Beautiful Soup, using the lxml parser.
soup = BeautifulSoup(html, 'lxml')
# Find elements with class "panel-heading" nested inside an element with class "panel".
print(soup.select('.panel .panel-heading')) # "." selects by class; a space separates ancestor from descendant
[<div class="panel-heading">
<h4>Hello</h4>
</div>]
# Find <li> tags nested inside <ul> tags.
print(soup.select('ul li'))
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
# Find elements with class "element" inside the element whose id is "list-2".
print(soup.select('#list-2 .element')) # "#" selects by id
[<li class="element">Foo</li>, <li class="element">Bar</li>]
获取属性
['属性名'] 或 .attrs['属性名']
# Select every <ul>, iterate over the resulting list, and print each tag's
# id attribute twice to show the two equivalent access styles.
for ul in soup.select('ul'):
    print(ul['id'])        # subscript access directly on the tag
    print(ul.attrs['id'])  # same attribute read through .attrs
list-1
list-1
list-2
list-2
获取内容
get_text() 或 .string
# Print the text of every <li> twice: once via get_text() and once via .string.
for li in soup.select('li'):
    print('Get Text:', li.get_text())
    print('String:', li.string)
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
Get Text: Jay
String: Jay
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
# 在本例中二者的效果一致,因为每个 li 只包含一个文本节点;若标签含有多个子节点,.string 会返回 None,而 get_text() 会返回拼接后的全部后代文本