# 提取网页内容的四大基本方法之二:
# BeautifulSoup(bs4 模块)的使用
# 准备一段 HTML 代码,用来练习获取内容:
from bs4 import BeautifulSoup
# Sample HTML document used throughout this file to practise extracting content.
html ='''
<html>
<head><title>The Dormouse's story</title></head>
<body>
<h1><b>123456</b></h1>
<p class="title" name="dromouse">
<b>The Dormouse's story</b>
aaaaa
</p>
<p class="title" name="dromouse" title='new'><b>The Dormouse's story</b>a</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/tillie" class="siterr" id="link4">Tillie</a>;
<a href="http://example.com/tillie" class="siterr" id="link5">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
<ul id="ulone">
<li>01</li>
<li>02</li>
<li>03</li>
<li>04</li>
<li>05</li>
</ul>
<div class='div11'>
<ul id="ultwo">
<li>0001</li>
<li>0002</li>
<li>0003</li>
<li>0004</li>
<li>0005</li>
</ul>
</div>
</body>
</html>
'''
# 1. 得到 BeautifulSoup 对象
# Parse the sample document; the second argument selects the parser backend.
soup = BeautifulSoup(html, features='html.parser')
# 2. 获取内容
# (1) 获取标题对象
# soup.<tagname> gives the first tag with that name, markup included.
title_tag = soup.title
print(title_tag)
# 获取标题文本字符串:
# .string returns the tag's single text child (a NavigableString), not an
# iterator; it is None when the tag has more than one child node.
print(soup.title.string)
# .text and .get_text() return all the text inside the tag as one string.
print(soup.title.text)
print(soup.title.get_text())
# The same lookup spelled with find().
title = soup.find('title').get_text()
print(title)
# 通过上下级关系获取对象
# .parent is the tag that encloses <title>.
print(soup.title.parent)
# Tag has no ``.child`` attribute: ``soup.title.child`` is interpreted as
# ``soup.title.find('child')`` and yields None. Use the first entry of
# .contents to get the first child node instead.
print(soup.title.contents[0])
# .children is a lazy iterator; wrap it in list() so the child nodes are
# printed rather than the iterator object's repr.
print(list(soup.title.children))
# (2) 获取第一个 p 标签
# Text of the first <p>, reached by attribute access and by find().
first_p = soup.p
print(first_p.get_text())
first_p_found = soup.find('p')
print(first_p_found.text)
# Child nodes of the first <p>. The line breaks between tags are text nodes,
# so they show up as children too.
# .children is an iterator: wrap it in list() to print the nodes themselves
# instead of the iterator object's repr.
print(list(soup.p.children))
for i, echo in enumerate(soup.p.children):
    # The loop body must be indented; left at column 0 it is a syntax error.
    print(i, echo)
# (3) 获取标签的属性
# Attribute access on soup only ever reaches the first <a> tag.
first_link = soup.a
print('1', first_link)
print('2', first_link.name)
# The tag's attributes are available through the .attrs mapping.
link_attrs = first_link.attrs
print(link_attrs)
print(link_attrs['href'])
print(link_attrs['id'])
# The class value comes back as a list, so index into it.
print(link_attrs['class'][0])
# (4) 获取多个标签
# find() stops at the first match.
first_paragraph = soup.find('p')
print(first_paragraph)
# find_all() collects every <p> in the document into a list.
all_paragraphs = soup.find_all('p')
print(all_paragraphs)
# (5) 多层查询
# find_all 返回的是列表,可以用下标取出想要的元素,再继续查询
# Query once for every <ul>, then search inside the first one for its <li>s.
ul_tags = soup.find_all('ul')
print(ul_tags)
first_ul_items = ul_tags[0].find_all('li')
print(first_ul_items)
# (6) 通过指定的属性获取对象
# Match on the id attribute alone: a single tag.
tag_by_id = soup.find(id='ulone')
print(tag_by_id)
# Match on tag name and id together.
ul_by_id = soup.find('ul', id='ulone')
print(ul_by_id)
# find_all() returns a list, so the result can be indexed.
ul_matches = soup.find_all('ul', id='ulone')
print(ul_matches)
# class 是 Python 关键字,所以要写成 class_
# Keyword form: class_ stands in for the reserved word "class".
print('class1', soup.find_all('p', class_='title'))
# Dict form: more general, works for any attribute name.
title_only = {'class': 'title'}
print('class2', soup.find_all('p', attrs=title_only))
# Several conditions at once: every key/value pair must match.
title_and_new = {'class': 'title', 'title': 'new'}
print('class3', soup.find_all('p', attrs=title_and_new))
# 用函数作为参数来筛选元素
def judgeTilte1(t):
    """Return True when ``t`` equals ``'title'``, otherwise False.

    Written to be passed as the ``class_`` filter of ``find_all``.

    Args:
        t: The value to test.

    Returns:
        bool: Always an explicit True/False, never an implicit None.
    """
    return t == 'title'
# Pass the predicate itself as the class_ filter.
tags_with_title_class = soup.find_all(class_=judgeTilte1)
print(tags_with_title_class)
# 用函数同时判断长度和内容(配合正则表达式)
import re #正则表达式
# Pattern used by judgeTilte2: matches any string containing "sis".
reg = re.compile("sis")


def judgeTilte2(t):
    """Return True when ``t`` is a 6-character string that contains ``'sis'``.

    Written to be passed as the ``class_`` filter of ``find_all``.

    Args:
        t: The value to test.

    Returns:
        bool: True only for a ``str`` of length 6 matching ``reg``.
    """
    # Reject non-strings up front: they can never match, and handing one to
    # the regex would raise TypeError.
    if not isinstance(t, str):
        return False
    return len(t) == 6 and reg.search(t) is not None
# Pass the length-and-pattern predicate as the class_ filter.
tags_with_sis_class = soup.find_all(class_=judgeTilte2)
print(tags_with_sis_class)
# Reading text from the first <p>, whose markup is:
#   <p class="title" name="dromouse">
#   <b>The Dormouse's story</b>
#   aaaaa
#   </p>
# .text joins every piece of text found inside the tag.
print(soup.find('p').text)
# .string is not an iterator. It is None here because this <p> has several
# child nodes (the line breaks, the <b> tag and "aaaaa"), so there is no
# single string to return.
print(soup.find('p').string)
# get_text() gives the same result as .text.
print(soup.find('p').get_text())
# find_all 可以加 limit 参数,限制返回结果的个数
# limit caps how many matches find_all() returns; here, the first two <a> tags.
first_two_links = soup.find_all('a', limit=2)
print(first_two_links)
# recursive=True 查找所有子孙节点(默认);recursive=False 只查找直接子节点
# With recursive=False only direct children of <body> are searched, so the
# <ul id="ultwo"> nested inside the <div> is not returned.
body_tag = soup.find_all('body')[0]
print(body_tag.find_all('ul', recursive=False))