from bs4 import BeautifulSoup
import requests
# Download the iQiyi home page. An explicit timeout is passed because requests
# waits indefinitely by default, so a stalled connection would hang the script.
req = requests.get('http://www.iqiyi.com/', timeout=10)
# Decode the raw response bytes as UTF-8 text.
ret = req.content.decode('utf-8')
# print(ret)
# Parse the HTML with BeautifulSoup.
# Alternative parser (requires the lxml package):
# soup = BeautifulSoup(ret, 'lxml')
soup = BeautifulSoup(ret, 'html.parser')  # standard-library parser, no extra dependency
pret = soup.prettify()  # re-indent the markup so the printed output is readable
print(pret)
# Walk down from the document root to the <body> tag.
body = soup.html.body
# Navigating the parsed tree by chaining tag names.
# body = soup.html.body.div.div.div.div.div.div.img
# .string is only meaningful when the tag contains no nested child tags.
# print(body.string)
# .strings returns an iterable of the text fragments; loop over it to print them.
content = soup.html.body.strings
# .children iterates over the direct children of <body>.
child = soup.html.body.children
print(child)
# .text collects all of the text contained in the tag.
text = body.div.div.text
print(text)
# Parent node, ancestor nodes, and sibling nodes.
print(body.div.parent)
print(body.div.parents)
# The attribute is spelled `next_sibling`. Beautiful Soup treats an unknown
# attribute name as a child-tag lookup, so the former misspelling
# (`next_sibiling`) searched for a <next_sibiling> tag instead of returning
# the following sibling.
print(body.div.div.next_sibling)
# find_all returns every matching descendant -- here, all <span> tags.
print(body.find_all('span'))
# Passing a list of names matches tags with any of those names.
print(body.find_all(['span', 'a']))
# Filter on the CSS class together with additional attribute values.
print(body.find_all(class_='title-txt', attrs={'name': 'hahah'}))
# find returns only the first match, or None when nothing matches. The page
# markup is outside this script's control, so guard against a missing element
# instead of crashing with AttributeError.
title = body.find(class_='title-txt')
print(title.text if title is not None else None)
# Two equivalent ways to read an attribute of the first <a> tag. `.get` is used
# so that an anchor without an `onfocus` attribute prints None rather than
# raising KeyError.
first_link = body.find('a')
print(first_link.get('onfocus') if first_link is not None else None)
print(first_link.attrs.get('onfocus') if first_link is not None else None)
# Cap the number of results returned.
print(body.find_all('a', limit=2))
# Disabled alternative example: print the href of every link found under the
# element with id 'content' on a runoob.com page.
# url = 'http://www.runoob.com/python/python-100-examples.html'
# header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# req = requests.get(url, headers=header)
# soup = BeautifulSoup(req.content.decode('utf-8'), 'html.parser')
# print(soup)
# content = soup.find(id='content').ul.find_all('a')
# for i in content:
#     print(i['href'])
# CSS selectors.
# 1. Select by tag name.
print(soup.select('a'))
# Select by element id.
print(soup.select('#adClick'))
# 2. Combined selectors: the first class selects the outer element, the second
#    selects elements nested inside it.
print(soup.select('.nav-list-item .nav-list-link'))
# 3. Read an attribute value from each selected element.
#    The loop body must be indented; it had lost its indentation, which made
#    the file fail to parse.
for link in soup.select('.nav-list-item .nav-list-link'):
    print(link['rseat'])
# Using BeautifulSoup to parse HTML
# (Source page footer: latest recommended article published 2024-09-05 21:50:39)