bs4学习

https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

# coding: utf-8
from bs4 import BeautifulSoup

# Walkthrough of core BeautifulSoup features (navigation, find_all, CSS select),
# following the examples from the official zh_CN documentation.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
    <body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# print(soup.prettify())
print(soup.title)   # attribute access returns the first matching tag: the <title> tag
print(soup.body.b)  # navigation can be chained; soup.p.b reaches the same <b> tag
print(soup.find_all('a'))  # find_all returns a list of every matching <a> tag
'''
for item in soup.find_all('a'):
    print(item['href'])
    print(item.get('href'))   # 获取a标签内的所有url
'''
# print(soup.contents)  # a list of the document's top-level nodes (whole HTML content)
# print(len(soup.contents))  # 2
# print(soup.get_text())  # text content only, with all tags stripped
print(soup.title.parent)  # parent navigation: prints <head><title>The Dormouse's story</title></head>

'''
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>")
print(sibling_soup.b.next_sibling)   # <c>text2</c>   兄弟节点(父节点相同)
sibling_soup.c.previous_sibling
# <b>text1</b>
'''
print(soup.find_all(['a', 'b']))  # a list argument matches any of the tag names
print(soup.find_all(id='link1'))  # keyword arguments filter on tag attributes
print(soup.find_all('a', class_='sister'))  # class_ (trailing underscore) avoids the Python keyword
print(soup.find_all("a", attrs={"class": "sister"}))  # equivalent attrs-dict form
# soup.find_all("a", limit=2)

# CSS-selector based search via select()
print(soup.select('title'))  # [<title>The Dormouse's story</title>]
print(soup.select('body a'))  # descendant combinator: searches level by level
print(soup.select('html head title'))  # [<title>The Dormouse's story</title>]
# The child combinator '>' matches direct children only
print(soup.select('html > title'))  # [] -- title is not a direct child of html
print(soup.select('head > title'))  # [<title>The Dormouse's story</title>]
print(soup.select('p > #link1'))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Select by CSS class
print(soup.select('.sister'))
print(soup.select('[class~=sister]'))
# Select by id
print(soup.select('#link2'))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.select('a#link1'))
# soup.select("#link1,#link2")
print(soup.select('a[href]'))  # attribute-presence selector

'''
# print(soup.prettify())
prettify() 方法将Beautiful Soup的文档树格式化后以Unicode编码输出,每个XML/HTML标签都独占一行
'''
'''
Beautiful Soup为不同的解析器提供了相同的接口,但解析器本身时有区别的.
同一篇文档被不同的解析器解析后可能会生成不同结构的树型文档.
'''
'''
默认情况下,Beautiful Soup会将当前文档作为HTML格式解析,
如果要解析XML文档,要在 BeautifulSoup 构造方法中加入第二个参数 “xml”:
soup = BeautifulSoup(markup, "xml")
当然,还需要 安装lxml
'''

# coding: utf-8
# Quick tour of the most common BeautifulSoup accessors using the
# standard-library "html.parser" backend.
from bs4 import BeautifulSoup

markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
dom = BeautifulSoup(markup, "html.parser")

print(dom.title)          # first <title> tag via attribute-style access
print(dom.title.name)     # the tag's own name ("title")
print(dom.a)              # first <a> tag in the document
print(dom.p['class'])     # attribute lookup on the first <p> tag
print(dom.find_all('a'))  # every <a> tag, as a list

# Print each link's href in document order.
for anchor in dom.find_all('a'):
    print(anchor.get('href'))

# All human-readable text with the markup stripped.
print(dom.get_text())

解析器:
Python标准库: BeautifulSoup(markup, "html.parser")

lxml HTML 解析器: BeautifulSoup(markup, "lxml")  # 推荐使用lxml作为解析器,因为效率更高

lxml XML 解析器:
BeautifulSoup(markup, ["lxml-xml"])
BeautifulSoup(markup, "xml")

html5lib: BeautifulSoup(markup, "html5lib")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值