BeautifulSoup解析 HTML标签Tag及属性attrs的常用方法

铁松溜达py

已于 2024-03-01 02:00:07 修改

阅读量2.1k

点赞数 27

文章标签： css html 前端 beautifulsoup python

于 2024-03-01 01:59:13 首次发布

本文链接：https://blog.csdn.net/book_dw5189/article/details/136384725

版权

# 使用 BeautifulSoup 解析 HTML 内容
# BeautifulSoup 对象表示整个解析树或文档，可以用来遍历、搜索和操作文档。
# 常用方法：
# find(name, attrs, recursive, text, **kwargs)：在文档中查找第一个符合条件的元素，并返回该元素的 Tag 对象。
# find_all(name, attrs, recursive, text, limit, **kwargs)：在文档中查找所有符合条件的元素，并返回一个列表。
# find_parent(name, attrs, **kwargs)：查找父元素并返回其 Tag 对象。
# find_next_sibling(name, attrs, string, **kwargs)：查找下一个同级元素并返回其 Tag 对象。
# get_text(separator, strip, types)：获取文档中的所有文本内容，并返回一个字符串。
# prettify(encoding, formatter)：将文档格式化后返回一个字符串，通常用于输出美观的HTML代码。
# select(selector)：使用 CSS 选择器语法来查找元素，并返回一个列表。
# select_one(selector)：使用 CSS 选择器语法来查找第一个符合条件的元素，并返回其 Tag 对象。
# 当使用 BeautifulSoup 的 find 方法查找标签元素时，可以根据不同的需求使用不同的方式来定位目标标签。下面是一些常用的方式和示例：
# 通过标签名称查找：tag = soup.find('a') # 查找第一个<a>标签
# 通过指定标签属性查找：tag = soup.find('a', {'class': 'example'}) # 查找第一个class为'example'的<a>标签
# 通过 CSS 类名查找：tag = soup.find(class_='example') # 查找第一个class为'example'的标签
# 通过 ID 查找：tag = soup.find(id='example') # 查找id为'example'的标签
# 通过正则表达式查找：text_node = soup.find(string=re.compile('example')) # 查找第一个内容包含'example'的文本节点（返回 NavigableString 而非标签，需先 import re；旧版本中该参数名为 text）
# 通过自定义函数查找：
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# tag = soup.find(has_class_but_no_id) # 查找具有class属性但没有id属性的任意标签
# 结合多个属性查找：tag = soup.find('a', {'class': 'example', 'id': 'link'}) # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 多个条件逻辑组合查找：tag = soup.find('a', class_='example', id='link') # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 使用 CSS 选择器语法查找：tag = soup.select_one('a.example#link') # 使用CSS选择器语法查找class为'example'且id为'link'的<a>标签
# 查找特定位置的标签：tag = soup.find_all('a')[2] # 获取第三个<a>标签（索引从0开始）
# 查找父节点中的特定子节点：
# parent_tag = soup.find('div', id='parent')
# child_tag = parent_tag.find('a') # 在id为'parent'的<div>标签中查找第一个<a>标签
# 查找特定文本：
# text_node = soup.find(string='Example text') # 查找第一个内容恰好等于 'Example text' 的文本节点；其所在标签可通过 text_node.parent 获取

# 示例 HTML 内容
# html_content = '''
# <div class="container">
# <p class="paragraph">Paragraph 1</p>
# <p class="paragraph">Paragraph 2</p>
# <div id="footer">Footer</div>
# </div>
# '''
# ****通过标签名称查找：paragraphs = soup.find_all('p')
# ****通过指定标签属性查找：soup.find_all('p', class_='paragraph')
# ****使用正则表达式查找所有以 "Para" 开头的文本：regex_paragraphs = soup.find_all(text=re.compile(r'^Para'))
# ****限制返回结果数量为 1：limited_results = soup.find_all('p', limit=1)
# ****使用 attrs 参数查找具有特定属性的标签：soup.find_all(attrs={'class': 'paragraph'})
# ****通过 CSS 类名查找：soup.find(class_='example') # 查找第一个class为'example'的标签
# ****通过 ID 查找：soup.find(id='example') # 查找id为'example'的标签

# 使用 select() 方法时，可以通过 CSS 选择器来查找特定的元素（返回列表）
# 通过类名查找元素： elements = soup.select('div.content')
# 通过 ID 查找元素： elements = soup.select('div#header')（只需要单个元素时可用 select_one）
# 通过标签名查找元素： elements = soup.select('a')
# 通过组合选择器查找元素： listitems = soup.select('ul.menu li') # 查找class为menu的ul元素下的所有li后代元素

# 字典推导式{key_expression: value_expression for item in iterable}
# numbers = [1, 2, 3, 4, 5]
# squared_dict = {x: x ** 2 for x in numbers}
# print(squared_dict)
# 获取 <a> 标签中的 href 和 title 属性的取值，存储在字典 attrs 中
# 在示例代码中，['href', 'title'] 是一个硬编码的固定列表，用于指定要提取的属性名。
# 最终生成一个包含 {'href': link.get('href'), 'title': link.get('title')} 的字典

from bs4 import BeautifulSoup

# Sample HTML document to parse: an outer <div class='httpweb'> holding two
# links and a nested <div class="child"> with two paragraphs.
html_content = """
<html>
<body>
<div class='httpweb' id='daohangweb'>
<a href="https://www.example.com" title="example">example.com Link sample</a>
<a href="https://www.guancha.cn/" title="guancha">guancha Link sample</a>
    <div class="child">
        <p>Paragraph 1</p>
        <p>Paragraph 2</p>
    </div>
</div>
</body>
</html>
"""

# 使用 BeautifulSoup 解析 HTML 内容
# BeautifulSoup 对象表示整个解析树或文档，可以用来遍历、搜索和操作文档。
# 常用方法： 
# find(name, attrs, recursive, text, **kwargs)：在文档中查找第一个符合条件的元素，并返回该元素的 Tag 对象。
# find_all(name, attrs, recursive, text, limit, **kwargs)：在文档中查找所有符合条件的元素，并返回一个列表。
# find_parent(name, attrs, **kwargs)：查找父元素并返回其 Tag 对象。
# find_next_sibling(name, attrs, string, **kwargs)：查找下一个同级元素并返回其 Tag 对象。
# get_text(separator, strip, types)：获取文档中的所有文本内容，并返回一个字符串。
# prettify(encoding, formatter)：将文档格式化后返回一个字符串，通常用于输出美观的HTML代码。
# select(selector)：使用 CSS 选择器语法来查找元素，并返回一个列表。
# select_one(selector)：使用 CSS 选择器语法来查找第一个符合条件的元素，并返回其 Tag 对象。
# 示例 HTML 内容
# html_content = '''
# <div class="container">
#     <p class="paragraph">Paragraph 1</p>
#     <p class="paragraph">Paragraph 2</p>
#     <div id="footer">Footer</div>
# </div>
# '''
# ****通过标签名称查找：paragraphs = soup.find_all('p')
# ****通过指定标签属性查找：soup.find_all('p', class_='paragraph')
# ****使用正则表达式查找所有以 "Para" 开头的文本：regex_paragraphs = soup.find_all(text=re.compile(r'^Para'))
# ****限制返回结果数量为 1：limited_results = soup.find_all('p', limit=1)
# ****使用 attrs 参数查找具有特定属性的标签：soup.find_all(attrs={'class': 'paragraph'})
# ****通过 CSS 类名查找：soup.find(class_='example')  # 查找第一个class为'example'的标签
# ****通过 ID 查找：soup.find(id='example')  # 查找id为'example'的标签
# ****通过自定义函数查找：
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# tag = soup.find(has_class_but_no_id)  # 查找具有class属性但没有id属性的任意标签

# 当使用 BeautifulSoup 的 find 方法查找标签元素时，可以根据不同的需求使用不同的方式来定位目标标签。下面是一些常用的方式和示例：
# 通过标签名称查找：tag = soup.find('a')  # 查找第一个<a>标签
# 通过指定标签属性查找：tag = soup.find('a', {'class': 'example'})  # 查找第一个class为'example'的<a>标签
# 通过 CSS 类名查找：tag = soup.find(class_='example')  # 查找第一个class为'example'的标签
# 通过 ID 查找：tag = soup.find(id='example')  # 查找id为'example'的标签
# 通过正则表达式查找：text_node = soup.find(string=re.compile('example'))  # 查找第一个内容包含'example'的文本节点（返回 NavigableString 而非标签，需先 import re；旧版本中该参数名为 text）
# 通过自定义函数查找：
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# tag = soup.find(has_class_but_no_id)  # 查找具有class属性但没有id属性的任意标签
# 结合多个属性查找：tag = soup.find('a', {'class': 'example', 'id': 'link'})  # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 多个条件逻辑组合查找：tag = soup.find('a', class_='example', id='link')  # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 使用 CSS 选择器语法查找：tag = soup.select_one('a.example#link')  # 使用CSS选择器语法查找class为'example'且id为'link'的<a>标签
# 查找特定位置的标签：tag = soup.find_all('a')[2]  # 获取第三个<a>标签（索引从0开始）
# 查找父节点中的特定子节点：
# parent_tag = soup.find('div', id='parent')
# child_tag = parent_tag.find('a')  # 在id为'parent'的<div>标签中查找第一个<a>标签
# 查找特定文本：
# text_node = soup.find(string='Example text')  # 查找第一个内容恰好等于 'Example text' 的文本节点；其所在标签可通过 text_node.parent 获取


# Parse the sample document with the 'html.parser' backend.
soup = BeautifulSoup(html_content, 'html.parser')

# Locate the first <div> whose class is "child".
child_div = soup.find('div', class_='child')

# .contents is the list of direct children, tags and text nodes alike
# (.parent navigates the other way, to the enclosing tag).
print("div 标签class_='child' child Contents:", child_div.contents)

# .children iterates over the same direct children one at a time.
print("div 标签class_='child' Children:")
for child in child_div.children:
    # The text nodes between the <p> tags hold a newline *plus* the markup's
    # indentation, so comparing against a bare '\n' never matches. Skip every
    # whitespace-only text node instead (bs4 text nodes are str subclasses).
    if isinstance(child, str) and not child.strip():
        continue
    print(child)

# Names of every tag below the document root, in document order. Nodes whose
# .name is falsy (the text nodes) are left out.
tagall = [node.name for node in soup.descendants if node.name]
print('获取标签的所有子孙节点:', tagall)

# Join every non-blank text node, trimmed of surrounding whitespace, into one
# "; "-separated string. .stripped_strings performs the strip-and-skip-empty
# step that would otherwise be spelled out by hand over .strings.
text_content = '; '.join(soup.stripped_strings)
print(type(text_content))
print("提取标签内的所有文本内容:", text_content)


# 使用 select() 方法时，可以通过 CSS 选择器来查找特定的元素（返回列表）
# 通过类名查找元素： elements = soup.select('div.content')
# 通过 ID 查找元素： elements = soup.select('div#header')（只需要单个元素时可用 select_one）
# 通过标签名查找元素： elements = soup.select('a')
# 通过组合选择器查找元素： listitems = soup.select('ul.menu li') # 查找class为menu的ul元素下的所有li后代元素

# CSS descendant combinator: every <a> nested anywhere inside a <div> with
# class "httpweb". Despite its name, `divs` therefore holds <a> tags.
divs = soup.select('div.httpweb a')
print("查找div.httpweb下的a签名")
# select() yields Tag objects only, never bare text nodes, so the results can
# be printed directly without a whitespace filter.
for anchor in divs:
    print(anchor)

# Tag对象表示 HTML/XML 文档中的一个标签元素，可以访问标签的属性、内容等信息。
# 常用方法：
# get(): 获取指定属性的值。p_tag.get('class')
# text: 获取标签内的文本内容。
# find(), find_all(): 在当前标签下查找子元素。
# First <a> tag in the document (attribute-style access is shorthand for
# find('a')) and the text it contains.
link = soup.a
text_string = link.get_text()
print(f'\n第一个a标签内的字符串:{text_string}\n')

# Every <a> tag in the document, in document order.
links = soup.find_all('a')

# 字典推导式{key_expression: value_expression for item in iterable}
# numbers = [1, 2, 3, 4, 5]
# squared_dict = {x: x ** 2 for x in numbers}
# print(squared_dict)
# 获取 <a> 标签中的 href 和 title 属性的取值，存储在字典 attrs 中
# 在示例代码中，['href', 'title'] 是一个硬编码的固定列表，用于指定要提取的属性名。
# 最终生成一个包含 {'href': link.get('href'), 'title': link.get('title')} 的字典
# 找到所有 <a> 标签


# Attribute names to read from each link. Tag.get() is used for the lookup, so
# an attribute that is absent comes back as None instead of raising.
wanted_attrs = ('href', 'title')

# One {attribute name: value} dictionary per <a> tag.
attrs_list = []

for link in links:
    print(f'打印标签的文本:{link.text}')
    # Build this link's dictionary from the fixed attribute names above.
    attrs = {name: link.get(name) for name in wanted_attrs}
    attrs_list.append(attrs)

# Show the collected attribute dictionaries.
print(attrs_list)

# Tag类是 BeautifulSoup 库中非常重要的类之一，用于表示 HTML 或 XML 文档中的一个标签元素。
# 通过 Tag对象，我们可以访问标签的属性、内容等信息。
# 以下是 Tag 类常用的一些方法和属性：
# name：获取标签的名称。
# attrs：获取标签的属性字典。
# string：获取标签内的文本内容，如果标签内只有一个 NavigableString 对象，则返回该字符串；如果有多个子节点，则返回 None。
# get(key, default)：获取指定属性的值，如果属性不存在则返回默认值。
# find(name, attrs, recursive, text, **kwargs)：在当前标签内查找第一个符合条件的元素，并返回其 Tag 对象。
# find_all(name, attrs, recursive, text, limit, **kwargs)：在当前标签内查找所有符合条件的元素，并返回一个列表。
# find_parent(name, attrs, **kwargs)：查找当前标签的父元素并返回其 Tag 对象。
# find_next_sibling(name, attrs, string, **kwargs)：查找当前标签的下一个同级元素并返回其 Tag 对象。
# get_text(separator, strip, types)：获取当前标签内的所有文本内容，并返回一个字符串。
# prettify(encoding, formatter)：将当前标签及其子孙元素格式化后返回一个字符串。

from bs4 import BeautifulSoup

# Second sample document: a page with a <title> and a container <div> holding
# a heading, two paragraphs and one link. Rebinding html_content replaces the
# markup assigned earlier in this script.
html_content = '''
<html>
<head>
    <title>示例页面</title>
</head>
<body>
    <div id="main-content" class="container">
        <h1>Welcome to BeautifulSoup</h1>
        <p class="description">BeautifulSoup is a Python library for parsing HTML and XML documents.</p>
        <p class="description">BeautifulSoup is good.</p>
        <a href="https://www.example.com">Visit our website</a>
    </div>
</body>
</html>
'''

# Build the parse tree for the current html_content.
soup = BeautifulSoup(html_content, 'html.parser')

# Attribute dictionary of the first <div>; soup.div is shorthand for
# soup.find('div').
div_tag = soup.div
div_attrs = div_tag.attrs
print("div标签的属性字典：", div_attrs)


# Text of the first <h1>; the .text property returns the same value as
# get_text().
h1_tag = soup.h1
h1_text = h1_tag.text
print("h1标题文本内容：", h1_text)

# Text of the first <p>. .string yields the text when the tag contains a
# single string child; with several child nodes it is None.
p_tag = soup.p
p_text = p_tag.string
print("p标签的文本内容：", p_text)


# "class" attribute of the first <div>, read straight from its attribute
# dictionary; a missing attribute gives None.
div_tag = soup.div
div_class = div_tag.attrs.get('class')
print("div标签的class属性值：", div_class)

# "class" attribute of the first <p>.
p_tag = soup.p
p_class = p_tag.attrs.get('class')
print("p标签的class属性值：", p_class)

# "href" attribute of the first <a>.
a_tag = soup.a
a_href = a_tag.attrs.get('href')
print("a标签的href属性值：", a_href)

# Tag类是 BeautifulSoup 库中非常重要的类之一，用于表示 HTML 或 XML 文档中的一个标签元素。
# 通过 Tag对象，我们可以访问标签的属性、内容等信息。
# 以下是 Tag 类常用的一些方法和属性：
# name：获取标签的名称。
# attrs：获取标签的属性字典。
# string：获取标签内的文本内容，如果标签内只有一个 NavigableString 对象，则返回该字符串；如果有多个子节点，则返回 None。
# get(key, default)：获取指定属性的值，如果属性不存在则返回默认值。
# find(name, attrs, recursive, text, **kwargs)：在当前标签内查找第一个符合条件的元素，并返回其 Tag 对象。
# find_all(name, attrs, recursive, text, limit, **kwargs)：在当前标签内查找所有符合条件的元素，并返回一个列表。
# find_parent(name, attrs, **kwargs)：查找当前标签的父元素并返回其 Tag 对象。
# find_next_sibling(name, attrs, string, **kwargs)：查找当前标签的下一个同级元素并返回其 Tag 对象。
# get_text(separator, strip, types)：获取当前标签内的所有文本内容，并返回一个字符串。
# prettify(encoding, formatter)：将当前标签及其子孙元素格式化后返回一个字符串。


# 当使用 BeautifulSoup 的 find 方法查找标签元素时，可以根据不同的需求使用不同的方式来定位目标标签。下面是一些常用的方式和示例：
# 通过标签名称查找：tag = soup.find('a')  # 查找第一个<a>标签
# 通过指定标签属性查找：tag = soup.find('a', {'class': 'example'})  # 查找第一个class为'example'的<a>标签
# 通过 CSS 类名查找：tag = soup.find(class_='example')  # 查找第一个class为'example'的标签
# 通过 ID 查找：tag = soup.find(id='example')  # 查找id为'example'的标签
# 通过正则表达式查找：text_node = soup.find(string=re.compile('example'))  # 查找第一个内容包含'example'的文本节点（返回 NavigableString 而非标签，需先 import re；旧版本中该参数名为 text）
# 通过自定义函数查找：
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# tag = soup.find(has_class_but_no_id)  # 查找具有class属性但没有id属性的任意标签
# 结合多个属性查找：tag = soup.find('a', {'class': 'example', 'id': 'link'})  # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 多个条件逻辑组合查找：tag = soup.find('a', class_='example', id='link')  # 查找第一个同时具有class为'example'和id为'link'的<a>标签
# 使用 CSS 选择器语法查找：tag = soup.select_one('a.example#link')  # 使用CSS选择器语法查找class为'example'且id为'link'的<a>标签
# 查找特定位置的标签：tag = soup.find_all('a')[2]  # 获取第三个<a>标签（索引从0开始）
# 查找父节点中的特定子节点：
# parent_tag = soup.find('div', id='parent')
# child_tag = parent_tag.find('a')  # 在id为'parent'的<div>标签中查找第一个<a>标签
# 查找特定文本：
# text_node = soup.find(string='Example text')  # 查找第一个内容恰好等于 'Example text' 的文本节点；其所在标签可通过 text_node.parent 获取