```python
import requests
from bs4 import BeautifulSoup

# ######################## Step 1: send the request, fetch the raw page ########################
# The site serves gb2312/gbk-encoded bytes, so decode res.content explicitly
# instead of relying on requests' encoding guess.
res = requests.get(url="https://www.autohome.com.cn/news/")
page_html = res.content.decode('gbk')

# ######################## Step 2: narrow down to the data we want ########################
# BeautifulSoup4 ("pip install BeautifulSoup4") parses an HTML string so it can
# be searched by tag name and attributes.
soup = BeautifulSoup(page_html, "html.parser")

# 1. Locate the single div with id=auto-channel-lazyload-article to shrink the search area.
news_area = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article"})

# 2. Collect every li tag inside that div (many matches).
news_items = news_area.find_all(name='li')

# 3. For each li, print the text of its p tag, when one exists.
for item in news_items:
    paragraph = item.find(name="p")
    # find() returns None when the li contains no p tag; skip those entries.
    if not paragraph:
        continue
    print(paragraph.text)
    print('==============')
```
#### 1.3.2 BeautifulSoup4模块
```
pip install BeautifulSoup4
```
专门用于帮助我们在一个HTML格式的字符串中提取我们想要的数据。
```python
text = """
<div>
<h1 id="hello">123</h1>
<h1>asdfasdfasdf</h1>
<p>asdfasdf</p>
<ul> <li>标题1</li> </ul>
<div>
<h2>fff</h2>
<h2>fff</h2>
<ul> <li class='x1'>标题1</li> </ul>
<ul> <li class='x1'>标题1</li> </ul>
<ul> <li class='x1'>标题1</li> </ul>
</div>
<div id="comment">
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
</div>
<img src="xxxxxx" />
</div>"""
from bs4 import BeautifulSoup

soup_object = BeautifulSoup(text, "html.parser")

# find(): returns the FIRST matching tag (or None).
v1 = soup_object.find(name="div", attrs={"id": "comment"})  # a single Tag
v2 = v1.find_all(name="li")  # list of Tags inside that div

# find_all(): returns EVERY matching tag in the document.
v3 = soup_object.find_all(name="li", attrs={"class": "x1"})  # [Tag, Tag, Tag]

# Grab a specific tag and read its text and attributes, e.g. <h1 id="hello">123</h1>
v4 = soup_object.find(name="h1", attrs={"id": "hello"})
print(v4.text)
# BUG FIX: this h1 only carries an "id" attribute, so v4.attrs["src"] and
# v4.attrs["name"] raised KeyError. Use .get() so a missing attribute
# prints None instead of crashing the demo.
print(v4.attrs.get("src"))
print(v4.attrs.get("name"))
```
案例:
```python
import re
import os
import shutil
import requests
from bs4 import BeautifulSoup
FILE_PATH = "files"
def download_image(file_path, url):
    """Download *url* and write the response body to *file_path* in binary mode.

    Creates the destination directory first if it does not exist yet.
    """
    res = requests.get(
        url=url
    )
    # Create the directory file_path actually lives in, not the FILE_PATH
    # constant: the original check broke for paths outside FILE_PATH.
    # exist_ok=True also avoids the check-then-create race of the old
    # os.path.exists() / os.makedirs() pair.
    target_dir = os.path.dirname(file_path) or FILE_PATH
    os.makedirs(target_dir, exist_ok=True)
    with open(file_path, mode='wb') as f:
        f.write(res.content)
def run():
    """Scrape one page of the phone listing, append one line per product to
    db.txt ("title|price|comments|local_image_path|image_url"), and download
    each product image into FILE_PATH.
    """
    if os.path.exists(FILE_PATH):
        # Start from a clean slate: drop images left over from a previous run.
        shutil.rmtree(FILE_PATH)
    res = requests.get(url="http://s.10010.com/hebei/mobilelist-0-0-0-0-0-0-0-0-177-0-0-p2/")
    soup_object = BeautifulSoup(res.text, 'html.parser')
    goods_object_list = soup_object.find_all(name='li', attrs={"class": "goodsLi"})
    # "with" guarantees db.txt is closed even if a parse or download step
    # raises mid-loop (the original leaked the handle on error).
    with open("db.txt", mode='a', encoding='utf-8') as file_object:
        for goods in goods_object_list:
            title = goods.find(name='p', attrs={"class": "mobileGoodsName"}).find(name='a').text
            price = goods.find(name="label", attrs={'class': "priceD"}).text
            # Raw strings for the patterns: "\d" in a plain literal is an
            # invalid escape (SyntaxWarning on modern Python).
            price_number = int(re.findall(r"¥(\d+)", price)[0])
            comment = goods.find(name="p", attrs={'class': "evalNum"}).text
            comment_number = int(re.findall(r"已有(\d+)人评价", comment)[0])
            # Images are lazy-loaded, so the real URL lives in data-original, not src.
            image_url = goods.find(name='img').attrs['data-original']
            file_name = image_url.rsplit("/", maxsplit=1)[-1]
            file_path = os.path.join(FILE_PATH, file_name)
            # Fetch and store the product image next to the db.txt record.
            download_image(file_path, image_url)
            line = "{}|{}|{}|{}|{}\n".format(title, price_number, comment_number, file_path, image_url)
            file_object.write(line)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    run()
```