```python
import requests
from bs4 import BeautifulSoup

# ######################## Step 1: send the request, fetch the raw page ########################
# The site serves gb2312/gbk-encoded bytes, so decode res.content explicitly
# instead of relying on requests' encoding guess.
res = requests.get(url="https://www.autohome.com.cn/news/")
page_html = res.content.decode('gbk')

# ######################## Step 2: narrow down to the data we want ########################
# BeautifulSoup4 ("pip install BeautifulSoup4") parses an HTML string so it can
# be searched by tag name and attributes.
soup = BeautifulSoup(page_html, "html.parser")

# 1. Locate the single div with id=auto-channel-lazyload-article to shrink the search area.
news_area = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article"})

# 2. Collect every li tag inside that div (many matches).
news_items = news_area.find_all(name='li')

# 3. For each li, print the text of its p tag, when one exists.
for item in news_items:
    paragraph = item.find(name="p")
    # find() returns None when the li contains no p tag; skip those entries.
    if not paragraph:
        continue
    print(paragraph.text)
    print('==============')
```
#### 1.3.2 BeautifulSoup4模块
```
pip install BeautifulSoup4
```
专门用于帮助我们在一个HTML格式的字符串中提取我们想要的数据。
```python
text = """
<div>
<h1 id="hello">123</h1>
<h1>asdfasdfasdf</h1>
<p>asdfasdf</p>
<ul> <li>标题1</li> </ul>
<div>
<h2>fff</h2>
<h2>fff</h2>
<ul> <li class='x1'>标题1</li> </ul>
<ul> <li class='x1'>标题1</li> </ul>
<ul> <li class='x1'>标题1</li> </ul>
</div>
<div id="comment">
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
<ul><li>标题1</li></ul>
</div>
<img src="xxxxxx" />
</div>"""
from bs4 import BeautifulSoup

soup_object = BeautifulSoup(text, "html.parser")

# find(): returns the FIRST matching tag (or None).
v1 = soup_object.find(name="div", attrs={"id": "comment"})  # a single Tag
v2 = v1.find_all(name="li")  # list of Tags inside that div

# find_all(): returns EVERY matching tag in the document.
v3 = soup_object.find_all(name="li", attrs={"class": "x1"})  # [Tag, Tag, Tag]

# Grab a specific tag and read its text and attributes, e.g. <h1 id="hello">123</h1>
v4 = soup_object.find(name="h1", attrs={"id": "hello"})
print(v4.text)
# BUG FIX: this h1 only carries an "id" attribute, so v4.attrs["src"] and
# v4.attrs["name"] raised KeyError. Use .get() so a missing attribute
# prints None instead of crashing the demo.
print(v4.attrs.get("src"))
print(v4.attrs.get("name"))
```
案例:
```python
import re
import os
import shutil
import requests
from bs4 import BeautifulSoup
FILE_PATH = "files"
def download_image(file_path, url):
    """Download *url* and write the response body to *file_path* in binary mode.

    Creates the destination directory first if it does not exist yet.
    """
    res = requests.get(
        url=url
    )
    # Create the directory file_path actually lives in, not the FILE_PATH
    # constant: the original check broke for paths outside FILE_PATH.
    # exist_ok=True also avoids the check-then-create race of the old
    # os.path.exists() / os.makedirs() pair.
    target_dir = os.path.dirname(file_path) or FILE_PATH
    os.makedirs(target_dir, exist_ok=True)
    with open(file_path, mode='wb') as f:
        f.write(res.content)
def run():
    """Scrape one page of the phone listing, append one line per product to
    db.txt ("title|price|comments|local_image_path|image_url"), and download
    each product image into FILE_PATH.
    """
    if os.path.exists(FILE_PATH):
        # Start from a clean slate: drop images left over from a previous run.
        shutil.rmtree(FILE_PATH)
    res = requests.get(url="http://s.10010.com/hebei/mobilelist-0-0-0-0-0-0-0-0-177-0-0-p2/")
    soup_object = BeautifulSoup(res.text, 'html.parser')
    goods_object_list = soup_object.find_all(name='li', attrs={"class": "goodsLi"})
    # "with" guarantees db.txt is closed even if a parse or download step
    # raises mid-loop (the original leaked the handle on error).
    with open("db.txt", mode='a', encoding='utf-8') as file_object:
        for goods in goods_object_list:
            title = goods.find(name='p', attrs={"class": "mobileGoodsName"}).find(name='a').text
            price = goods.find(name="label", attrs={'class': "priceD"}).text
            # Raw strings for the patterns: "\d" in a plain literal is an
            # invalid escape (SyntaxWarning on modern Python).
            price_number = int(re.findall(r"¥(\d+)", price)[0])
            comment = goods.find(name="p", attrs={'class': "evalNum"}).text
            comment_number = int(re.findall(r"已有(\d+)人评价", comment)[0])
            # Images are lazy-loaded, so the real URL lives in data-original, not src.
            image_url = goods.find(name='img').attrs['data-original']
            file_name = image_url.rsplit("/", maxsplit=1)[-1]
            file_path = os.path.join(FILE_PATH, file_name)
            # Fetch and store the product image next to the db.txt record.
            download_image(file_path, image_url)
            line = "{}|{}|{}|{}|{}\n".format(title, price_number, comment_number, file_path, image_url)
            file_object.write(line)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    run()
```