The requests package and BeautifulSoup4


```python
import requests
from bs4 import BeautifulSoup

# ########################## Step 1: send the request and fetch the data ##########################
# The page is served as gb2312 -> GBK-encoded data
res = requests.get(url="https://www.autohome.com.cn/news/")

# requests can handle the decoding for us internally:
# res.encoding = 'gbk'
# print(res.text)

# Or decode the raw response bytes ourselves
data = res.content.decode('gbk')
# print(data)

# ########################## Step 2: use distinguishing features to narrow down the data ##########################
"""
To pull the data you want out of an HTML-formatted string, install a third-party
module built specifically for processing HTML data:
    pip install BeautifulSoup4
"""
# Parse the whole HTML string
soup_object = BeautifulSoup(data, "html.parser")

# 1. Find the div tag with id=auto-channel-lazyload-article to narrow the search area (one match)
new_area_object = soup_object.find(name='div', attrs={"id": "auto-channel-lazyload-article"})
# print(new_area_object)

# 2. Within that area, find all of its li tags (many matches)
li_area_object_list = new_area_object.find_all(name='li')

# 3. Loop over each li tag and read the content of its p tag.
for li_object in li_area_object_list:
    # Look for a p tag inside the li tag; find() returns None if there is none
    p_object = li_object.find(name="p")
    # Skip li tags that contain no p tag
    if not p_object:
        continue
    # Get the string content inside the p tag
    print(p_object.text)
    print('==============')
```
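
As the commented-out lines above hint, requests can do the decoding itself once it is told the encoding. A minimal sketch of the two equivalent paths, assuming the page is still served as GBK:

```python
import requests

res = requests.get(url="https://www.autohome.com.cn/news/")

# Option 1: set the encoding and let requests decode the body
res.encoding = 'gbk'
html_from_text = res.text

# Option 2: decode the raw bytes manually
html_from_bytes = res.content.decode('gbk')

# Both should yield the same string for a well-formed GBK page
print(html_from_text == html_from_bytes)
```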



#### 1.3.2 The BeautifulSoup4 module

```
pip install BeautifulSoup4
```

A module built specifically to help us extract the data we want from an HTML-formatted string.

```python
text = """
<div>
	<h1 id="hello" src="xx" name="wupeiqi">123</h1>
	<h1>asdfasdfasdf</h1>
	<p>asdfasdf</p>
	<ul>  <li>标题1</li>   </ul>
	<div>
		<h2>fff</h2>
		<h2>fff</h2>
		<ul>  <li class='x1'>标题1</li>   </ul>
		<ul>  <li class='x1'>标题1</li>   </ul>
		<ul>  <li class='x1'>标题1</li>   </ul>
	</div>
	<div id="comment">
		<ul><li>标题1</li></ul>
		<ul><li>标题1</li></ul>
		<ul><li>标题1</li></ul>
		<ul><li>标题1</li></ul>
		<ul><li>标题1</li></ul>
	</div>
	<img src="xxxxxx" />
</div>"""

from bs4 import BeautifulSoup

soup_object = BeautifulSoup(text, "html.parser")

# Find the first match
v1 = soup_object.find(name="div", attrs={"id": "comment"})  # a Tag
v2 = v1.find_all(name="li")  # a list of Tags

# Find all matches
v3 = soup_object.find_all(name="li", attrs={"class": "x1"})  # a list of Tags

# Grab a specific tag: <h1 id="hello" src="xx" name="wupeiqi">123</h1>
v4 = soup_object.find(name="h1", attrs={"id": "hello"})  # a Tag
print(v4.text)           # inner text: "123"
print(v4.attrs["src"])   # attribute value: "xx"
print(v4.attrs["name"])  # attribute value: "wupeiqi"
```
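
One thing to keep in mind: `find()` returns `None` when nothing matches, so chaining calls on a missing tag raises `AttributeError`. A minimal defensive sketch (the id here is deliberately one that does not exist):

```python
from bs4 import BeautifulSoup

soup_object = BeautifulSoup("<div><p>hello</p></div>", "html.parser")

# find() returns None on no match, so guard before chaining further calls
area = soup_object.find(name="div", attrs={"id": "no-such-id"})
if area is None:
    print("region not found, skipping")
else:
    for li in area.find_all(name="li"):
        print(li.text)
```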

Example:


```python
import re
import os
import shutil
import requests
from bs4 import BeautifulSoup

FILE_PATH = "files"


def download_image(file_path, url):
    res = requests.get(url=url)
    # Create the files directory if it does not exist yet
    if not os.path.exists(FILE_PATH):
        os.makedirs(FILE_PATH)

    with open(file_path, mode='wb') as f:
        f.write(res.content)


def run():
    if os.path.exists(FILE_PATH):
        # Remove anything left over from a previous run
        shutil.rmtree(FILE_PATH)

    res = requests.get(url="http://s.10010.com/hebei/mobilelist-0-0-0-0-0-0-0-0-177-0-0-p2/")
    soup_object = BeautifulSoup(res.text, 'html.parser')
    goods_object_list = soup_object.find_all(name='li', attrs={"class": "goodsLi"})

    file_object = open("db.txt", mode='a', encoding='utf-8')

    for goods in goods_object_list:
        title = goods.find(name='p', attrs={"class": "mobileGoodsName"}).find(name='a').text
        price = goods.find(name="label", attrs={'class': "priceD"}).text
        price_number = int(re.findall(r"¥(\d+)", price)[0])
        comment = goods.find(name="p", attrs={'class': "evalNum"}).text
        comment_number = int(re.findall(r"已有(\d+)人评价", comment)[0])  # pattern matches the site's Chinese "N reviews" text
        image_url = goods.find(name='img').attrs['data-original']

        file_name = image_url.rsplit("/", maxsplit=1)[-1]
        file_path = os.path.join(FILE_PATH, file_name)
        
        # Download the product image
        download_image(file_path, image_url)

        line = "{}|{}|{}|{}|{}\n".format(title, price_number, comment_number, file_path, image_url)
        file_object.write(line)

    file_object.close()


if __name__ == '__main__':
    run()
```
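
Since `db.txt` holds one pipe-delimited record per line, reading it back is straightforward. A minimal sketch, assuming titles never contain a `|` character (field order follows the `format()` call above):

```python
# Read the pipe-delimited records written by run() back into dicts
with open("db.txt", mode='r', encoding='utf-8') as f:
    for line in f:
        title, price, comments, file_path, image_url = line.strip().split("|")
        record = {
            "title": title,
            "price": int(price),
            "comments": int(comments),
            "file_path": file_path,
            "image_url": image_url,
        }
        print(record)
```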
