自动爬取图片并保存

be368

已于 2022-10-24 20:05:11 修改

阅读量501

点赞数

文章标签： 1024程序员节

于 2022-10-24 19:57:50 首次发布

本文链接：https://blog.csdn.net/qq_67528799/article/details/127499511

版权

一.准备工作

用 Python 来实现对必应（Bing）图片搜索结果的爬取并保存，以情绪图片为例

二.代码实现

这次的爬取主要用了如下的库（其中 requests 和 bs4 是第三方库，re、time、os 为 Python 标准库）

import re
import time
import requests
from bs4 import BeautifulSoup
import os

第一部分：获取网页内容

# Address of the Bing image-search result page to scrape.
baseurl = 'https://cn.bing.com/images/search?q=%E6%83%85%E7%BB%AA%E5%9B%BE%E7%89%87&qpvt=%e6%83%85%e7%bb%aa%e5%9b%be%e7%89%87&form=IGRE&first=1&cw=418&ch=652&tsc=ImageBasicHover'
# Browser-like User-Agent header sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
}
# Download the page; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(baseurl, headers=head, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page.
response.raise_for_status()
html = response.text  # response body decoded to text

第二部分: 解析网页

def getdata(baseurl):
    """Return the absolute image URLs found on the page at ``baseurl``.

    The page is downloaded with a browser-like User-Agent, every ``<img>``
    tag is collected with BeautifulSoup, and a regular expression keeps only
    ``http``/``https`` links, because inline ``data:`` URIs and relative
    paths cannot be passed straight to ``requests.get``.

    Args:
        baseurl: Address of the search-result page to scan.

    Returns:
        A list of image URLs in document order (empty if none are found).
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
    }
    html = requests.get(baseurl, headers=head, timeout=10).text
    Img = re.compile(r'^https?://')  # accept only directly downloadable links
    soup = BeautifulSoup(html, "html.parser")  # parse the HTML
    data = []  # collected image links
    for item in soup.find_all('img'):
        # Some pages keep a lazily loaded image's address in data-src rather
        # than src, so take the first of the two that holds a usable link.
        for attr in ('src', 'data-src'):
            link = item.get(attr, '')
            if Img.match(link):
                data.append(link)
                break
    # Return the whole list after the scan; returning inside the loop would
    # hand back a single string and the caller would iterate its characters.
    return data
 
    # print(i)

第三部分：保存图片

    # Download every image link returned by getdata() and save each one as
    # path<N>.jpg in the current working directory. enumerate() supplies the
    # running file number, starting at 1.
    for i, m in enumerate(
            getdata(
                baseurl='https://cn.bing.com/images/search?q=%E6%83%85%E7%BB%AA%E5%9B%BE%E7%89%87&qpvt=%e6%83%85%e7%bb%aa%e5%9b%be%e7%89%87&form=IGRE&first=1&cw=418&ch=652&tsc=ImageBasicHover'),
            start=1):
        resp = requests.get(m, timeout=10)  # fetch the image itself
        byte = resp.content  # raw bytes of the response body
        print(os.getcwd())  # show the directory the file is written to
        # img_path = os.path.join(m)
        with open("path{}.jpg".format(i), "wb") as f:  # write the image file
            f.write(byte)
        # Pause between downloads, after the file has been closed.
        time.sleep(0.5)
        print("第{}张图片爬取成功!".format(i))

运行截图