读取s3图片并保存至excel_读取s3图片python-CSDN博客

本文链接：https://blog.csdn.net/MusicDancing/article/details/128190047

1. 构建Client类，实现图片读取

1.1 导包&config

client.py

import os
import base64
import numpy as np
import pandas as pd
import cv2
import boto3
# Install dependencies: pip3 install opencv-python boto3

# Keyword arguments that Client.__init__ unpacks into boto3.client('s3', ...).
# NOTE(review): the access key and secret are hard-coded in source; presumably
# placeholders for the article -- confirm, and never commit real credentials.
# The bucket is not configured here: Client.read_image receives it per call
# (an "image_bucket": "prod-barwaqt-image" entry was left disabled).
config = dict(
    region_name="us-east-1",
    endpoint_url="https://s3.rapidcompute.com",
    aws_access_key_id="rcus_bd-prod",
    aws_secret_access_key="OgRKm6h...2HdbKA6s",
)

1.2 类实现

class Client:
    """Small wrapper around a boto3 S3 client that returns objects as images."""

    def __init__(self):
        # The connection keyword arguments come from the module-level config.
        self.config = config
        self.client = boto3.client('s3', **self.config)

    def read_image(self, bucket_name, image_key):
        """Download one object and decode it with cv2.imdecode.

        Args:
            bucket_name: Bucket that holds the object.
            image_key: Key of the object inside the bucket.

        Returns:
            ``('OK', image)`` where ``image`` is the decoded array, or
            ``('ERROR', 'READ_IMAGE_ERROR')`` when the download or the
            decoding fails. No exception is propagated to the caller.
        """
        try:
            response = self.client.get_object(Bucket=bucket_name, Key=image_key)
            raw = response.get('Body').read()
            image = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR)
        except Exception:
            # Best-effort contract: every failure is reported through the
            # status tuple so batch callers can keep going.
            return 'ERROR', 'READ_IMAGE_ERROR'

        if image is None:
            # cv2.imdecode reports undecodable bytes by returning None instead
            # of raising; without this check callers would get ('OK', None).
            return 'ERROR', 'READ_IMAGE_ERROR'
        return 'OK', image

    def read_image_b64(self, bucket_name, image_key):
        """Download one object and return it re-encoded as base64 JPEG text.

        Args:
            bucket_name: Bucket that holds the object.
            image_key: Key of the object inside the bucket.

        Returns:
            ``('OK', text)`` where ``text`` is the base64 string of the image
            re-encoded as '.jpg', or the ``('ERROR', ...)`` tuple produced by
            read_image when the image could not be obtained or encoded.
        """
        status, image = self.read_image(bucket_name, image_key)
        if status != 'OK':
            return status, image

        encoded_ok, buffer = cv2.imencode('.jpg', image)
        if not encoded_ok:
            # The first value returned by cv2.imencode is its success flag.
            return 'ERROR', 'READ_IMAGE_ERROR'
        return status, base64.b64encode(buffer).decode()

2. 图片下载至本地文件夹mydir

2.1 图片下载

def save_img(img_path, file_name='test.jpg'):
    """Download one image from the 'prod-barwaqt-image' bucket into ./mydir.

    The download goes through the module-level ``client`` object, which must
    exist before this function is called. Nothing is written when the
    download fails.

    Args:
        img_path: Object key of the image. Its last '/'-separated component
            becomes the local file name.
        file_name: Unused. Kept only so existing callers keep working; the
            local name is always derived from ``img_path``.
    """
    status, img = client.read_image('prod-barwaqt-image', img_path)
    if status != 'OK':
        return

    # cv2.imwrite does not create missing directories, so make sure the
    # target folder exists before writing.
    os.makedirs('mydir', exist_ok=True)
    # Derive the name from the key that was passed in (the previous code read
    # an unrelated name, `line`, that is not defined in this function).
    target = os.path.join('mydir', img_path.split('/')[-1])
    cv2.imwrite(target, img)

2.2 测试

2.2.1 单图下载

# Single-image download: build the S3 client, then fetch one key via save_img.
client = Client()  # save_img reads this module-level name
img_path = 'prod/18/be56/18be564c36b05d730257dbbe87ede614.jpg'
save_img(img_path)

2.2.2 批量下载

# Batch download: save_img is called once per value of the 's3_path' column.
client = Client()
# NOTE(review): the sample listing of img_path.csv looks tab-separated; if the
# real file is, read_csv needs sep='\t' -- confirm against the actual file.
df = pd.read_csv('img_path.csv')
df['s3_path'].apply(save_img)

img_path.csv 如下：

user_account_id	s3_path
210805010001565250	prod/12/e122/12e122b5328e1b5007b3de5c76e0bf02.jpg
210812010008799851	prod/26/92b7/2692b7c55bb71581586a6392926c0a24.jpg

2.2.3 多进程并行下载（pandarallel）

from pandarallel import pandarallel

# Start pandarallel with 10 workers and a progress bar.
pandarallel.initialize(nb_workers=10, use_memory_fs=False, progress_bar=True)
client = Client()  # client must be defined out here, at module level
# The keys live in the 's3_path' column of df (the same column the plain
# apply-based batch download uses); 'img_path' is not a column of that frame.
df['s3_path'].parallel_apply(save_img)

2.2.4 本地图片批量压缩

def img_batch_zip(input_dir, output_dir, jpeg_quality=95):
    """Resize every readable image in input_dir and save it as .jpg.

    Each entry of ``input_dir`` is read with cv2.imread, resized with
    img_pad, and written to ``output_dir`` under the same base name with a
    '.jpg' extension. Entries that cannot be read as images are skipped, and
    a failure on one file does not stop the batch.

    Args:
        input_dir: Folder holding the source images (with or without a
            trailing path separator).
        output_dir: Folder that receives the .jpg files; created if missing.
        jpeg_quality: Value passed to cv2.imwrite as IMWRITE_JPEG_QUALITY.
            The default of 95 is OpenCV's own default, so output is unchanged
            unless a caller asks for stronger compression.
    """
    if output_dir:
        # cv2.imwrite does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

    for item in os.listdir(input_dir):
        source_path = os.path.join(input_dir, item)
        try:
            img = cv2.imread(source_path)
            if img is None:
                # cv2.imread returns None for anything it cannot decode.
                continue

            # splitext keeps dots inside the base name ("a.b.png" -> "a.b").
            stem = os.path.splitext(item)[0]
            target_file_name = os.path.join(output_dir, stem + ".jpg")

            output_image = img_pad(img)
            # The output is JPEG, so the JPEG quality flag is the one that
            # applies (a PNG compression flag is ignored for .jpg files).
            cv2.imwrite(target_file_name, output_image,
                        [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality])
        except Exception as exc:
            # Stay best-effort, but report the failure instead of hiding it.
            print('skip', source_path, exc)

图片压缩

def img_pad(pil_file, fixed_size=1600):
    """Resize an image so that its longer side equals fixed_size.

    The aspect ratio is kept and the shorter side is rounded down to an even
    number of pixels. Despite the function name, no padding is added: the
    returned image is generally not square.

    Args:
        pil_file: Image array whose shape starts with (height, width), e.g.
            the result of cv2.imread.
        fixed_size: Length in pixels of the longer side of the output.

    Returns:
        The resized image array produced by cv2.resize.
    """
    # Height comes first in the shape; mixing the two up distorts the image.
    h, w = pil_file.shape[:2]

    factor = max(h, w) / float(fixed_size)
    short_side = int(min(h, w) / factor)
    short_side -= short_side % 2  # keep the shorter side even
    # Extremely elongated images would otherwise round down to 0, which
    # cv2.resize rejects.
    short_side = max(short_side, 1)

    # cv2.resize takes the target size as (width, height).
    if h >= w:
        target_size = (short_side, fixed_size)
    else:
        target_size = (fixed_size, short_side)
    return cv2.resize(pil_file, target_size)

3. 保存图片至excel

读取test_data.csv中的数据，根据各地址字段找到对应的图片（已下载至本地mydir文件夹），并将图片插入到该行末尾的单元格中。

输入：test_data.csv

输出：res.xlsx

3.1 导包

# -*- coding: utf-8 -*-
import os
import pandas as pd
import cv2
import xlsxwriter
import tqdm

# Create the output workbook and add one worksheet to it.
BOOK = xlsxwriter.Workbook('res.xlsx')
SHEET = BOOK.add_worksheet('sheet1')
# Default row height for the sheet; inset_a_img also uses it to scale images.
CEIL_HEIGHT = 256
SHEET.set_default_row(CEIL_HEIGHT)
# Columns 0 through 18 get a width of CEIL_HEIGHT / 18.
SHEET.set_column(0, 18, CEIL_HEIGHT / 18)

3.2 插入图片内容

1. 在一个单元格插入一张图片

def inset_a_img(img_name, target_col, row=None):  # target_col: column to insert into
    """Insert one image from ./mydir into SHEET.

    The image is scaled uniformly by ``CEIL_HEIGHT * 1.3 / image_height``.
    When the file is missing or unreadable, a message is printed and nothing
    is inserted.

    Args:
        img_name: File name of the image inside ./mydir.
        target_col: Zero-based worksheet column that receives the image.
        row: Zero-based worksheet row that receives the image. When omitted,
            the row is taken from the module-level loop variable ``line``
            (``line.Index + 1``), which is how this function located the row
            before the parameter existed.
    """
    if row is None:
        # Legacy behaviour: depend on the global `line` set by the driver loop.
        row = line.Index + 1

    # Read the image from the local folder.
    image_path = os.path.join("./mydir/", img_name)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when it cannot load the
        # file; skip rather than crash on `.shape`.
        print('skip unreadable image: ', image_path)
        return

    h = image.shape[0]
    scale = CEIL_HEIGHT * 1.3 / h
    SHEET.insert_image(row, target_col, image_path,  # x_offset shifts the image along the x axis
                       {'x_offset': 100, 'y_offset': 2, 'x_scale': scale, 'y_scale': scale, 'positioning': 1})

2. 处理一行数据

def insert_image(line):
    """Write one ``DataFrame.itertuples()`` row into SHEET and add its images.

    Every field after the index is written as text into worksheet columns
    0, 1, 2, ... of row ``line.Index + 1``; missing values are written as
    empty strings. The image named by ``s3_path_1`` goes to column 3 and the
    one named by ``s3_path_2`` to column 5, each only when its value is not
    missing.

    Args:
        line: Named tuple for one row; must expose ``Index``, ``s3_path_1``
            and ``s3_path_2``.
    """
    sheet_row = line.Index + 1  # row 0 holds the header, so data starts at row 1
    print('正在操作第几行: ', line.Index)
    print("该行有多少列: ", len(line))

    # Position 0 of the tuple is the index, so the data fields start at 1.
    for pos, cell in enumerate(line[1:], start=1):
        SHEET.write(sheet_row, pos - 1, '' if pd.isna(cell) else cell)

        # The image inserts are tied to positions 1 and 2 only so that each
        # runs once per row; position 1 is the first data field, not the
        # s3_path_1 field itself.
        if pos == 1:
            if not pd.isna(line.s3_path_1):
                first_name = line.s3_path_1.split('/')[-1]
                print(first_name)
                inset_a_img(first_name, 3)
        elif pos == 2:
            # A missing s3_path_2 leaves the image cell empty.
            if not pd.isna(line.s3_path_2):
                inset_a_img(line.s3_path_2.split('/')[-1], 5)

3.3 测试

# Load every column as str and force the three expected column names.
df = pd.read_csv('test_data.csv', dtype=str)
col_list = ['user_account_id', 's3_path_1', 's3_path_2']
df.columns = col_list
# Write the header row of the sheet.
for i in range(len(col_list)):
    SHEET.write(0, i, col_list[i])  # row 0, column i receives the field name

# NOTE: inset_a_img reads the module-level name `line`, so this loop variable
# must keep its name.
for line in tqdm.tqdm(df.itertuples()):  # tqdm: shows a progress bar
    # print(line)
    # Counting the index field, each row has len(col_list) + 1 fields, e.g.
    # Pandas(Index=0, user_account_id='21...346', s3_path_1='e4.jpg',
    #        s3_path_2='fc.jpg')
    insert_image(line)

BOOK.close()