【HuggingFace 如何上传数据集 (3) 】Dataset-稳定上传与下载大量(上万)图片等格式的数据

系列文章目录


前言


一、上传

from datasets import Dataset, Features, Array3D, Value
import os
import numpy as np
from PIL import Image

# Directory containing the local PNG images to upload
image_dir = "/path/to/local_img_dir"


def load_image(image_path):
    """Load an image file as a 512x512 RGB uint8 array.

    The image is converted to RGB before resizing so that grayscale,
    palette and RGBA PNGs all yield three channels, matching the
    ``Array3D(dtype="uint8", shape=(512, 512, 3))`` feature the dataset
    is declared with.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A numpy array of shape (512, 512, 3) and dtype uint8.
    """
    # The context manager closes the file handle as soon as the pixels are
    # read, which matters when iterating over tens of thousands of files.
    with Image.open(image_path) as image:
        resized = image.convert("RGB").resize((512, 512))
    return np.array(resized)

# Collect the PNG files in a stable order: os.listdir() returns entries in
# arbitrary order, so sorting keeps the dataset reproducible across runs.
# The suffix check is case-insensitive so "*.PNG" files are not skipped.
image_paths = sorted(
    os.path.join(image_dir, name)
    for name in os.listdir(image_dir)
    if name.lower().endswith(".png")
)


def generate_dataset():
    """Yield one example per image listed in ``image_paths``.

    Yields:
        A dict with two keys: ``"image"``, the array returned by
        ``load_image``, and ``"filename"``, the file's base name
        (extension included).
    """
    for path in image_paths:
        yield {"image": load_image(path), "filename": os.path.basename(path)}

# Build the dataset from the generator.
# writer_batch_size must be given here (it is forwarded to the dataset
# builder) to limit how many rows are buffered per written batch; passing it
# to with_format() only hands it to the formatter and does not affect writing.
dataset = Dataset.from_generator(
    generate_dataset,
    features=Features({
        "image": Array3D(dtype="uint8", shape=(512, 512, 3)),
        "filename": Value("string"),
    }),
    writer_batch_size=50,  # Adjust as necessary to keep each batch small
)

# Return rows as NumPy arrays when the dataset is indexed locally
dataset = dataset.with_format("numpy")

# Push dataset to Hugging Face Hub
dataset.push_to_hub("your/data_repo")

  1. 注意每个 batch 不能超过 2GB,所以可能需要调节下 writer_batch_size
  2. `Array3D(dtype="uint8", shape=(512, 512, 3))` 中,shape 只有第一个维度可以为 None(动态轴),第二个及之后的维度不能为 None。

二、下载

  1. 保证服务器能够 curl 到 huggingface 是最关键的。
  2. 通过 `ping google.com` 或 `curl https://huggingface.co` 来测下网络有没有问题。
from datasets import load_dataset
from PIL import Image
import numpy as np
import os

# A token is only needed to authenticate against a private repository.
# Read it from the HF_TOKEN environment variable instead of hard-coding the
# secret in the script; when the variable is unset, None is passed and no
# explicit token is supplied.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Load the dataset from Hugging Face Hub
dataset = load_dataset("your/data_repo", token=HF_TOKEN)

# Directory where the images will be saved
save_dir = "/local/dir"

# Ensure the directory exists
os.makedirs(save_dir, exist_ok=True)

def convert_to_image(image_array):
    """Build a PIL image from array-like pixel data.

    Args:
        image_array: Pixel values (array or nested sequence) to convert.

    Returns:
        The ``PIL.Image.Image`` created from the data cast to uint8.
    """
    pixels = np.uint8(image_array)  # cast first: the data is fed to Pillow as uint8
    return Image.fromarray(pixels)

# For a quick check, save only the first 10 images. Note that load_dataset
# still downloads all of the data (.parquet files) up front.
train_split = dataset['train']

# Clamp the preview size so a split with fewer than 10 rows does not make
# select() fail on out-of-range indices.
preview_count = min(10, len(train_split))

for example in train_split.select(range(preview_count)):
    image = convert_to_image(example['image'])  # stored pixel data -> PIL image

    # The stored filename is a base name that already carries its extension
    # (e.g. "cat.png"); strip it so the output is not saved as "cat.png.png".
    # basename() keeps a crafted name from escaping save_dir.
    stem, _ = os.path.splitext(os.path.basename(example['filename']))

    # Construct the full file path and save the image as a PNG
    image_path = os.path.join(save_dir, f"{stem}.png")
    image.save(image_path)

    print(f"Saved {stem}.png to {save_dir}")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值