Hugging Face中,许多带图片的数据集,保存的格式为parquet。
以coco2014为例:
因此,接下来记录从 Hugging Face 上下载 .parquet 文件并转换为 jsonl 的两种方式。
方式一:.parquet
直接网页下载到本地:
-
点击上图中的下载按钮,逐个下载每一个.parquet文件
-
下载图片,转换jsonl格式文件
具体步骤:
1. 加载数据集,查看内容格式
import pandas as pd

# Read a single Parquet file
df = pd.read_parquet('validation-00000-of-00014.parquet')
# If several files live in one folder:
# df = pd.read_parquet('dataset_directory/')
print(df.head())

# Bind one row explicitly so that `row` is defined, then check how the
# caption is stored: it prints as numpy.ndarray (an array of strings).
row = df.iloc[0]
print(type(row['caption']), row['caption'])
确认内容:
显示前5条:
image license \
0 {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x... 2
1 {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x... 4
2 {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x... 2
3 {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x... 1
4 {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x... 1
coco_url height width \
0 http://images.cocodataset.org/val2014/COCO_val... 478 640
1 http://images.cocodataset.org/val2014/COCO_val... 640 565
2 http://images.cocodataset.org/val2014/COCO_val... 426 640
3 http://images.cocodataset.org/val2014/COCO_val... 480 640
4 http://images.cocodataset.org/val2014/COCO_val... 374 500
date_captured flickr_url \
0 2013-11-18 09:22:23 http://farm7.staticflickr.com/6024/6016274664_...
1 2013-11-15 12:34:05 http://farm6.staticflickr.com/5023/5881310882_...
2 2013-11-15 03:08:44 http://farm5.staticflickr.com/4087/5078192399_...
3 2013-11-20 21:35:54 http://farm5.staticflickr.com/4096/4785440231_...
4 2013-11-17 01:42:35 http://farm1.staticflickr.com/90/206826531_339...
id caption
0 42 [This wire metal rack holds several pairs of s...
1 73 [A motorcycle parked in a parking space next t...
2 74 [A picture of a dog laying on the ground., Dog...
3 133 [A loft bed with a dresser underneath it., A b...
4 136 [Two giraffes in a room with people looking at...
<class 'numpy.ndarray'> ['A kitchen filled with furniture and a stove top oven.'
'Two people working in a kitchen with various cooking devices.'
'a kitchen with a table with lots of pots and pans'
'Two people that are working on something in a room.'
'Two people at a kitchen counter near a wood burning stove.']
其中:
字段名 | 含义 | 备注 |
image | 图片数据(字节)或元数据字典 | 是一个字典,里面通常包含 'bytes' |
license | 图片版权许可证 ID | 整数 |
coco_url | 图片在 COCO 官网的 URL | 可选 |
height | 图片高度 | 整数 |
width | 图片宽度 | 整数 |
date_captured | 图片拍摄时间 | 可选 |
flickr_url | 图片来源 URL | 可选 |
id | 图片 ID | 整数 |
caption | 图片描述 | 每条是字符串数组(numpy.ndarray),包含多条描述,如:["A dog...", "Dog ..."] |
2. 转换脚本
####### Parquet format
# Convert one Hugging Face .parquet shard into a folder of JPEG files plus a
# JSONL index (one record per image, keeping only the first caption).
import json
import os
from io import BytesIO

import numpy as np  # needed for the np.ndarray check on the caption column
import pandas as pd
from PIL import Image

# Path configuration
parquet_path = 'validation-00000-of-00014.parquet'  # input parquet file
output_image_dir = 'val2014'                        # directory the images are saved to
output_jsonl_path = 'val2014.jsonl'                 # path of the generated JSONL

# Create the image output directory
os.makedirs(output_image_dir, exist_ok=True)

# Read the parquet file
df = pd.read_parquet(parquet_path)

# Number of rows converted successfully
success_count = 0

# Write the JSONL file
with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
    for idx, row in df.iterrows():
        try:
            # Extract the image bytes and the metadata columns
            image_data = row['image']['bytes']
            image_id = row['id']
            height = row['height']
            width = row['width']
            license_id = row['license']

            # The caption column holds an array of strings; keep the first
            # one, or fall back to an empty string when there is none.
            caption_raw = row['caption']
            if isinstance(caption_raw, (list, np.ndarray)) and len(caption_raw) > 0:
                caption = caption_raw[0]
            else:
                caption = ""

            # Build the image path; image_key always uses "/" so the JSONL
            # content does not depend on the OS path separator.
            image_filename = f"image_{image_id}.jpg"
            image_path = os.path.join(output_image_dir, image_filename)
            image_key = f"{output_image_dir}/{image_filename}"

            # Decode the bytes and save the image
            img = Image.open(BytesIO(image_data))
            img.save(image_path)

            # Build the JSONL record
            record = {
                "image": image_key,
                "height": height,
                "width": width,
                "caption": caption,
                "id": image_id,
                "license": license_id
            }

            # Append the record to the JSONL file
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')
            success_count += 1
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest
            print(f"❌ Error processing row {idx} (id={row.get('id')}): {e}")
            continue

print(f"\n✅ 完成!共成功保存 {success_count} 张图像并生成 JSONL")
print(f"🖼 图片保存在: {output_image_dir}")
print(f"📄 JSONL 保存为: {output_jsonl_path}")
经转换,图片将保存在val2014文件夹下,生成的jsonl内容示例如下:
{"image": "val2014/image_42.jpg", "height": 478, "width": 640, "caption": "This wire metal rack holds several pairs of shoes and sandals", "id": 42, "license": 2}
{"image": "val2014/image_73.jpg", "height": 640, "width": 565, "caption": "A motorcycle parked in a parking space next to another motorcycle.", "id": 73, "license": 4}
{"image": "val2014/image_74.jpg", "height": 426, "width": 640, "caption": "A picture of a dog laying on the ground.", "id": 74, "license": 2}
{"image": "val2014/image_133.jpg", "height": 480, "width": 640, "caption": "A loft bed with a dresser underneath it.", "id": 133, "license": 1}
方式二:.arrow
1. 下载数据集
直接通过load_dataset下载
from datasets import load_dataset

# Request a single split so that `ds` is a Dataset rather than a DatasetDict
# (a dict of splits); cache_dir="..." can be passed here as well.
ds = load_dataset("AbdoTW/COCO_2014", split="validation")

# Save in the HF dataset format so it can be reloaded with load_from_disk.
# load_from_disk only works on a directory written by save_to_disk;
# otherwise keep using `ds` directly.
ds.save_to_disk("COCO/val")
查看前5行
# Convert to a pandas DataFrame.
# When load_dataset was called without split=..., `ds` is a DatasetDict
# (a dict of splits) and has no to_pandas(); pick the split first.
split_ds = ds["validation"] if isinstance(ds, dict) else ds
df = split_ds.to_pandas()
# Show the first few rows
print(df.head())
2. 转换成jsonl格式
# Convert a dataset written by save_to_disk into a folder of JPEG files plus a
# JSONL index (one record per image, keeping only the first caption).
import json
import os
from io import BytesIO

from datasets import load_from_disk
from PIL import Image

# Configuration
dataset_path = "COCO/val"  # directory written by save_to_disk
#### Note: this is the folder that CONTAINS the .arrow files, not an .arrow
#### file itself, i.e. not:
# dataset_path = "COCO/val/data-00000-of-00014.arrow"
split_name = "validation"            # split used when the directory holds several splits
output_image_dir = 'val2014'         # image output directory
output_jsonl_path = 'val2014.jsonl'  # JSONL output path

# Create the output directory
os.makedirs(output_image_dir, exist_ok=True)

# Load the Hugging Face dataset
dataset = load_from_disk(dataset_path)
# If the directory was saved from a DatasetDict, iterating it would yield
# split names instead of rows, so select the split explicitly.
if isinstance(dataset, dict):
    dataset = dataset[split_name]

# Number of rows converted successfully
success_count = 0

# Write the JSONL file
with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
    for idx, row in enumerate(dataset):
        try:
            img = row['image']  # already decoded to a PIL.Image.Image
            image_id = row['id']
            height = row.get('height', None)
            width = row.get('width', None)
            license_id = row.get('license', None)
            caption_raw = row.get('caption', "")

            # Extract the caption (for a list, keep only the first entry)
            if isinstance(caption_raw, (list, tuple)) and len(caption_raw) > 0:
                caption = caption_raw[0]
            else:
                caption = caption_raw

            # Build the file name and paths; image_key always uses "/" so the
            # JSONL content does not depend on the OS path separator.
            image_filename = f"image_{image_id}.jpg"
            image_path = os.path.join(output_image_dir, image_filename)
            image_key = f"{output_image_dir}/{image_filename}"

            # Save the image
            img.save(image_path)

            # Write the JSONL record
            record = {
                "image": image_key,
                "height": height,
                "width": width,
                "caption": caption,
                "id": image_id,
                "license": license_id
            }
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')
            success_count += 1
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest
            print(f"❌ Error processing row {idx} (id={row.get('id')}): {e}")
            continue

print(f"\n✅ 完成!共成功保存 {success_count} 张图像并生成 JSONL")
print(f"🖼 图片保存在: {output_image_dir}")
print(f"📄 JSONL 保存为: {output_jsonl_path}")
经转换,图片将保存在val2014文件夹下,生成的jsonl内容示例如下:
{"image": "val2014/image_42.jpg", "height": 478, "width": 640, "caption": "This wire metal rack holds several pairs of shoes and sandals", "id": 42, "license": 2}
{"image": "val2014/image_73.jpg", "height": 640, "width": 565, "caption": "A motorcycle parked in a parking space next to another motorcycle.", "id": 73, "license": 4}
{"image": "val2014/image_74.jpg", "height": 426, "width": 640, "caption": "A picture of a dog laying on the ground.", "id": 74, "license": 2}
{"image": "val2014/image_133.jpg", "height": 480, "width": 640, "caption": "A loft bed with a dresser underneath it.", "id": 133, "license": 1}
数据集转换完成啦~