Official examples: https://github.com/huggingface/lerobot/blob/main/examples/1_load_lerobot_dataset.py
https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/LeRobot_compatible_data_schema.md
After collecting data with the SO100 robot arm, you get a dataset in the following layout:
.
├─meta
│  ├─episodes.jsonl
│  ├─modality.json   # -> GR00T LeRobot specific
│  ├─info.json
│  └─tasks.jsonl
├─videos
│  └─chunk-000
│     └─observation.images.ego_view   # -> ego_view is the name of the custom camera
│        ├─episode_000000.mp4
│        └─episode_000001.mp4
└─data
   └─chunk-000
      ├─episode_000000.parquet
      └─episode_000001.parquet
The parquet files contain these columns:
Column names: ['action', 'observation.state', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index']
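Before reaching for any library, note that the meta files are plain JSON / JSON Lines, so the dataset schema can be inspected with the standard library alone. A minimal sketch, assuming the usual LeRobot v2 meta layout (the exact keys inside info.json may differ across LeRobot versions, hence the .get() calls):

import json
from pathlib import Path

root = Path("/home/zengxy/.cache/huggingface/lerobot/zeng/pen3")

# info.json is a single JSON object describing the whole dataset
info = json.loads((root / "meta/info.json").read_text())
print(info.get("fps"), info.get("total_episodes"), info.get("total_frames"))

# episodes.jsonl / tasks.jsonl are JSON Lines: one object per line
with open(root / "meta/episodes.jsonl") as f:
    episodes = [json.loads(line) for line in f]
print(episodes[0])  # e.g. episode index, task description, length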
1. Reading a local LeRobotDataset
Method 1: Read the overall dataset info with LeRobotDatasetMetadata
from pathlib import Path
from lerobot.common.datasets.lerobot_dataset import LeRobotDatasetMetadata
from datasets import load_dataset

dataset_root = Path("/home/zengxy/.cache/huggingface/lerobot/zeng/pen3")

# 1. Load the local metadata first
meta = LeRobotDatasetMetadata(
    repo_id="zeng/pen3",
    root=dataset_root,
    force_cache_sync=False,  # important: do NOT force a sync with the online dataset
)

# Inspect the metadata
print(meta)

# 2. Read a specific parquet file (the first episode as an example)
episode_index = 0
parquet_file = meta.get_data_file_path(episode_index)
print("Parquet file path:", parquet_file)

# Load the parquet file with the datasets library
hf_dataset = load_dataset("parquet", data_files=str(dataset_root / parquet_file), split="train")
# hf_dataset.set_format("torch")  # optional: return columns as torch tensors

# Inspect the columns
print("Column names:", hf_dataset.column_names)
# Print one sample (all values at a single timestep)
print(hf_dataset[0])
# Iterate over every frame
for sample in hf_dataset:
    print(sample)
Output of print(meta):
LeRobotDatasetMetadata({
Repository ID: 'zeng/pen3',
Total episodes: '10',
Total frames: '2778',
Features: '['action', 'observation.state', 'observation.images.laptop', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index']',
})
Example output for a single timestep (the torch tensors indicate set_format("torch") was enabled for this run):
Parquet file path: data/chunk-000/episode_000000.parquet
Column names: ['action', 'observation.state', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index']
{
    'action': tensor([ 25.1367, 148.3594, 146.5137,  62.9297, -17.1387,  19.4232]),
    'observation.state': tensor([ 20.5664, 153.3691, 149.8535,  64.0723, -17.4902,  19.5410]),
    'timestamp': tensor(0.),
    'frame_index': tensor(0),
    'episode_index': tensor(0),
    'index': tensor(0),
    'task_index': tensor(0)
}
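Besides the metadata, the full LeRobotDataset object can be loaded from the same local root; it additionally decodes the camera frames for each timestep. A sketch following the official example linked at the top (attributes such as num_episodes / num_frames are assumptions based on the current lerobot API and may shift between versions):

from pathlib import Path
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset(
    repo_id="zeng/pen3",
    root=Path("/home/zengxy/.cache/huggingface/lerobot/zeng/pen3"),
)
print(dataset.num_episodes, dataset.num_frames)  # assumed properties; 10 and 2778 here

# One indexed item is a single timestep: the parquet columns plus decoded camera frames
frame = dataset[0]
print(frame["observation.images.laptop"].shape)  # image tensor for the 'laptop' camera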
Method 2: Read only the parquet file
from datasets import load_dataset
from pathlib import Path

dataset_root = Path("/home/zengxy/.cache/huggingface/lerobot/zeng/pen3")
parquet_file = dataset_root / "data/chunk-000/episode_000000.parquet"

dataset = load_dataset("parquet", data_files=str(parquet_file), split="train")

# Print all column names (keys)
print(dataset.column_names)
# Iterate over every frame
for sample in dataset:
    print(sample)
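For training you usually want stacked tensors rather than per-frame Python lists; this is what the commented-out set_format("torch") line in Method 1 enables. A short sketch continuing from the snippet above, using the standard datasets and torch APIs (the shape comment assumes the 6-dimensional SO100 action seen in the output earlier):

import torch

dataset = dataset.with_format("torch")  # columns now come back as torch tensors
print(dataset[0]["action"])             # single-frame action, shape (6,)

# Stack a whole column into one (num_frames, 6) tensor
actions = torch.stack([dataset[i]["action"] for i in range(len(dataset))])
print(actions.shape)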
Method 3: Use the pyarrow library (good for large files, high performance)
pip install pyarrow
import pyarrow.parquet as pq
from pathlib import Path

parquet_file = Path("/home/zengxy/.cache/huggingface/lerobot/zeng/pen3/data/chunk-000/episode_000000.parquet")
table = pq.read_table(parquet_file)

# Convert to a list of dicts and print
data = table.to_pylist()
for row in data:
    print(row)
print(data[0].keys())
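Two pyarrow features back up the "good for large files" claim: column pruning at read time, and streaming record batches instead of materializing the whole table in memory. A sketch continuing from the snippet above (batch_size is an arbitrary choice):

import pyarrow.parquet as pq

# Read only the columns you actually need
table = pq.read_table(parquet_file, columns=["action", "observation.state"])
print(table.num_rows, table.column_names)

# Stream the file in fixed-size batches; each batch is a pyarrow RecordBatch
pf = pq.ParquetFile(parquet_file)
for batch in pf.iter_batches(batch_size=512):
    print(batch.num_rows)  # process the batch here instead of printing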