问题及需求
下载nuscenes完整数据集,并检查下载和解压过程中是否出现问题,简单验证数据集的完整性
当前环境
-
Ubuntu 20.04.6 LTS
-
carla0.9.14
-
python 3.8.19
问题解决
问题解决1--下载并解压完整nuscenes数据集
批量下载命令 download.sh
其中的下载链接可以在 nuScenes 官网找到;建议运行前先把链接复制到浏览器中打开,确认链接仍然有效
请在你希望存放数据集的目录下运行下载命令
完整数据集体积很大,下载耗时很久;建议批量下载,在每一条 wget 指令末尾加上 “&”,让各个下载任务在后台并行执行,省心方便
批量下载命令 download.sh
# Download every nuScenes v1.0 archive: trainval/test metadata plus all blob parts.
# -c resumes a partially downloaded file; -O sets the local output file name.
# A trailing "&" sends that wget to the background, so the downloads run in parallel.
wget -c -O v1.0-trainval_meta.tgz "https://d36yt3mvayqw5m.cloudfront.net/public/v1.0/v1.0-trainval_meta.tgz"&
wget -c -O v1.0-trainval01_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval01_blobs.tgz"&
wget -c -O v1.0-trainval02_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval02_blobs.tgz"&
wget -c -O v1.0-trainval03_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval03_blobs.tgz"&
wget -c -O v1.0-trainval04_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval04_blobs.tgz"&
wget -c -O v1.0-trainval05_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval05_blobs.tgz"&
wget -c -O v1.0-trainval06_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval06_blobs.tgz"&
wget -c -O v1.0-trainval07_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval07_blobs.tgz"&
wget -c -O v1.0-trainval08_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval08_blobs.tgz"&
wget -c -O v1.0-trainval09_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval09_blobs.tgz"&
wget -c -O v1.0-trainval10_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-trainval10_blobs.tgz"&
wget -c -O v1.0-test_meta.tgz "https://d36yt3mvayqw5m.cloudfront.net/public/v1.0/v1.0-test_meta.tgz"&
# The last command has no "&", so it runs in the foreground.
# NOTE(review): background downloads may still be running when this one returns;
# confirm every wget has finished before extracting the archives.
wget -c -O v1.0-test_blobs.tgz "https://motional-nuscenes.s3.amazonaws.com/public/v1.0/v1.0-test_blobs.tgz"
解压命令
tar -xvf 加文件名,如下所示
tar -xvf v1.0-trainval10_blobs.tgz
问题解决2--验证nuscenes数据集完整性
运行下面的代码
思路为比较数据集配置文件(元数据)中各场景 nbr_samples 之和,与 samples/CAM_FRONT 文件夹下的文件总数是否相等:若相等,可初步说明当前数据集完整(注意该方法只核对了 CAM_FRONT 这一个传感器目录);若不相等,说明下载或解压过程中出现了丢失情况
同时本代码还统计了训练集train、 验证集val、 测试集test 中的标记数据总量
代码统计得:
训练集train标记总量:nbr_train_scene: 28130
验证集val标记总量:nbr_val_scene: 6019
测试集test标记总量:nbr_test_scene: 6008
训练集train和验证集val标记总量:nbr_trainval: 34149
nuscenes数据集标记总量:num_sample: 40157
程序代码
from nuscenes.utils import splits
from nuscenes.nuscenes import NuScenes
import os
import mmcv
# Alternative loader for the small "mini" split, kept for reference:
# nusc = NuScenes(version='v1.0-mini', dataroot="/data/sunbs/StreamPETR/data/nuscenes", verbose=True)
# Both full splits are read from the same extracted dataset directory.
nuscenes_root = "/data/sunbs/nuscenes"
nusc_trainval = NuScenes(version='v1.0-trainval', dataroot=nuscenes_root, verbose=True)
nusc_test = NuScenes(version='v1.0-test', dataroot=nuscenes_root, verbose=True)
def get_available_scenes(nusc):
    """Return the scenes whose first LIDAR_TOP file exists on disk.

    For every scene in ``nusc.scene`` the first sample is looked up, the path
    of its ``LIDAR_TOP`` sample data is resolved through
    ``nusc.get_sample_data`` and the scene is kept only when that path is an
    existing file. Only this one file per scene is probed, so the result is a
    quick availability check, not a full per-file verification.

    Args:
        nusc: Loaded nuScenes dataset object. It must provide the ``scene``
            list plus the ``get`` and ``get_sample_data`` methods.

    Returns:
        list[dict]: The scene records (same dict objects as in
        ``nusc.scene``) whose first lidar file was found.
    """
    available_scenes = []
    print('total scene num: {}'.format(len(nusc.scene)))
    for scene in nusc.scene:
        scene_rec = nusc.get('scene', scene['token'])
        sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
        sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
        lidar_path, _, _ = nusc.get_sample_data(sd_rec['token'])
        # The earlier check used mmcv.is_filepath, which only tests whether
        # the value is a str/Path and therefore never detected a missing
        # file. os.path.isfile really touches the file system.
        if not os.path.isfile(str(lidar_path)):
            continue
        available_scenes.append(scene)
    print('exist scene num: {}'.format(len(available_scenes)))
    return available_scenes
def get_nbr_scenes_trainval(nusc):
    """Sum the ``nbr_samples`` of the available train and val scenes.

    Scene names listed in ``splits.train`` and ``splits.val`` are matched
    against the scenes reported by ``get_available_scenes``; names that are
    not available are ignored.

    Args:
        nusc: Loaded nuScenes dataset object for the trainval version.

    Returns:
        tuple: ``(train_total, val_total, train_total + val_total)``.
    """
    # Index the available scene records by name. setdefault keeps the first
    # record seen for a name, matching what list.index would have selected.
    record_by_name = {}
    for record in get_available_scenes(nusc):
        record_by_name.setdefault(record['name'], record)

    def count_samples(split_names):
        # Add up nbr_samples over the split's scenes that are available.
        total = 0
        for scene_name in split_names:
            if scene_name not in record_by_name:
                continue
            scene_token = record_by_name[scene_name]['token']
            total += nusc.get("scene", scene_token)['nbr_samples']
        return total

    train_total = count_samples(splits.train)
    val_total = count_samples(splits.val)
    return train_total, val_total, train_total + val_total
def get_nbr_scenes_test(nusc):
    """Sum the ``nbr_samples`` of the available test scenes.

    Scene names listed in ``splits.test`` are matched against the scenes
    reported by ``get_available_scenes``; names that are not available are
    ignored.

    Args:
        nusc: Loaded nuScenes dataset object for the test version.

    Returns:
        int: Total ``nbr_samples`` over the matched test scenes.
    """
    # Map each available scene name to its token. setdefault keeps the first
    # token seen for a name, matching what list.index would have selected.
    token_by_name = {}
    for record in get_available_scenes(nusc):
        token_by_name.setdefault(record['name'], record['token'])

    # Tokens of the test-split scenes that are actually available.
    test_tokens = [
        token_by_name[scene_name]
        for scene_name in splits.test
        if scene_name in token_by_name
    ]
    return sum(nusc.get("scene", token)['nbr_samples'] for token in test_tokens)
# Per-split sample totals taken from the dataset metadata.
nbr_train_scene, nbr_val_scene, nbr_trainval = get_nbr_scenes_trainval(nusc_trainval)
nbr_test_scene = get_nbr_scenes_test(nusc_test)
nbr_all = nbr_trainval + nbr_test_scene

divider = "----------------------------------------"
print(
    f"nbr_train_scene: {nbr_train_scene}",
    f"nbr_val_scene: {nbr_val_scene}",
    f"nbr_test_scene: {nbr_test_scene}",
    divider,
    f"nbr_trainval: {nbr_trainval}",
    f"nbr_all: {nbr_all}",
    divider,
    sep="\n",
)

# Number of files actually present in the front-camera sample directory.
sample_path = "/data/sunbs/nuscenes/samples/CAM_FRONT/"
num_sample = len(os.listdir(sample_path))
print(f"num_sample: {num_sample}", divider, sep="\n")

# The metadata total and the on-disk file count must agree.
verdict = (
    "nbr_all == num_sample, 当前数据集数量没有问题"
    if nbr_all == num_sample
    else "nbr_all != num_sample, 当前数据集数量存在问题,请重新检查"
)
print(verdict)
程序结果
Loading NuScenes tables for version v1.0-trainval...
23 category,
8 attribute,
4 visibility,
64386 instance,
12 sensor,
10200 calibrated_sensor,
2631083 ego_pose,
68 log,
850 scene,
34149 sample,
2631083 sample_data,
1166187 sample_annotation,
4 map,
Done loading in 45.248 seconds.
======
Reverse indexing ...
Done reverse indexing in 12.2 seconds.
======
Loading NuScenes tables for version v1.0-test...
23 category,
8 attribute,
4 visibility,
0 instance,
12 sensor,
1800 calibrated_sensor,
462901 ego_pose,
15 log,
150 scene,
6008 sample,
462901 sample_data,
0 sample_annotation,
4 map,
Done loading in 6.996 seconds.
======
Reverse indexing ...
Done reverse indexing in 1.3 seconds.
======
total scene num: 850
exist scene num: 850
total scene num: 150
exist scene num: 150
nbr_train_scene: 28130
nbr_val_scene: 6019
nbr_test_scene: 6008
----------------------------------------
nbr_trainval: 34149
nbr_all: 40157
----------------------------------------
num_sample: 40157
----------------------------------------
nbr_all == num_sample, 当前数据集数量没有问题