Waymo Open Dataset（WOD）感知数据集数据格式介绍

Ly.Leo

已于 2023-10-04 20:52:35 修改

阅读量502

点赞数

文章标签：数据库数据结构数据集

于 2023-10-04 20:28:45 首次发布

[Perception Data set] WAYMO Opensource Dataset

엔티그 2021.4.4.18:12

本文档节选翻译自这里，原文为韩语，使用谷歌翻译译为中文，纯机翻没有经过人工审核，辩证阅读。以下内容是原作者对dataset.proto文件的解释。

数据格式

每个数据都包含在帧消息中。

由于 Frame（帧）内的各个消息之间也存在依赖关系，因此我们将从最基础的消息开始，一直讲解到 Frame。不过需要强调的是，这些代码仅用于讲解，其中出现的数值与字段的数据类型并不对应。

// One annotated object: its bounding box, motion metadata, object class,
// object ID, difficulty levels and the number of lidar points inside the box.
message Label {
  // Upright box, zero pitch and roll.
  message Box {
    // Box coordinates in vehicle frame.
    optional double center_x = 1;
    optional double center_y = 2;
    optional double center_z = 3;

    // Dimensions of the box. length: dim x. width: dim y. height: dim z.
    // Note: the field numbers do not follow declaration order here
    // (length = 5, width = 4). The numbers, not the order, identify the
    // fields on the wire.
    optional double length = 5;
    optional double width = 4;
    optional double height = 6;

    // The heading of the bounding box (in radians).  The heading is the angle
    // required to rotate +x to the surface normal of the box front face. It is
    // normalized to [-pi, pi).
    optional double heading = 7;

    // Kinds of box representation. No field of Box in this definition is
    // declared with this enum type.
    enum Type {
      TYPE_UNKNOWN = 0;
      // 7-DOF 3D (a.k.a upright 3D box).
      TYPE_3D = 1;
      // 5-DOF 2D. Mostly used for laser top down representation.
      TYPE_2D = 2;
      // Axis aligned 2D. Mostly used for image.
      TYPE_AA_2D = 3;
    }
  }

  // Bounding box of the labeled object.
  optional Box box = 1;

  // Motion metadata of the label: x/y components of speed and acceleration.
  // NOTE(review): units and reference frame are not stated here — confirm.
  message Metadata {
    optional double speed_x = 1;
    optional double speed_y = 2;
    optional double accel_x = 3;
    optional double accel_y = 4;
  }

  optional Metadata metadata = 2;

  // Object class of the label. TYPE_UNKNOWN (0) is the default value.
  enum Type {
    TYPE_UNKNOWN = 0;
    TYPE_VEHICLE = 1;
    TYPE_PEDESTRIAN = 2;
    TYPE_SIGN = 3;
    TYPE_CYCLIST = 4;
  }

  // Resolves to Label.Type (the enum declared directly above), not to
  // Label.Box.Type.
  optional Type type = 3;
  // Object ID.
  optional string id = 4;

  // The difficulty level of this label. The higher the level, the harder it is.
  enum DifficultyLevel {
    UNKNOWN = 0;
    LEVEL_1 = 1;
    LEVEL_2 = 2;
  }

  // Difficulty level for detection problem.
  optional DifficultyLevel detection_difficulty_level = 5;
  // Difficulty level for tracking problem.
  optional DifficultyLevel tracking_difficulty_level = 6;

  // The total number of lidar points in this box.
  optional int32 num_lidar_points_in_box = 7;

}

标签是一种表示带注释数据的消息类型。这包括框的位置、框的类型和对象的类型。还有激光雷达点数、识别难度等。

// Shape (list of dimension sizes) used by the `shape` field of the
// MatrixFloat and MatrixInt32 messages.
message MatrixShape {
  // Dimensions for the Matrix messages defined below. Must not be empty.
  //
  // The order of entries in 'dims' matters, as it indicates the layout of the
  // values in the tensor in-memory representation.
  //
  // The first entry in 'dims' is the outermost dimension used to lay out the
  // values; the last entry is the innermost dimension.  This matches the
  // in-memory layout of row-major matrices.
  repeated int32 dims = 1;
}

MatrixShape 声明矩阵各个维度的大小。之所以用 repeated 字段来保存，是因为维度越多，需要记录的数值也就越多。

// Row-major matrix of float values.
// Requires: data.size() = product(shape.dims()).
message MatrixFloat {
  // Flattened matrix values in row-major order. [packed = true] requests the
  // packed encoding for this repeated scalar field.
  repeated float data = 1 [packed = true];
  // Size of each dimension of the matrix.
  optional MatrixShape shape = 2;
}

MatrixFloat 是按行主序（row-major）存储的浮点数矩阵：data 保存展平后的数值，shape 给出各个维度的大小。

// Row-major matrix of float values.
// Requires: data.size() = product(shape.dims()).
// NOTE(review): this block repeats the MatrixFloat definition shown just
// before it, token for token. A single .proto file cannot declare the same
// message name twice in one scope, so this looks like an accidental
// duplicate — confirm and drop one copy.
message MatrixFloat {
  // Flattened matrix values in row-major order (packed encoding).
  repeated float data = 1 [packed = true];
  // Size of each dimension of the matrix.
  optional MatrixShape shape = 2;
}

MatrixInt32 是按行主序（row-major）存储的 32 位整数矩阵：data 保存展平后的数值，shape 给出各个维度的大小。

// Row-major matrix of int32 values.
// Requires: data.size() = product(shape.dims()).
message MatrixInt32 {
  // Flattened matrix values in row-major order. [packed = true] requests the
  // packed encoding for this repeated scalar field.
  repeated int32 data = 1 [packed = true];
  // Size of each dimension of the matrix.
  optional MatrixShape shape = 2;
}

CameraName 是指安装在车辆上的摄像头。

// Wrapper message whose only content is the camera-name enum. It has no
// fields of its own; other messages refer to the enum as CameraName.Name.
message CameraName {
  // Identifies a camera. UNKNOWN (0) is the default value.
  enum Name {
    UNKNOWN = 0;
    FRONT = 1;
    FRONT_LEFT = 2;
    FRONT_RIGHT = 3;
    SIDE_LEFT = 4;
    SIDE_RIGHT = 5;
  }
}

LaserName 是指安装在车辆上的激光雷达。

// 'Laser' is used interchangeably with 'Lidar' in this file.
//
// Wrapper message whose only content is the lidar-name enum. It has no
// fields of its own; other messages refer to the enum as LaserName.Name.
message LaserName {
  // Identifies a lidar. UNKNOWN (0) is the default value.
  enum Name {
    UNKNOWN = 0;
    TOP = 1;
    FRONT = 2;
    SIDE_LEFT = 3;
    SIDE_RIGHT = 4;
    REAR = 5;
  }
}

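Transform 是一个变换矩阵，用于将 3D 点从一个坐标系变换到另一个坐标系。（本文没有收录 Transform 的代码，但后面的 CameraCalibration、LaserCalibration、CameraImage 和 Frame 都会用到它。在 Waymo 官方的 dataset.proto 中，它的定义如下，请以官方文件为准。）

// 4x4 row major transform matrix that transforms 3d points from one frame to
// another.
message Transform {
  repeated double transform = 1;
}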

// Linear and angular velocity, one component per axis.
// Note: the linear components are declared as float while the angular
// components are declared as double.
message Velocity {
  // Velocity in m/s.
  optional float v_x = 1;
  optional float v_y = 2;
  optional float v_z = 3;

  // Angular velocity in rad/s.
  optional double w_x = 4;
  optional double w_y = 5;
  optional double w_z = 6;
}

速度是指物体的速度。

// Calibration data for one camera: intrinsics, extrinsics, image size and
// the rolling-shutter read-out direction.
message CameraCalibration {
  // Which camera this calibration belongs to.
  optional CameraName.Name name = 1;
  // 1d Array of [f_u, f_v, c_u, c_v, k{1, 2}, p{1, 2}, k{3}].
  // Note that this intrinsic corresponds to the images after scaling.
  // Camera model: pinhole camera.
  // Lens distortion:
  //   Radial distortion coefficients: k1, k2, k3.
  //   Tangential distortion coefficients: p1, p2.
  // k_{1, 2, 3}, p_{1, 2} follows the same definition as OpenCV.
  // https://en.wikipedia.org/wiki/Distortion_(optics)
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  repeated double intrinsic = 2;
  // Camera frame to vehicle frame.
  optional Transform extrinsic = 3;
  // Camera image size.
  // NOTE(review): presumably in pixels — the unit is not stated here.
  optional int32 width = 4;
  optional int32 height = 5;

  // Read-out direction of the camera shutter. UNKNOWN (0) is the default
  // value.
  // NOTE(review): GLOBAL_SHUTTER presumably marks a sensor without rolling
  // shutter — confirm.
  enum RollingShutterReadOutDirection {
    UNKNOWN = 0;
    TOP_TO_BOTTOM = 1;
    LEFT_TO_RIGHT = 2;
    BOTTOM_TO_TOP = 3;
    RIGHT_TO_LEFT = 4;
    GLOBAL_SHUTTER = 5;
  }
  optional RollingShutterReadOutDirection rolling_shutter_direction = 6;
}

CameraCalibration 在校准相机时显示相机的图像尺寸和内部设置。

// Calibration data for one lidar: beam inclinations and extrinsics.
message LaserCalibration {
  // Which lidar this calibration belongs to.
  optional LaserName.Name name = 1;
  // If non-empty, the beam pitch (in radians) is non-uniform. When constructing
  // a range image, this mapping is used to map from beam pitch to range image
  // row.  If this is empty, we assume a uniform distribution.
  repeated double beam_inclinations = 2;
  // beam_inclination_{min,max} (in radians) are used to determine the mapping.
  optional double beam_inclination_min = 3;
  optional double beam_inclination_max = 4;
  // Lidar frame to vehicle frame.
  optional Transform extrinsic = 5;

}

LaserCalibration 显示 LiDAR 校准值。

// Information about a frame sequence: its name, the camera and lidar
// calibrations, and statistics of the run segment.
message Context {
  // A unique name that identifies the frame sequence.
  optional string name = 1;
  // One calibration entry per camera / per lidar (each entry carries its own
  // `name`).
  repeated CameraCalibration camera_calibrations = 2;
  repeated LaserCalibration laser_calibrations = 3;
  // Some stats for the run segment used.
  message Stats {
    // Number of objects of one Label.Type.
    message ObjectCount {
      optional Label.Type type = 1;
      // The number of unique objects with the type in the segment.
      optional int32 count = 2;
    }
    // Object counts per type, kept separately for lidar and camera labels.
    // Note: camera_object_counts uses field number 5, out of declaration
    // order.
    repeated ObjectCount laser_object_counts = 1;
    repeated ObjectCount camera_object_counts = 5;
    // Day, Dawn/Dusk, or Night, determined from sun elevation.
    optional string time_of_day = 2;
    // Human readable location (e.g. CHD, SF) of the run segment.
    optional string location = 3;
    // Currently either Sunny or Rain.
    optional string weather = 4;
  }
  optional Stats stats = 4;
}

Context（上下文）表示帧序列的名称、相机和激光雷达的标定值，以及该驾驶片段的统计信息：各类标注对象的数量、时间段、地点和天气等。

// Range image data. Each payload below is a zlib-compressed, serialized
// MatrixFloat or MatrixInt32; the comment on each field gives the matrix
// shape and the meaning of its channels.
message RangeImage {
  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  // string val = ZlibDecompress(range_image_compressed);
  // MatrixFloat range_image;
  // range_image.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: range
  //   * channel 1: intensity
  //   * channel 2: elongation
  //   * channel 3: is in any no label zone.
  optional bytes range_image_compressed = 2;

  // Lidar point to camera image projections. A point can be projected to
  // multiple camera images. We pick the first two at the following order:
  // [FRONT, FRONT_LEFT, FRONT_RIGHT, SIDE_LEFT, SIDE_RIGHT].
  //
  // Zlib compressed [H, W, 6] serialized version of MatrixInt32.
  // To decompress:
  // string val = ZlibDecompress(camera_projection_compressed);
  // MatrixInt32 camera_projection;
  // camera_projection.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: CameraName.Name of 1st projection. Set to UNKNOWN if no
  //       projection.
  //   * channel 1: x (axis along image width)
  //   * channel 2: y (axis along image height)
  //   * channel 3: CameraName.Name of 2nd projection. Set to UNKNOWN if no
  //       projection.
  //   * channel 4: x (axis along image width)
  //   * channel 5: y (axis along image height)
  // Note: pixel 0 corresponds to the left edge of the first pixel in the image.
  optional bytes camera_projection_compressed = 3;

  // Zlib compressed [H, W, 6] serialized version of MatrixFloat.
  // To decompress:
  // string val = ZlibDecompress(range_image_pose_compressed);
  // MatrixFloat range_image_pose;
  // range_image_pose.ParseFromString(val);
  // Inner dimensions are [roll, pitch, yaw, x, y, z] represents a transform
  // from vehicle frame to global frame for every range image pixel.
  // This is ONLY populated for the first return. The second return is assumed
  // to have exactly the same range_image_pose_compressed.
  //
  // The roll, pitch and yaw are specified as 3-2-1 Euler angle rotations,
  // meaning that rotating from the navigation to vehicle frame consists of a
  // yaw, then pitch and finally roll rotation about the z, y and x axes
  // respectively. All rotations use the right hand rule and are positive
  // in the counter clockwise direction.
  optional bytes range_image_pose_compressed = 4;

  // Zlib compressed [H, W, 5] serialized version of MatrixFloat.
  // To decompress:
  // string val = ZlibDecompress(range_image_flow_compressed);
  // MatrixFloat range_image_flow;
  // range_image_flow.ParseFromString(val);
  // Inner dimensions are [vx, vy, vz, pointwise class].
  // NOTE(review): the header above says [H, W, 5], but only four inner
  // dimensions are listed here — confirm the actual channel count.
  //
  // If the point is not annotated with scene flow information, class is set
  // to -1. A point is not annotated if it is in a no-label zone or if its label
  // bounding box does not have a corresponding match in the previous frame,
  // making it infeasible to estimate the motion of the point.
  // Otherwise, (vx, vy, vz) are velocity along (x, y, z)-axis for this point
  // and class is set to one of the following values:
  //  -1: no-flow-label, the point has no flow information.
  //   0:  unlabeled or "background,", i.e., the point is not contained in a
  //       bounding box.
  //   1: vehicle, i.e., the point corresponds to a vehicle label box.
  //   2: pedestrian, i.e., the point corresponds to a pedestrian label box.
  //   3: sign, i.e., the point corresponds to a sign label box.
  //   4: cyclist, i.e., the point corresponds to a cyclist label box.
  optional bytes range_image_flow_compressed = 5;

  // Deprecated, do not use.
  // Field number 1 stays assigned to this deprecated field; the other fields
  // of this message use numbers 2 to 5.
  optional MatrixFloat range_image = 1 [deprecated = true];
}

RangeImage 表示激光雷达的距离图像（range image）及其附带的投影、位姿和场景流信息。各项数据按通道存放，读取对应的通道即可得到所需的数据。

// All timestamps in this proto are represented as seconds since Unix epoch.
//
// One camera image together with the vehicle (SDC) pose and velocity and the
// shutter timing of the exposure.
message CameraImage {
  // Which camera produced the image.
  optional CameraName.Name name = 1;
  // JPEG image.
  optional bytes image = 2;
  // SDC pose.
  optional Transform pose = 3;
  // SDC velocity at 'pose_timestamp' below. The velocity value is represented
  // at *global* frame.
  // With this velocity, the pose can be extrapolated.
  // r(t+dt) = r(t) + dr/dt * dt where dr/dt = v_{x,y,z}.
  // dR(t)/dt = W*R(t) where W = SkewSymmetric(w_{x,y,z})
  // This differential equation solves to: R(t) = exp(Wt)*R(0) if W is constant.
  // When dt is small: R(t+dt) = (I+W*dt)R(t)
  // r(t) = (x(t), y(t), z(t)) is vehicle location at t in the global frame.
  // R(t) = Rotation Matrix (3x3) from the body frame to the global frame at t.
  // SkewSymmetric(x,y,z) is defined as the cross-product matrix in the
  // following:
  // https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
  optional Velocity velocity = 4;
  // Timestamp of the `pose` above.
  optional double pose_timestamp = 5;

  // Rolling shutter params.
  // The following explanation assumes left->right rolling shutter.
  //
  // Rolling shutter cameras expose and read the image column by column, offset
  // by the read out time for each column. The desired timestamp for each column
  // is the middle of the exposure of that column as outlined below for an image
  // with 3 columns:
  // ------time------>
  // |---- exposure col 1----| read |
  // -------|---- exposure col 2----| read |
  // --------------|---- exposure col 3----| read |
  // ^trigger time                                ^readout end time
  //             ^time for row 1 (= middle of exposure of row 1)
  //                    ^time image center (= middle of exposure of middle row)
  // NOTE(review): the two marker lines above say "row" although the text
  // describes a column-by-column read-out — presumably "column" is meant;
  // confirm.
  // Shutter duration in seconds. Exposure time per column.
  optional double shutter = 6;
  // Time when the sensor was triggered and when last readout finished.
  // The difference between trigger time and readout done time includes
  // the exposure time and the actual sensor readout time.
  optional double camera_trigger_time = 7;
  optional double camera_readout_done_time = 8;
}

CameraImage表示照片拍摄的时间和环境、拍摄时的相机操作值等。

// The camera labels associated with a given camera image. This message
// indicates the ground truth information for the camera image
// recorded by the given camera. If there are no labeled objects in the image,
// then the labels field is empty.
message CameraLabels {
  // The camera whose image these labels belong to.
  optional CameraName.Name name = 1;
  // Labels for that image; empty when the image has no labeled objects.
  repeated Label labels = 2;
}

CameraLabels 指示哪些注释存在于哪些相机上。

// Range images recorded by one lidar, identified by `name`.
message Laser {
  optional LaserName.Name name = 1;
  // Two range images for this lidar.
  // NOTE(review): going by the field names and the "first return" / "second
  // return" wording in the RangeImage comments, these are presumably the
  // first and second lidar returns — confirm.
  optional RangeImage ri_return1 = 2;
  optional RangeImage ri_return2 = 3;
}

Laser 表示某个激光雷达（由 name 标识）对应的 RangeImage，其中 ri_return1 和 ri_return2 各保存一幅距离图像。

// One frame of data: the shared context, the frame timestamp, the vehicle
// pose, the camera images, the lidar range images and all labels.
message Frame {
  // The following field numbers are reserved for third-party extensions. Users
  // may declare new fields in that range in their own .proto files without
  // having to edit the original file.
  extensions 1000 to max;

  // This context is the same for all frames belong to the same driving run
  // segment. Use context.name to identify frames belong to the same driving
  // segment. We do not store all frames from one driving segment in one proto
  // to avoid huge protos.
  optional Context context = 1;

  // Frame start time, which is the timestamp of the first top lidar spin
  // within this frame.
  // NOTE(review): the `_micros` suffix suggests microseconds; the unit is not
  // stated here — confirm.
  optional int64 timestamp_micros = 2;
  // The vehicle pose.
  optional Transform pose = 3;
  // Camera images of this frame (each entry carries its camera name).
  repeated CameraImage images = 4;
  // Lidar data of this frame (each entry carries its lidar name).
  repeated Laser lasers = 5;
  // Labels annotated on the lidar data.
  repeated Label laser_labels = 6;
  // Lidar labels (laser_labels) projected to camera images. A projected
  // label is the smallest image axis aligned rectangle that can cover all
  // projected points from the 3d lidar label. The projected label is ignored if
  // the projection is fully outside a camera image. The projected label is
  // clamped to the camera image if it is partially outside.
  repeated CameraLabels projected_lidar_labels = 9;
  // NOTE: if a camera identified by CameraLabels.name has an entry in this
  // field, then it has been labeled, even though it is possible that there are
  // no labeled objects in the corresponding image, which is identified by a
  // zero sized CameraLabels.labels.
  repeated CameraLabels camera_labels = 8;
  // No label zones in the *global* frame.
  // Polygon2dProto is not declared anywhere in this text; it has to come
  // from another definition. Note also that the last three fields use the
  // numbers 9, 8 and 7, i.e. not in declaration order.
  repeated Polygon2dProto no_label_zones = 7;
}

Frame 将上面的所有消息组合在一起，表示一帧数据。其中的每一项数据都以字段（键值对）的形式保存。