hough_voting_gpu

hough_voting_gpu_op.cc

/* Copyright 2015 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Hough voting Op

#include <stdio.h>
#include <cfloat>
#include <math.h>
#include <time.h>
#include <algorithm>
#include <Eigen/Geometry> 
#include "opencv2/opencv.hpp"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"

#define VERTEX_CHANNELS 3

using namespace tensorflow;
typedef Eigen::ThreadPoolDevice CPUDevice;

REGISTER_OP("Houghvotinggpu")
    .Attr("T: {float, double}")
    .Attr("is_train: int")
    .Attr("threshold_vote: int")
    .Attr("skip_pixels: int")
    .Input("bottom_label: int32")
    .Input("bottom_vertex: T")
    .Input("bottom_extents: T")
    .Input("bottom_meta_data: T")
    .Input("bottom_gt: T")
    .Output("top_box: T")
    .Output("top_pose: T")
    .Output("top_target: T")
    .Output("top_weight: T")
    .Output("top_domain: int32");
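
// Interface summary (inferred from the kernel code below; a reading aid, not
// authoritative documentation):
//   bottom_label:     (batch, height, width) int32 semantic label per pixel
//   bottom_vertex:    (batch, height, width, 3 * num_classes) per-class center
//                     direction (u, v); the third channel is exponentiated to
//                     obtain the center distance
//   bottom_extents:   (num_classes, 3) object extents used to build 3D boxes
//   bottom_meta_data: per-image metadata; the 3x3 intrinsic matrix occupies
//                     entries 0..8 (fx = entry 0, px = 2, fy = 4, py = 5)
//   bottom_gt:        (num_gt, 13) ground-truth poses: batch id, class id, ...,
//                     quaternion at 6..9, translation at 10..12
//   top_box:    (num, 7)  batch_index, cls, x1, y1, x2, y2, score
//   top_pose:   (num, 7)  quaternion (w, x, y, z) and translation (tx, ty, tz)
//   top_target: (num, 4 * num_classes) quaternion regression targets, filled
//               only for ROIs that overlap a ground-truth box
//   top_weight: (num, 4 * num_classes) weights, 1 where a target is set
//   top_domain: (num,)    1 when no ground truth is given, 0 otherwise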

REGISTER_OP("HoughvotinggpuGrad")
    .Attr("T: {float, double}")
    .Input("bottom_label: int32")
    .Input("bottom_vertex: T")
    .Input("grad: T")
    .Output("output_label: T")
    .Output("output_vertex: T");

int clamp(int val, int min_val, int max_val)
{
  return std::max(min_val, std::min(max_val, val));
}

void getBb3Ds(const float* extents, std::vector<std::vector<cv::Point3f>>& bb3Ds, int num_classes);
inline std::vector<cv::Point3f> getBB3D(const cv::Vec<float, 3>& extent);
inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector<cv::Point3f>& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec);
inline float getIoU(const cv::Rect& bb1, const cv::Rect bb2);
inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p);

void hough_voting(const int* labelmap, const float* vertmap, std::vector<std::vector<cv::Point3f>> bb3Ds,
  int batch, int height, int width, int num_classes, int is_train,
  float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> >& outputs);

void compute_target_weight(int height, int width, float* target, float* weight, std::vector<std::vector<cv::Point3f>> bb3Ds, 
  const float* poses_gt, int num_gt, int num_classes, float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> > outputs);

inline void compute_width_height(const int* labelmap, const float* vertmap, cv::Point2f center, 
  std::vector<std::vector<cv::Point3f>> bb3Ds, cv::Mat camMat, float inlierThreshold, 
  int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance);

// cuda functions
void HoughVotingLaucher(OpKernelContext* context,
    const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
    const int batch_index, const int height, const int width, const int num_classes, const int num_gt, 
    const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels,
    float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d);

void allocate_outputs(OpKernelContext* context, Tensor* top_box_tensor, Tensor* top_pose_tensor, Tensor* top_target_tensor, Tensor* top_weight_tensor, Tensor* top_domain_tensor, Tensor* top_rois_tensor, int num_classes)
{
  int num = 1024;
  int dims[2];

  dims[0] = num;
  dims[1] = 7;
  TensorShape output_shape;
  TensorShapeUtils::MakeShape(dims, 2, &output_shape);
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape, top_box_tensor));

  dims[1] = 7;
  TensorShape output_shape_1;
  TensorShapeUtils::MakeShape(dims, 2, &output_shape_1);
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_1, top_pose_tensor));

  dims[1] = 4 * num_classes;
  TensorShape output_shape_2;
  TensorShapeUtils::MakeShape(dims, 2, &output_shape_2);
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_2, top_target_tensor));
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_2, top_weight_tensor));

  TensorShape output_shape_3;
  TensorShapeUtils::MakeShape(&num, 1, &output_shape_3);
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_3, top_domain_tensor));

  int len = 1;
  TensorShape output_shape_4;
  TensorShapeUtils::MakeShape(&len, 1, &output_shape_4);
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_4, top_rois_tensor));
}

void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes);
void copy_num_rois(int* num_rois, int* num_rois_device);

void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
  float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois);

void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes);

template <typename Device, typename T>
class HoughvotinggpuOp : public OpKernel {
 public:
  explicit HoughvotinggpuOp(OpKernelConstruction* context) : OpKernel(context) {
    // Get the attributes
    OP_REQUIRES_OK(context,
                   context->GetAttr("is_train", &is_train_));
    // Check that is_train is non-negative
    OP_REQUIRES(context, is_train_ >= 0,
                errors::InvalidArgument("Need is_train >= 0, got ",
                                        is_train_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("threshold_vote", &threshold_vote_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("skip_pixels", &skip_pixels_));

  }

  // bottom_label: (batch_size, height, width)
  // bottom_vertex: (batch_size, height, width, 3 * num_classes)
  // top_box: (num, 7) i.e., batch_index, cls, x1, y1, x2, y2, score
  void Compute(OpKernelContext* context) override 
  {
    // Grab the input tensor
    const Tensor& bottom_label = context->input(0);
    const Tensor& bottom_vertex = context->input(1);
    const Tensor& bottom_extents = context->input(2);

    // format of the meta_data
    // intrinsic matrix: meta_data[0 ~ 8]
    // inverse intrinsic matrix: meta_data[9 ~ 17]
    // pose_world2live: meta_data[18 ~ 29]
    // pose_live2world: meta_data[30 ~ 41]
    // voxel step size: meta_data[42, 43, 44]
    // voxel min value: meta_data[45, 46, 47]
    const Tensor& bottom_meta_data = context->input(3);
    auto meta_data = bottom_meta_data.flat<T>();

    const Tensor& bottom_gt = context->input(4);
    const float* gt = bottom_gt.flat<float>().data();

    // label should be 3-dimensional and vertex should be 4-dimensional.
    OP_REQUIRES(context, bottom_label.dims() == 3,
                errors::InvalidArgument("label must be 3-dimensional"));

    OP_REQUIRES(context, bottom_vertex.dims() == 4,
                errors::InvalidArgument("vertex must be 4-dimensional"));

    // batch size
    int batch_size = bottom_label.dim_size(0);
    // height
    int height = bottom_label.dim_size(1);
    // width
    int width = bottom_label.dim_size(2);
    // num of classes
    int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;
    int num_meta_data = bottom_meta_data.dim_size(3);
    int num_gt = bottom_gt.dim_size(0);

    // for each image, run hough voting
    std::vector<cv::Vec<float, 14> > outputs;
    const float* extents = bottom_extents.flat<float>().data();

    // bb3Ds
    std::vector<std::vector<cv::Point3f>> bb3Ds;
    getBb3Ds(extents, bb3Ds, num_classes);

    int index_meta_data = 0;
    float fx, fy, px, py;
    for (int n = 0; n < batch_size; n++)
    {
      const int* labelmap = bottom_label.flat<int>().data() + n * height * width;
      const float* vertmap = bottom_vertex.flat<float>().data() + n * height * width * VERTEX_CHANNELS * num_classes;
      fx = meta_data(index_meta_data + 0);
      fy = meta_data(index_meta_data + 4);
      px = meta_data(index_meta_data + 2);
      py = meta_data(index_meta_data + 5);
      hough_voting(labelmap, vertmap, bb3Ds, n, height, width, num_classes, is_train_, fx, fy, px, py, outputs);
      index_meta_data += num_meta_data;
    }

    if (outputs.size() == 0)
    {
      std::cout << "no detection" << std::endl;
      // add a dummy detection to the output
      cv::Vec<float, 14> roi;
      roi(0) = 0;
      roi(1) = -1;
      outputs.push_back(roi);
    }

    // Create output tensors
    // top_box
    int dims[2];
    dims[0] = outputs.size();
    dims[1] = 7;
    TensorShape output_shape;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape);

    Tensor* top_box_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_box_tensor));
    float* top_box = top_box_tensor->template flat<float>().data();

    // top_pose
    dims[1] = 7;
    TensorShape output_shape_pose;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape_pose);

    Tensor* top_pose_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_pose, &top_pose_tensor));
    float* top_pose = top_pose_tensor->template flat<float>().data();

    // top target
    dims[1] = 4 * num_classes;
    TensorShape output_shape_target;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape_target);

    Tensor* top_target_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(2, output_shape_target, &top_target_tensor));
    float* top_target = top_target_tensor->template flat<float>().data();
    memset(top_target, 0, outputs.size() * 4 * num_classes *sizeof(T));

    // top weight
    Tensor* top_weight_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(3, output_shape_target, &top_weight_tensor));
    float* top_weight = top_weight_tensor->template flat<float>().data();
    memset(top_weight, 0, outputs.size() * 4 * num_classes *sizeof(T));

    // top domain
    int num = outputs.size();
    TensorShape output_shape_domain;
    TensorShapeUtils::MakeShape(&num, 1, &output_shape_domain);
    Tensor* top_domain_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(4, output_shape_domain, &top_domain_tensor));
    int* top_domain = top_domain_tensor->template flat<int>().data();
    memset(top_domain, 0, outputs.size() * sizeof(int));
    
    for(int n = 0; n < outputs.size(); n++)
    {
      cv::Vec<float, 14> roi = outputs[n];

      for (int i = 0; i < 7; i++)
        top_box[n * 7 + i] = roi(i);

      for (int i = 0; i < 7; i++)
        top_pose[n * 7 + i] = roi(7 + i);

      if (num_gt == 0)
        top_domain[n] = 1;
      else
        top_domain[n] = 0;
    }

    if (is_train_)
      compute_target_weight(height, width, top_target, top_weight, bb3Ds, gt, num_gt, num_classes, fx, fy, px, py, outputs);
  }
 private:
  int is_train_;
  int threshold_vote_;
  int skip_pixels_;
};

REGISTER_KERNEL_BUILDER(Name("Houghvotinggpu").Device(DEVICE_CPU).TypeConstraint<float>("T"), HoughvotinggpuOp<CPUDevice, float>);

template <class T>
class HoughvotinggpuOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit HoughvotinggpuOp(OpKernelConstruction* context) : OpKernel(context) 
  {
    // Get the attributes
    OP_REQUIRES_OK(context,
                   context->GetAttr("is_train", &is_train_));
    // Check that is_train is non-negative
    OP_REQUIRES(context, is_train_ >= 0,
                errors::InvalidArgument("Need is_train >= 0, got ",
                                        is_train_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("threshold_vote", &threshold_vote_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("skip_pixels", &skip_pixels_));
  }

  void Compute(OpKernelContext* context) override 
  {
    // Grab the input tensor
    const Tensor& bottom_label = context->input(0);
    const Tensor& bottom_vertex = context->input(1);

    // label should be 3-dimensional and vertex should be 4-dimensional.
    OP_REQUIRES(context, bottom_label.dims() == 3,
                errors::InvalidArgument("label must be 3-dimensional"));

    OP_REQUIRES(context, bottom_vertex.dims() == 4,
                errors::InvalidArgument("vertex must be 4-dimensional"));

    const Tensor& bottom_extents = context->input(2);
    const float* extents = bottom_extents.flat<float>().data();

    // format of the meta_data
    // intrinsic matrix: meta_data[0 ~ 8]
    // inverse intrinsic matrix: meta_data[9 ~ 17]
    // pose_world2live: meta_data[18 ~ 29]
    // pose_live2world: meta_data[30 ~ 41]
    // voxel step size: meta_data[42, 43, 44]
    // voxel min value: meta_data[45, 46, 47]
    const Tensor& bottom_meta_data = context->input(3);

    const Tensor& bottom_gt = context->input(4);
    const float* gt = bottom_gt.flat<float>().data();

    int batch_size = bottom_label.dim_size(0);
    int height = bottom_label.dim_size(1);
    int width = bottom_label.dim_size(2);
    int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;
    int num_meta_data = bottom_meta_data.dim_size(3);
    int num_gt = bottom_gt.dim_size(0);

    float inlierThreshold = 0.9;
    int labelThreshold = 500;
    Tensor top_box_tensor_tmp, top_pose_tensor_tmp, top_target_tensor_tmp, top_weight_tensor_tmp, top_domain_tensor_tmp, num_rois_tensor_tmp;
    allocate_outputs(context, &top_box_tensor_tmp, &top_pose_tensor_tmp, &top_target_tensor_tmp, &top_weight_tensor_tmp, 
      &top_domain_tensor_tmp, &num_rois_tensor_tmp, num_classes);
    float* top_box = top_box_tensor_tmp.flat<float>().data();
    float* top_pose = top_pose_tensor_tmp.flat<float>().data();
    float* top_target = top_target_tensor_tmp.flat<float>().data();
    float* top_weight = top_weight_tensor_tmp.flat<float>().data();
    int* top_domain = top_domain_tensor_tmp.flat<int>().data();
    int* num_rois_device = num_rois_tensor_tmp.flat<int>().data();
    reset_outputs(top_box, top_pose, top_target, top_weight, top_domain, num_rois_device, num_classes);

    for (int n = 0; n < batch_size; n++)
    {
      const int* labelmap = bottom_label.flat<int>().data() + n * height * width;
      const float* vertmap = bottom_vertex.flat<float>().data() + n * height * width * VERTEX_CHANNELS * num_classes;
      const float* meta_data = bottom_meta_data.flat<float>().data() + n * num_meta_data;
      HoughVotingLaucher(context, labelmap, vertmap, extents, meta_data, gt, n, height, width, num_classes, num_gt,
        is_train_, inlierThreshold, labelThreshold, threshold_vote_, skip_pixels_,
        top_box, top_pose, top_target, top_weight, top_domain, num_rois_device, context->eigen_device<Eigen::GpuDevice>());
    }

    int num_rois;
    copy_num_rois(&num_rois, num_rois_device);
    // dummy output
    if (num_rois == 0)
      num_rois = 1;

    // Create output tensors
    // top_box
    int dims[2];
    dims[0] = num_rois;
    dims[1] = 7;
    TensorShape output_shape;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape);

    Tensor* top_box_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_box_tensor));
    float* top_box_final = top_box_tensor->flat<float>().data();

    // top_pose
    dims[1] = 7;
    TensorShape output_shape_pose;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape_pose);

    Tensor* top_pose_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_pose, &top_pose_tensor));
    float* top_pose_final = top_pose_tensor->flat<float>().data();

    // top target
    dims[1] = 4 * num_classes;
    TensorShape output_shape_target;
    TensorShapeUtils::MakeShape(dims, 2, &output_shape_target);

    Tensor* top_target_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(2, output_shape_target, &top_target_tensor));
    float* top_target_final = top_target_tensor->flat<float>().data();

    // top weight
    Tensor* top_weight_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(3, output_shape_target, &top_weight_tensor));
    float* top_weight_final = top_weight_tensor->flat<float>().data();

    // top domain
    TensorShape output_shape_domain;
    TensorShapeUtils::MakeShape(&num_rois, 1, &output_shape_domain);
    Tensor* top_domain_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(4, output_shape_domain, &top_domain_tensor));
    int* top_domain_final = top_domain_tensor->template flat<int>().data();

    copy_outputs(top_box, top_pose, top_target, top_weight, top_domain, top_box_final, 
      top_pose_final, top_target_final, top_weight_final, top_domain_final, num_classes, num_rois);
  }
 private:
  int is_train_;
  int threshold_vote_;
  int skip_pixels_;
};

REGISTER_KERNEL_BUILDER(Name("Houghvotinggpu").Device(DEVICE_GPU).TypeConstraint<float>("T"), HoughvotinggpuOp<Eigen::GpuDevice, float>);

// compute gradient
template <class Device, class T>
class HoughvotinggpuGradOp : public OpKernel {
 public:
  explicit HoughvotinggpuGradOp(OpKernelConstruction* context) : OpKernel(context) {
  }

  void Compute(OpKernelContext* context) override 
  {
    // Grab the input tensor
    const Tensor& bottom_label = context->input(0);
    const Tensor& bottom_vertex = context->input(1);

    // label should be 3-dimensional and vertex should be 4-dimensional.
    OP_REQUIRES(context, bottom_label.dims() == 3,
                errors::InvalidArgument("label must be 3-dimensional"));

    OP_REQUIRES(context, bottom_vertex.dims() == 4,
                errors::InvalidArgument("vertex must be 4-dimensional"));

    // batch size
    int batch_size = bottom_label.dim_size(0);
    // height
    int height = bottom_label.dim_size(1);
    // width
    int width = bottom_label.dim_size(2);
    // num of classes
    int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;

    // construct the output shape
    TensorShape output_shape = bottom_label.shape();
    Tensor* top_label_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_label_tensor));
    float* top_label = top_label_tensor->flat<float>().data();

    TensorShape output_shape_1 = bottom_vertex.shape();
    Tensor* top_vertex_tensor = NULL;
    OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_1, &top_vertex_tensor));
    float* top_vertex = top_vertex_tensor->flat<float>().data();

    set_gradients(top_label, top_vertex, batch_size, height, width, num_classes);
  }
};

// REGISTER_KERNEL_BUILDER(Name("HoughvotinggpuGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"), HoughvotinggpuGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("HoughvotinggpuGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"), HoughvotinggpuGradOp<Eigen::GpuDevice, float>);

void hough_voting(const int* labelmap, const float* vertmap, std::vector<std::vector<cv::Point3f>> bb3Ds, 
  int batch, int height, int width, int num_classes, int is_train,
  float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> >& outputs)
{
  float inlierThreshold = 0.9;
  int votingThreshold = 50;

  // camera matrix
  cv::Mat_<float> camMat = cv::Mat_<float>::zeros(3, 3);
  camMat(0, 0) = fx;
  camMat(1, 1) = fy;
  camMat(2, 2) = 1.f;
  camMat(0, 2) = px;
  camMat(1, 2) = py;

  // initialize hough space
  int* hough_space = (int*)malloc(sizeof(int) * height * width * num_classes);
  memset(hough_space, 0, sizeof(int) * height * width * num_classes);

  int* flags = (int*)malloc(sizeof(int) * num_classes);
  memset(flags, 0, sizeof(int) * num_classes);

  // for each pixel
  for (int x = 0; x < width; x++)
  {
    for (int y = 0; y < height; y++)
    {
      int c = labelmap[y * width + x];
      if (c > 0)
      {
        flags[c] = 1;
        // read the predict center direction
        int offset = VERTEX_CHANNELS * c + VERTEX_CHANNELS * num_classes * (y * width + x);
        float u = vertmap[offset];
        float v = vertmap[offset + 1];
        float norm = sqrt(u * u + v * v);
        u /= norm;
        v /= norm;

        // voting
        float delta = 1.0 / fabs(u);
        float cx = x;
        float cy = y;
        while(1)
        {
          cx += delta * u;
          cy += delta * v;
          int center_x = int(cx);
          int center_y = int(cy);
          if (center_x >= 0 && center_x < width && center_y >= 0 && center_y < height)
          {
            offset = c + num_classes * (center_y * width + center_x);
            hough_space[offset] += 1;
          }
          else
            break;
        }
      }
    }
  }

  // find the maximum in hough space
  for (int c = 1; c < num_classes; c++)
  {
    if (flags[c])
    {
      int max_vote = 0;
      int max_x = 0, max_y = 0;
      for (int x = 0; x < width; x++)
      {
        for (int y = 0; y < height; y++)
        {
          int offset = c + num_classes * (y * width + x);
          if (hough_space[offset] > max_vote)
          {
            max_vote = hough_space[offset];
            max_x = x;
            max_y = y;
          }
        }
      }
      if (max_vote < votingThreshold)
        continue;

      // center
      cv::Point2f center(max_x, max_y);
      int bb_width, bb_height;
      float bb_distance;
      compute_width_height(labelmap, vertmap, center, bb3Ds, camMat, inlierThreshold, height, width, c, num_classes, bb_width, bb_height, bb_distance);

      // construct output
      cv::Vec<float, 14> roi;
      roi(0) = batch;
      roi(1) = c;

      // bounding box
      float scale = 0.05;
      roi(2) = center.x - bb_width * (0.5 + scale);
      roi(3) = center.y - bb_height * (0.5 + scale);
      roi(4) = center.x + bb_width * (0.5 + scale);
      roi(5) = center.y + bb_height * (0.5 + scale);

      // score
      roi(6) = max_vote;

      // pose
      float rx = (center.x - px) / fx;
      float ry = (center.y - py) / fy;
      roi(7) = 1;
      roi(8) = 0;
      roi(9) = 0;
      roi(10) = 0;
      roi(11) = rx * bb_distance;
      roi(12) = ry * bb_distance;
      roi(13) = bb_distance;

      outputs.push_back(roi);

      if (is_train)
      {
        // add jittering rois
        float x1 = roi(2);
        float y1 = roi(3);
        float x2 = roi(4);
        float y2 = roi(5);
        float ww = x2 - x1;
        float hh = y2 - y1;

        // (-1, -1)
        roi(2) = x1 - 0.05 * ww;
        roi(3) = y1 - 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (+1, -1)
        roi(2) = x1 + 0.05 * ww;
        roi(3) = y1 - 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (-1, +1)
        roi(2) = x1 - 0.05 * ww;
        roi(3) = y1 + 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (+1, +1)
        roi(2) = x1 + 0.05 * ww;
        roi(3) = y1 + 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (0, -1)
        roi(2) = x1;
        roi(3) = y1 - 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (-1, 0)
        roi(2) = x1 - 0.05 * ww;
        roi(3) = y1;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (0, +1)
        roi(2) = x1;
        roi(3) = y1 + 0.05 * hh;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);

        // (+1, 0)
        roi(2) = x1 + 0.05 * ww;
        roi(3) = y1;
        roi(4) = roi(2) + ww;
        roi(5) = roi(3) + hh;
        outputs.push_back(roi);
      }
    }
  }

  // release the hough space buffers
  free(hough_space);
  free(flags);
}

// cosine of the angle between the predicted direction n and the vector from
// pixel p to the candidate center x (close to 1 when p points toward x)
inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p)
{
  return n.dot(x - p) / (cv::norm(n) * cv::norm(x - p));
}

inline void compute_width_height(const int* labelmap, const float* vertmap, cv::Point2f center, 
  std::vector<std::vector<cv::Point3f>> bb3Ds, cv::Mat camMat, float inlierThreshold, 
  int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance)
{
  float d = 0;
  int count = 0;

  // for each pixel
  std::vector<float> dx;
  std::vector<float> dy;
  for (int x = 0; x < width; x++)
  {
    for (int y = 0; y < height; y++)
    {
      if (labelmap[y * width + x] == channel)
      {
        cv::Point2f point(x, y);
  
        // read out object coordinate
        int offset = VERTEX_CHANNELS * channel + VERTEX_CHANNELS * num_classes * (y * width + x);
        float u = vertmap[offset];
        float v = vertmap[offset + 1];
        float distance = exp(vertmap[offset + 2]);
        float norm = sqrt(u * u + v * v);
        u /= norm;
        v /= norm;
        cv::Point2f direction(u, v);

        // inlier check
        if(angle_distance(center, direction, point) > inlierThreshold)
        {
          dx.push_back(fabs(point.x - center.x));
          dy.push_back(fabs(point.y - center.y));
          d += distance;
          count++;
        }
      }
    }
  }
  bb_distance = d / count;

  // estimate a projection
  cv::Mat tvec(3, 1, CV_64F);
  cv::Mat rvec(3, 1, CV_64F);
  for(int i = 0; i < 3; i++)
  {
    tvec.at<double>(i, 0) = 0;
    rvec.at<double>(i, 0) = 0;
  }
  tvec.at<double>(2, 0) = bb_distance;
  std::vector<cv::Point2f> bb2D;
  cv::projectPoints(bb3Ds[channel-1], rvec, tvec, camMat, cv::Mat(), bb2D);
    
  // get min-max of projected vertices
  int minX = 1e8;
  int maxX = -1e8;
  int minY = 1e8;
  int maxY = -1e8;
  for(int i = 0; i < bb2D.size(); i++)
  {
    minX = std::min((float) minX, bb2D[i].x);
    minY = std::min((float) minY, bb2D[i].y);
    maxX = std::max((float) maxX, bb2D[i].x);
    maxY = std::max((float) maxY, bb2D[i].y);
  }
  cv::Rect bb = cv::Rect(0, 0, (maxX - minX + 1), (maxY - minY + 1));

  // discard offsets larger than the projected box size
  float max_extent = std::max(bb.width, bb.height);
  std::vector<float>::iterator it;
  it = std::remove_if(dx.begin(), dx.end(), [max_extent](float v) { return v > max_extent; });
  dx.erase(it, dx.end());

  it = std::remove_if(dy.begin(), dy.end(), [max_extent](float v) { return v > max_extent; });
  dy.erase(it, dy.end());

  std::sort(dx.begin(), dx.end());
  std::sort(dy.begin(), dy.end());

  bb_width = 2 * dx[int(dx.size() * 0.95)];
  bb_height = 2 * dy[int(dy.size() * 0.95)];
}


// compute the pose target and weight
void compute_target_weight(int height, int width, float* target, float* weight, std::vector<std::vector<cv::Point3f>> bb3Ds, 
  const float* poses_gt, int num_gt, int num_classes, float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> > outputs)
{
  int num = outputs.size();
  float threshold = 0.2;

  // camera matrix
  cv::Mat_<float> camMat = cv::Mat_<float>::zeros(3, 3);
  camMat(0, 0) = fx;
  camMat(1, 1) = fy;
  camMat(2, 2) = 1.f;
  camMat(0, 2) = px;
  camMat(1, 2) = py;

  // compute the gt boxes
  std::vector<cv::Rect> bb2Ds_gt(num_gt);
  for (int i = 0; i < num_gt; i++)
  {
    Eigen::Quaternionf quaternion(poses_gt[i * 13 + 6], poses_gt[i * 13 + 7], poses_gt[i * 13 + 8], poses_gt[i * 13 + 9]);
    Eigen::Matrix3f rmatrix = quaternion.toRotationMatrix();
    cv::Mat rmat_trans = cv::Mat(3, 3, CV_32F, rmatrix.data());
    cv::Mat rmat;
    cv::transpose(rmat_trans, rmat);
    cv::Mat rvec(3, 1, CV_64F);
    cv::Rodrigues(rmat, rvec);
    cv::Mat tvec(3, 1, CV_64F);
    tvec.at<double>(0, 0) = poses_gt[i * 13 + 10];
    tvec.at<double>(1, 0) = poses_gt[i * 13 + 11];
    tvec.at<double>(2, 0) = poses_gt[i * 13 + 12];

    int objID = int(poses_gt[i * 13 + 1]);
    std::vector<cv::Point3f> bb3D = bb3Ds[objID-1];
    bb2Ds_gt[i] = getBB2D(width, height, bb3D, camMat, rvec, tvec);
  }

  for (int i = 0; i < num; i++)
  {
    cv::Vec<float, 14> roi = outputs[i];
    int batch_id = int(roi(0));
    int class_id = int(roi(1));

    // find the gt index
    int gt_ind = -1;
    for (int j = 0; j < num_gt; j++)
    {
      int gt_batch = int(poses_gt[j * 13 + 0]);
      int gt_id = int(poses_gt[j * 13 + 1]);
      if(class_id == gt_id && batch_id == gt_batch)
      {
        gt_ind = j;
        break;
      }
    }

    if (gt_ind == -1)
      continue;

    // compute bounding box overlap
    float x1 = roi(2);
    float y1 = roi(3);
    float x2 = roi(4);
    float y2 = roi(5);
    cv::Rect bb2D(x1, y1, x2-x1, y2-y1);

    float overlap = getIoU(bb2D, bb2Ds_gt[gt_ind]);
    if (overlap < threshold)
      continue;

    target[i * 4 * num_classes + 4 * class_id + 0] = poses_gt[gt_ind * 13 + 6];
    target[i * 4 * num_classes + 4 * class_id + 1] = poses_gt[gt_ind * 13 + 7];
    target[i * 4 * num_classes + 4 * class_id + 2] = poses_gt[gt_ind * 13 + 8];
    target[i * 4 * num_classes + 4 * class_id + 3] = poses_gt[gt_ind * 13 + 9];

    weight[i * 4 * num_classes + 4 * class_id + 0] = 1;
    weight[i * 4 * num_classes + 4 * class_id + 1] = 1;
    weight[i * 4 * num_classes + 4 * class_id + 2] = 1;
    weight[i * 4 * num_classes + 4 * class_id + 3] = 1;
  }
}


// get 3D bounding boxes
void getBb3Ds(const float* extents, std::vector<std::vector<cv::Point3f>>& bb3Ds, int num_classes)
{
  // for each object
  for (int i = 1; i < num_classes; i++)
  {
    cv::Vec<float, 3> extent;
    extent(0) = extents[i * 3];
    extent(1) = extents[i * 3 + 1];
    extent(2) = extents[i * 3 + 2];

    bb3Ds.push_back(getBB3D(extent));
  }
}


inline std::vector<cv::Point3f> getBB3D(const cv::Vec<float, 3>& extent)
{
  std::vector<cv::Point3f> bb;  
  float xHalf = extent[0] * 0.5;
  float yHalf = extent[1] * 0.5;
  float zHalf = extent[2] * 0.5;
    
  bb.push_back(cv::Point3f(xHalf, yHalf, zHalf));
  bb.push_back(cv::Point3f(-xHalf, yHalf, zHalf));
  bb.push_back(cv::Point3f(xHalf, -yHalf, zHalf));
  bb.push_back(cv::Point3f(-xHalf, -yHalf, zHalf));
    
  bb.push_back(cv::Point3f(xHalf, yHalf, -zHalf));
  bb.push_back(cv::Point3f(-xHalf, yHalf, -zHalf));
  bb.push_back(cv::Point3f(xHalf, -yHalf, -zHalf));
  bb.push_back(cv::Point3f(-xHalf, -yHalf, -zHalf));
    
  return bb;
}


inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector<cv::Point3f>& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec)
{    
  // project 3D bounding box vertices into the image
  std::vector<cv::Point2f> bb2D;
  cv::projectPoints(bb3D, rvec, tvec, camMat, cv::Mat(), bb2D);
    
  // get min-max of projected vertices
  int minX = imageWidth - 1;
  int maxX = 0;
  int minY = imageHeight - 1;
  int maxY = 0;
    
  for(unsigned j = 0; j < bb2D.size(); j++)
  {
    minX = std::min((float) minX, bb2D[j].x);
    minY = std::min((float) minY, bb2D[j].y);
    maxX = std::max((float) maxX, bb2D[j].x);
    maxY = std::max((float) maxY, bb2D[j].y);
  }
    
  // clamp at image border
  minX = clamp(minX, 0, imageWidth - 1);
  maxX = clamp(maxX, 0, imageWidth - 1);
  minY = clamp(minY, 0, imageHeight - 1);
  maxY = clamp(maxY, 0, imageHeight - 1);
    
  return cv::Rect(minX, minY, (maxX - minX + 1), (maxY - minY + 1));
}


inline float getIoU(const cv::Rect& bb1, const cv::Rect bb2)
{
  cv::Rect intersection = bb1 & bb2;
  return (intersection.area() / (float) (bb1.area() + bb2.area() - intersection.area()));
}

hough_voting_gpu_op.cu.cc
#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>
#include <cfloat>
#include <time.h>
#include <thrust/extrema.h>
#include <Eigen/Geometry> 
#include <cublas_v2.h>
#include "hough_voting_gpu_op.h"

#define VERTEX_CHANNELS 3

// grid-stride loop: each thread handles indices i, i + blockDim.x * gridDim.x, ...
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

// namespace tensorflow {
using namespace tensorflow;

__device__ inline float point2line(int cx, int cy, int x, int y, float u, float v)
{
  float n1 = -v;
  float n2 = u;

  return fabs(n1 * (cx - x) + n2 * (cy - y)) / sqrt(n1 * n1 + n2 * n2);
}


__device__ inline float angle_distance(int cx, int cy, int x, int y, float u, float v)
{
  float dx = cx - x;
  float dy = cy - y;
  float n1 = sqrt(u * u + v * v);
  float n2 = sqrt(dx * dx + dy * dy);
  float dot = u * dx + v * dy;
  float distance = dot / (n1 * n2);

  return distance;
}

__device__ inline float angle_distance_label(int cx, int cy, int x, int y, float u, float v, 
  int cls, const int height, const int width, const int* labelmap)
{
  float dx = cx - x;
  float dy = cy - y;
  float n1 = sqrt(u * u + v * v);
  float n2 = sqrt(dx * dx + dy * dy);
  float dot = u * dx + v * dy;
  float distance = dot / (n1 * n2);

  int num = 20;
  int count = 0;
  for (int i = 1; i <= num; i++)
  {
    float step = float(i) / float(num);
    int px = int(x + step * dx);
    int py = int(y + step * dy);
    if (px >= 0 && px < width && py >= 0 && py < height)
    {
      if (labelmap[py * width + px] == cls)
        count++;
    }
  }
  if ((float)count / float(num) < 0.8)
    distance = 0;

  return distance;
}

__device__ inline float IoU(float* a, float* b) 
{
  float left = fmax(a[0], b[0]), right = fmin(a[2], b[2]);
  float top = fmax(a[1], b[1]), bottom = fmin(a[3], b[3]);
  float width = fmax(right - left + 1, 0.f), height = fmax(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__device__ inline void project_box(int cls, const float* extents, const float* meta_data, float distance, float* threshold)
{
  float xHalf = extents[cls * 3 + 0] * 0.5;
  float yHalf = extents[cls * 3 + 1] * 0.5;
  float zHalf = extents[cls * 3 + 2] * 0.5;
  float bb3D[24];

  bb3D[0] = xHalf; bb3D[1] = yHalf; bb3D[2] = zHalf + distance;
  bb3D[3] = -xHalf; bb3D[4] = yHalf; bb3D[5] = zHalf + distance;
  bb3D[6] = xHalf; bb3D[7] = -yHalf; bb3D[8] = zHalf + distance;
  bb3D[9] = -xHalf; bb3D[10] = -yHalf; bb3D[11] = zHalf + distance;
  bb3D[12] = xHalf; bb3D[13] = yHalf; bb3D[14] = -zHalf + distance;
  bb3D[15] = -xHalf; bb3D[16] = yHalf; bb3D[17] = -zHalf + distance;
  bb3D[18] = xHalf; bb3D[19] = -yHalf; bb3D[20] = -zHalf + distance;
  bb3D[21] = -xHalf; bb3D[22] = -yHalf; bb3D[23] = -zHalf + distance;

  float fx = meta_data[0];
  float fy = meta_data[4];
  float px = meta_data[2];
  float py = meta_data[5];
  float minX = 1e8;
  float maxX = -1e8;
  float minY = 1e8;
  float maxY = -1e8;
  for (int i = 0; i < 8; i++)
  {
    float x = fx * (bb3D[i * 3] / bb3D[i * 3 + 2])  + px;
    float y = fy * (bb3D[i * 3 + 1] / bb3D[i * 3 + 2])  + py;
    minX = fmin(minX, x);
    minY = fmin(minY, y);
    maxX = fmax(maxX, x);
    maxY = fmax(maxY, y);
  }
  float width = maxX - minX + 1;
  float height = maxY - minY + 1;
  *threshold = fmax(width, height) * 0.6;
}


__device__ inline float compute_box_overlap(int cls, const float* extents, const float* meta_data, const float* pose, float* box)
{
  float xHalf = extents[cls * 3 + 0] * 0.5;
  float yHalf = extents[cls * 3 + 1] * 0.5;
  float zHalf = extents[cls * 3 + 2] * 0.5;

  Eigen::Matrix<float,8,3,Eigen::DontAlign> bb3D;
  bb3D(0, 0) = xHalf; bb3D(0, 1) = yHalf; bb3D(0, 2) = zHalf;
  bb3D(1, 0) = -xHalf; bb3D(1, 1) = yHalf; bb3D(1, 2) = zHalf;
  bb3D(2, 0) = xHalf; bb3D(2, 1) = -yHalf; bb3D(2, 2) = zHalf;
  bb3D(3, 0) = -xHalf; bb3D(3, 1) = -yHalf; bb3D(3, 2) = zHalf;
  bb3D(4, 0) = xHalf; bb3D(4, 1) = yHalf; bb3D(4, 2) = -zHalf;
  bb3D(5, 0) = -xHalf; bb3D(5, 1) = yHalf; bb3D(5, 2) = -zHalf;
  bb3D(6, 0) = xHalf; bb3D(6, 1)= -yHalf; bb3D(6, 2) = -zHalf;
  bb3D(7, 0) = -xHalf; bb3D(7, 1) = -yHalf; bb3D(7, 2) = -zHalf;

  // rotation
  Eigen::Quaternionf quaternion(pose[6], pose[7], pose[8], pose[9]);
  Eigen::Matrix3f rmatrix = quaternion.toRotationMatrix();
  Eigen::Matrix<float,3,8,Eigen::DontAlign> bb3D_new = rmatrix * bb3D.transpose();

  // projection
  float fx = meta_data[0];
  float fy = meta_data[4];
  float px = meta_data[2];
  float py = meta_data[5];
  float x1 = 1e8;
  float x2 = -1e8;
  float y1 = 1e8;
  float y2 = -1e8;
  for (int i = 0; i < 8; i++)
  {
    float X = bb3D_new(0, i) + pose[10];
    float Y = bb3D_new(1, i) + pose[11];
    float Z = bb3D_new(2, i) + pose[12];
    float x = fx * (X / Z)  + px;
    float y = fy * (Y / Z)  + py;
    x1 = fmin(x1, x);
    y1 = fmin(y1, y);
    x2 = fmax(x2, x);
    y2 = fmax(y2, y);
  }

  float box_gt[4];
  box_gt[0] = x1;
  box_gt[1] = y1;
  box_gt[2] = x2;
  box_gt[3] = y2;
  return IoU(box, box_gt);
}

__global__ void compute_arrays_kernel(const int nthreads, const int* labelmap,
    int* arrays, int* array_size, const int height, const int width) 
{
  CUDA_1D_KERNEL_LOOP(index, nthreads) 
  {
    int cls = labelmap[index];
    if (cls > 0)
    {
      int size = atomicAdd(array_size + cls, 1);
      int offset = cls * height * width + size;
      arrays[offset] = index;
    }
  }
}


__global__ void compute_hough_kernel(const int nthreads, float* hough_space, float* hough_data, const int* labelmap, 
    const float* vertmap, const float* extents, const float* meta_data, int* arrays, int* array_size, 
    int* class_indexes, const int height, const int width, const int num_classes, const int count, const float inlierThreshold, const int skip_pixels) 
{
  CUDA_1D_KERNEL_LOOP(index, nthreads) 
  {
    // (cls, cx, cy) is an element in the hough space
    int ind = index / (height * width);
    int cls = class_indexes[ind];
    int n = index % (height * width);
    int cx = n % width;
    int cy = n / width;

    int size = array_size[cls];
    float distance = 0;
    float bb_width = -1;
    float bb_height = -1;
    float threshold;
    for (int i = 0; i < size; i += skip_pixels)
    {
      int offset = cls * height * width + i;
      int location = arrays[offset];
      int x = location % width;
      int y = location / width;

      // read the direction
      offset = VERTEX_CHANNELS * cls + VERTEX_CHANNELS * num_classes * (y * width + x);
      float u = vertmap[offset];
      float v = vertmap[offset + 1];
      float d = exp(vertmap[offset + 2]);

      // vote
      if (angle_distance(cx, cy, x, y, u, v) > inlierThreshold)
      // if (point2line(cx, cy, x, y, u, v) < 1 && angle_distance_label(cx, cy, x, y, u, v, cls, height, width, labelmap) > 0)
      {
        project_box(cls, extents, meta_data, d, &threshold);
        float dx = fabsf(x - cx);
        float dy = fabsf(y - cy);
        if (dx < threshold && dy < threshold)
        {
          hough_space[index]++;
          distance += d;
        }
        if (dx > bb_width && dx < threshold && dy < threshold)
          bb_width = dx;
        if (dy > bb_height && dx < threshold && dy < threshold)
          bb_height = dy;
      }
    }

    if (hough_space[index] > 0)
    {
      distance /= hough_space[index];
      int offset = ind * height * width * 3 + 3 * (cy * width + cx);
      hough_data[offset] = distance;
      hough_data[offset + 1] = 2 * bb_height;
      hough_data[offset + 2] = 2 * bb_width;
    }
  }
}


__global__ void compute_max_indexes_kernel(const int nthreads, int* max_indexes, int* num_max, float* hough_space, 
  int height, int width, float threshold)
{
  CUDA_1D_KERNEL_LOOP(index, nthreads) 
  {
    // (ind, cx, cy) is an element in the hough space
    int ind = index / (height * width);
    int n = index % (height * width);
    int cx = n % width;
    int cy = n / width;
    int kernel_size = 3;

    if (hough_space[index] > threshold)
    {
      // check if the location is local maximum
      int flag = 0;
      for (int x = cx - kernel_size; x <= cx + kernel_size; x++)
      {
        for (int y = cy - kernel_size; y <= cy + kernel_size; y++)
        {
          if (x >= 0 && x < width && y >= 0 && y < height)
          {
            if (hough_space[ind * height * width + y * width + x] > hough_space[index])
            {
              flag = 1;
              break;
            }
          }
        }
      }

      if (flag == 0)
      {
        // add the location to max_indexes
        int max_index = atomicAdd(num_max, 1);
        max_indexes[max_index] = index;
      }
    }
  }
}


__global__ void compute_rois_kernel(const int nthreads, float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
    const float* extents, const float* meta_data, const float* gt, float* hough_space, float* hough_data, int* max_indexes, int* class_indexes,
    int is_train, int batch_index, const int height, const int width, const int num_classes, const int num_gt, int* num_rois) 
{
  CUDA_1D_KERNEL_LOOP(index, nthreads) 
  {
    float scale = 0.05;
    int max_index = max_indexes[index];
    int ind = max_index / (height * width);
    int cls = class_indexes[ind];
    int n = max_index % (height * width);
    int x = n % width;
    int y = n / width;

    float fx = meta_data[0];
    float fy = meta_data[4];
    float px = meta_data[2];
    float py = meta_data[5];
    float rx = (x - px) / fx;
    float ry = (y - py) / fy;

    int offset = ind * height * width * 3 + 3 * (y * width + x);
    float bb_distance = hough_data[offset];
    float bb_height = hough_data[offset + 1];
    float bb_width = hough_data[offset + 2];

    if (is_train)
    {
      int roi_index = atomicAdd(num_rois, 9);
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x - bb_width * (0.5 + scale);
      top_box[roi_index * 7 + 3] = y - bb_height * (0.5 + scale);
      top_box[roi_index * 7 + 4] = x + bb_width * (0.5 + scale);
      top_box[roi_index * 7 + 5] = y + bb_height * (0.5 + scale);
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      for (int i = 0; i < 9; i++)
      {
        top_pose[(roi_index + i) * 7 + 0] = 1;
        top_pose[(roi_index + i) * 7 + 1] = 0;
        top_pose[(roi_index + i) * 7 + 2] = 0;
        top_pose[(roi_index + i) * 7 + 3] = 0;
        top_pose[(roi_index + i) * 7 + 4] = rx * bb_distance;
        top_pose[(roi_index + i) * 7 + 5] = ry * bb_distance;
        top_pose[(roi_index + i) * 7 + 6] = bb_distance;

        if (num_gt == 0)
          top_domain[roi_index + i] = 1;
        else
          top_domain[roi_index + i] = 0;
      }

      // find the gt index
      int gt_ind = -1;
      for (int i = 0; i < num_gt; i++)
      {
        int gt_batch = int(gt[i * 13 + 0]);
        int gt_id = int(gt[i * 13 + 1]);
        if(cls == gt_id && batch_index == gt_batch)
        {
          gt_ind = i;
          break;
        }
      }

      if (gt_ind != -1)
      {
        float overlap = compute_box_overlap(cls, extents, meta_data, gt + gt_ind * 13, top_box + roi_index * 7 + 2);
        if (overlap > 0.2)
        {
          for (int i = 0; i < 9; i++)
          {
            top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 0] = gt[gt_ind * 13 + 6];
            top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 1] = gt[gt_ind * 13 + 7];
            top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 2] = gt[gt_ind * 13 + 8];
            top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 3] = gt[gt_ind * 13 + 9];

            top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 0] = 1;
            top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 1] = 1;
            top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 2] = 1;
            top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 3] = 1;
          }
        }
        // else
        //  printf("small overlap\n");
      }
      // else
      //  printf("no gt pose\n");

      // add jittering boxes
      float x1 = top_box[roi_index * 7 + 2];
      float y1 = top_box[roi_index * 7 + 3];
      float x2 = top_box[roi_index * 7 + 4];
      float y2 = top_box[roi_index * 7 + 5];
      float ww = x2 - x1;
      float hh = y2 - y1;

      // (-1, -1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (+1, -1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (-1, +1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (+1, +1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (0, -1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1;
      top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (-1, 0)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (0, +1)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1;
      top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      // (+1, 0)
      roi_index++;
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
      top_box[roi_index * 7 + 3] = y1;
      top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
      top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
      top_box[roi_index * 7 + 6] = hough_space[max_index];
    }
    else
    {
      int roi_index = atomicAdd(num_rois, 1);
      top_box[roi_index * 7 + 0] = batch_index;
      top_box[roi_index * 7 + 1] = cls;
      top_box[roi_index * 7 + 2] = x - bb_width * (0.5 + scale);
      top_box[roi_index * 7 + 3] = y - bb_height * (0.5 + scale);
      top_box[roi_index * 7 + 4] = x + bb_width * (0.5 + scale);
      top_box[roi_index * 7 + 5] = y + bb_height * (0.5 + scale);
      top_box[roi_index * 7 + 6] = hough_space[max_index];

      top_pose[roi_index * 7 + 0] = 1;
      top_pose[roi_index * 7 + 1] = 0;
      top_pose[roi_index * 7 + 2] = 0;
      top_pose[roi_index * 7 + 3] = 0;
      top_pose[roi_index * 7 + 4] = rx * bb_distance;
      top_pose[roi_index * 7 + 5] = ry * bb_distance;
      top_pose[roi_index * 7 + 6] = bb_distance;
    }
  }
}


void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes)
{
  int num = 1024;
  cudaMemset(top_box, 0, num * 7 * sizeof(float));
  cudaMemset(top_pose, 0, num * 7 * sizeof(float));
  cudaMemset(top_target, 0, num * 4 *num_classes * sizeof(float));
  cudaMemset(top_weight, 0, num * 4 * num_classes * sizeof(float));
  cudaMemset(top_domain, 0, num * sizeof(int));
  cudaMemset(num_rois, 0, sizeof(int));
}


void copy_num_rois(int* num_rois, int* num_rois_device)
{
  cudaMemcpy(num_rois, num_rois_device, sizeof(int), cudaMemcpyDeviceToHost);
}


void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
  float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois)
{
  cudaMemcpy(top_box_final, top_box, num_rois * 7 * sizeof(float), cudaMemcpyDeviceToDevice);
  cudaMemcpy(top_pose_final, top_pose, num_rois * 7 * sizeof(float), cudaMemcpyDeviceToDevice);
  cudaMemcpy(top_target_final, top_target, num_rois * 4 * num_classes * sizeof(float), cudaMemcpyDeviceToDevice);
  cudaMemcpy(top_weight_final, top_weight, num_rois * 4 * num_classes * sizeof(float), cudaMemcpyDeviceToDevice);
  cudaMemcpy(top_domain_final, top_domain, num_rois * sizeof(int), cudaMemcpyDeviceToDevice);
}


void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes)
{
  cudaMemset(top_label, 0, batch_size * height * width * sizeof(float));
  cudaMemset(top_vertex, 0, batch_size * height * width * 3 * num_classes * sizeof(float));
}


void HoughVotingLaucher(OpKernelContext* context,
    const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
    const int batch_index, const int height, const int width, const int num_classes, const int num_gt, 
    const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels, 
    float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d)
{
  const int kThreadsPerBlock = 1024;
  int output_size;
  cudaError_t err;

  // step 1: compute a label index array for each class
  int dims[2];
  dims[0] = num_classes;
  dims[1] = height * width;
  TensorShape output_shape_arrays;
  TensorShapeUtils::MakeShape(dims, 2, &output_shape_arrays);
  Tensor arrays_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_arrays, &arrays_tensor));
  int* arrays = arrays_tensor.flat<int>().data();

  TensorShape output_shape_array_sizes;
  TensorShapeUtils::MakeShape(&num_classes, 1, &output_shape_array_sizes);
  Tensor array_sizes_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_array_sizes, &array_sizes_tensor));
  int* array_sizes = array_sizes_tensor.flat<int>().data();
  cudaMemset(array_sizes, 0, num_classes * sizeof(int));

  output_size = height * width;
  compute_arrays_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                       kThreadsPerBlock, 0, d.stream()>>>(
      output_size, labelmap, arrays, array_sizes, height, width);
  cudaThreadSynchronize();

  // compute class indexes
  int* array_sizes_host = (int*)malloc(num_classes * sizeof(int));
  int* class_indexes_host = (int*)malloc(num_classes * sizeof(int));
  cudaMemcpy(array_sizes_host, array_sizes, num_classes * sizeof(int), cudaMemcpyDeviceToHost);
  int count = 0;
  for (int c = 1; c < num_classes; c++)
  {
    if (array_sizes_host[c] > labelThreshold)
    {
      class_indexes_host[count] = c;
      count++;
    }
    // else
    //  printf("class %d with only pixels %d\n", c, array_sizes_host[c]);
  }

  if (count == 0)
  {
    free(array_sizes_host);
    free(class_indexes_host);
    return;
  }

  TensorShape output_shape_class_indexes;
  TensorShapeUtils::MakeShape(&count, 1, &output_shape_class_indexes);
  Tensor class_indexes_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_class_indexes, &class_indexes_tensor));
  int* class_indexes = class_indexes_tensor.flat<int>().data();
  cudaMemcpy(class_indexes, class_indexes_host, count * sizeof(int), cudaMemcpyHostToDevice);

  err = cudaGetLastError();
  if(cudaSuccess != err)
  {
    fprintf( stderr, "cudaCheckError() failed compute label index: %s\n", cudaGetErrorString( err ) );
    exit( -1 );
  }

  // step 2: compute the hough space
  int hdims[4];
  hdims[0] = count;
  hdims[1] = height;
  hdims[2] = width;
  hdims[3] = 1;
  TensorShape output_shape_hough_space;
  TensorShapeUtils::MakeShape(hdims, 4, &output_shape_hough_space);
  Tensor hough_space_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_hough_space, &hough_space_tensor));
  float* hough_space = hough_space_tensor.flat<float>().data(); 
  if (cudaMemset(hough_space, 0, count * height * width * sizeof(float)) != cudaSuccess)
    fprintf(stderr, "reset error\n");

  hdims[3] = 3;
  TensorShape output_shape_hough_data;
  TensorShapeUtils::MakeShape(hdims, 4, &output_shape_hough_data);
  Tensor hough_data_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_hough_data, &hough_data_tensor));
  float* hough_data = hough_data_tensor.flat<float>().data(); 
  if (cudaMemset(hough_data, 0, count * height * width * 3 * sizeof(float)) != cudaSuccess)
    fprintf(stderr, "reset error\n");

  output_size = count * height * width;
  compute_hough_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                       kThreadsPerBlock, 0, d.stream()>>>(
      output_size, hough_space, hough_data, labelmap, vertmap, extents, meta_data,
      arrays, array_sizes, class_indexes, height, width, num_classes, count, inlierThreshold, skip_pixels);
  cudaThreadSynchronize();

  err = cudaGetLastError();
  if(cudaSuccess != err)
  {
    fprintf( stderr, "cudaCheckError() failed compute hough space: %s\n", cudaGetErrorString( err ) );
    exit( -1 );
  }

  // step 3: find the maximum in hough space
  int dim = 1;
  TensorShape output_shape_num_max;
  TensorShapeUtils::MakeShape(&dim, 1, &output_shape_num_max);
  Tensor num_max_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_num_max, &num_max_tensor));
  int* num_max = num_max_tensor.flat<int>().data();
  if (cudaMemset(num_max, 0, sizeof(int)) != cudaSuccess)
    fprintf(stderr, "reset error\n");

  dim = 1024;
  TensorShape output_shape_max_indexes;
  TensorShapeUtils::MakeShape(&dim, 1, &output_shape_max_indexes);
  Tensor max_indexes_tensor;
  OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_max_indexes, &max_indexes_tensor));
  int* max_indexes = max_indexes_tensor.flat<int>().data(); 
  if (cudaMemset(max_indexes, 0, dim * sizeof(int)) != cudaSuccess)
    fprintf(stderr, "reset error\n");

  if (votingThreshold > 0)
  {
    output_size = count * height * width;
    compute_max_indexes_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                       kThreadsPerBlock, 0, d.stream()>>>(
      output_size, max_indexes, num_max, hough_space, height, width, votingThreshold);
    cudaThreadSynchronize();
  }
  else
  {
    int* max_indexes_host = (int*)malloc(count * sizeof(int));
    memset(max_indexes_host, 0, count * sizeof(int));
    for (int i = 0; i < count; i++)
    {
      float *hmax = thrust::max_element(thrust::device, hough_space + i * height * width, hough_space + (i+1) * height * width);
      max_indexes_host[i] = hmax - hough_space;
    }
    cudaMemcpy(num_max, &count, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(max_indexes, max_indexes_host, count * sizeof(int), cudaMemcpyHostToDevice);
    free(max_indexes_host);
  }
  err = cudaGetLastError();
  if(cudaSuccess != err)
  {
    fprintf( stderr, "cudaCheckError() failed compute maximum: %s\n", cudaGetErrorString( err ) );
    exit( -1 );
  }

  // step 4: compute outputs
  int num_max_host;
  cudaMemcpy(&num_max_host, num_max, sizeof(int), cudaMemcpyDeviceToHost);
  output_size = num_max_host;
  compute_rois_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                       kThreadsPerBlock, 0, d.stream()>>>(
      output_size, top_box, top_pose, top_target, top_weight, top_domain,
      extents, meta_data, gt, hough_space, hough_data, max_indexes, class_indexes,
      is_train, batch_index, height, width, num_classes, num_gt, num_rois);
  cudaThreadSynchronize();
  
  // clean up
  free(array_sizes_host);
  free(class_indexes_host);

  err = cudaGetLastError();
  if(cudaSuccess != err)
  {
    fprintf( stderr, "cudaCheckError() failed compute outputs: %s\n", cudaGetErrorString( err ) );
    exit( -1 );
  }
}

// }  // namespace tensorflow

#endif  // GOOGLE_CUDA

hough_voting_gpu_op.h

#if !GOOGLE_CUDA
#error This file must only be included when building with Cuda support
#endif

#ifndef TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_
#define TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_

#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

void HoughVotingLaucher(OpKernelContext* context,
    const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
    const int batch_index, const int height, const int width, const int num_classes, const int num_gt, 
    const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels,
    float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d);

void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes);

void copy_num_rois(int* num_rois, int* num_rois_device);

void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
  float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois);

void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes);

}  // namespace tensorflow

#endif  // TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_

The code is compiled with the following make.sh:

TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')

CUDA_PATH=/usr/local/cuda
nvcc -std=c++11 -c -o hough_voting_gpu_op.cu.o hough_voting_gpu_op.cu.cc \
	-I $TF_INC -I$TF_INC/external/nsync/public -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_61

g++ -std=c++11 -shared  -D_GLIBCXX_USE_CXX11_ABI=0 -o hough_voting_gpu.so hough_voting_gpu_op.cc \
	hough_voting_gpu_op.cu.o -I $TF_INC -I$TF_INC/external/nsync/public -fPIC -lcudart -lcublas -lopencv_imgproc -lopencv_calib3d -lopencv_core -L $CUDA_PATH/lib64 -L$TF_LIB -ltensorflow_framework
echo 'hough_voting_gpu_layer'

The Python side consists of the following four files.

hough_voting_gpu_op.py

import tensorflow as tf
import os.path as osp

filename = osp.join(osp.dirname(__file__), 'hough_voting_gpu.so')
_hough_voting_gpu_module = tf.load_op_library(filename)
hough_voting_gpu = _hough_voting_gpu_module.houghvotinggpu
hough_voting_gpu_grad = _hough_voting_gpu_module.houghvotinggpu_grad

hough_voting_gpu_op_grad.py

import tensorflow as tf
from tensorflow.python.framework import ops
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
import hough_voting_gpu_op

@ops.RegisterShape("Houghvotinggpu")
def _hough_voting_gpu_shape(op):

  dims_vertex = op.inputs[1].get_shape().as_list()
  num_classes = dims_vertex[3] // 3

  output_shape_0 = tf.TensorShape([None, 7])
  output_shape_1 = tf.TensorShape([None, 7])
  output_shape_2 = tf.TensorShape([None, 4 * num_classes])
  output_shape_3 = tf.TensorShape([None, 4 * num_classes])
  output_shape_4 = tf.TensorShape([None])
  return [output_shape_0, output_shape_1, output_shape_2, output_shape_3, output_shape_4]

@ops.RegisterGradient("Houghvotinggpu")
def _hough_voting_gpu_grad(op, grad, tmp, tmp1, tmp2, _):
  """The gradients for `Houghvotinggpu`.
  Args:
    op: The `Houghvotinggpu` `Operation` that we are differentiating, which we can use
      to find the inputs and outputs of the original op.
    grad: Gradient with respect to the first output of the `Houghvotinggpu` op.
  Returns:
    Gradients with respect to the inputs of `Houghvotinggpu`.
  """

  bottom_prob = op.inputs[0]
  bottom_vertex = op.inputs[1]

  # compute gradient
  data_grad_prob, data_grad_vertex = hough_voting_gpu_op.hough_voting_gpu_grad(bottom_prob, bottom_vertex, grad)

  return [data_grad_prob, data_grad_vertex, None, None, None]  # one gradient per input; only label and vertex receive gradients

__init__.py (empty)

test.py

from IPython import embed
import tensorflow as tf
import numpy as np
import hough_voting_gpu_op
import hough_voting_gpu_op_grad
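
test.py as posted stops after the imports. A minimal smoke-test sketch that could be appended to it is shown below; the image size, intrinsics, and attribute values are placeholder assumptions, chosen only to satisfy the shapes the kernels expect (label (batch, H, W) int32, vertex (batch, H, W, 3*num_classes), extents (num_classes, 3), meta_data (batch, 1, 1, 48), gt (num_gt, 13)), and are not from the original post.

# Minimal smoke test (assumed shapes and dummy values; not from the original post).
num_classes = 3
batch, H, W = 1, 480, 640

label = tf.zeros([batch, H, W], dtype=tf.int32)
vertex = tf.zeros([batch, H, W, 3 * num_classes], dtype=tf.float32)
extents = tf.ones([num_classes, 3], dtype=tf.float32)

# meta_data carries per-image metadata; only the intrinsics are read here.
meta = np.zeros((batch, 1, 1, 48), dtype=np.float32)
meta[:, 0, 0, 0] = 500.0    # fx
meta[:, 0, 0, 4] = 500.0    # fy
meta[:, 0, 0, 2] = W / 2.0  # px
meta[:, 0, 0, 5] = H / 2.0  # py

gt = np.zeros((0, 13), dtype=np.float32)  # no ground-truth poses

top_box, top_pose, top_target, top_weight, top_domain = \
    hough_voting_gpu_op.hough_voting_gpu(
        label, vertex, extents, tf.constant(meta), tf.constant(gt),
        is_train=0, threshold_vote=300, skip_pixels=10)

with tf.Session() as sess:
    box, pose, domain = sess.run([top_box, top_pose, top_domain])
    # with an all-background label map the op returns a single dummy ROI
    print(box.shape, pose.shape, domain.shape)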

