







pip install mesh_renderer

python API


def look_at(eye, center, world_up):
  """Computes camera viewing matrices.

  Functionality mimics gluLookAt (third_party/GL/glu/include/GLU/glu.h).

  Args:
    eye: 2-D float32 tensor with shape [batch_size, 3] containing the XYZ world
        space position of the camera (the camera's 3-D position in world
        coordinates).
    center: 2-D float32 tensor with shape [batch_size, 3] containing a position
        along the center of the camera's gaze (a point straight ahead of the
        camera).
    world_up: 2-D float32 tensor with shape [batch_size, 3] specifying the
        world's up direction; the output camera will have no tilt with respect
        to this direction.

  Returns:
    A [batch_size, 4, 4] float tensor containing a right-handed camera
    extrinsics matrix that maps points from world space to points in eye space.
  """
  # NOTE(review): the construction of R and T is not shown in this excerpt.
  # Presumably R is the rotation built from eye/center/world_up and T is the
  # translation by -eye -- confirm against the full implementation.
  camera_matrices = tf.matmul(R, T)
  return camera_matrices


R=\begin{bmatrix} ToSide_x & ToSide_y & ToSide_z& 0\\ CamUp_x& CamUp_y & CamUp_z & 0\\ -forward_x& -forward_y & -forward_z & 0\\ 0& 0 & 0 & 1 \end{bmatrix}

T=\begin{bmatrix} 1 & 0 & 0 & -eye_x\\ 0 & 1 & 0 & -eye_y\\ 0 & 0 & 1 & -eye_z\\ 0 & 0 & 0 & 1 \end{bmatrix}


def perspective(aspect_ratio, fov_y, near_clip, far_clip):
  """Computes perspective transformation matrices.

  Functionality mimics gluPerspective (third_party/GL/glu/include/GLU/glu.h).

  Args:
    aspect_ratio: float value specifying the image aspect ratio (width/height).
    fov_y: 1-D float32 Tensor with shape [batch_size] specifying output vertical
        field of views in degrees.
    near_clip: 1-D float32 Tensor with shape [batch_size] specifying near
        clipping plane distance.
    far_clip: 1-D float32 Tensor with shape [batch_size] specifying far clipping
        plane distance.

  Returns:
    A [batch_size, 4, 4] float tensor that maps from right-handed points in eye
    space to left-handed points in clip space.
  """
  # NOTE(review): the computation of perspective_transform is not shown in this
  # excerpt -- confirm against the full implementation.
  return perspective_transform


T=\begin{bmatrix} focal_x/2*imageWidth & 0 & 0 & 0\\ 0 & focal_y/2*imageHeight & 0 & 0\\ 0 & 0 & -(farClip+nearClip)/depthRange & -2farClip*nearClip/depthRange\\ 0 & 0 & -1 & 0 \end{bmatrix}

def transform_homogeneous(matrices, vertices):
  """Applies batched 4x4 homogeneous matrix transformations to 3-D vertices.

  The vertices are input and output as row-major, but are interpreted as
  column vectors multiplied on the right-hand side of the matrices. More
  explicitly, this function computes (MV^T)^T.
  Vertices are assumed to be xyz, and are extended to xyzw with w=1.

  Args:
    matrices: a [batch_size, 4, 4] tensor of matrices, e.g. the transforms
        that take objects from world space into the camera's clip space.
    vertices: a [batch_size, N, 3] tensor of xyz vertices.

  Returns:
    a [batch_size, N, 4] tensor of xyzw vertices.

  Raises:
    ValueError: if matrices or vertices have the wrong number of dimensions.
  """
  # Rank checks run first so that a missing batch dimension fails with a clear
  # message instead of an opaque shape error inside concat/matmul.
  if len(matrices.shape) != 3:
    raise ValueError(
        'matrices must have 3 dimensions (missing batch dimension?)')
  if len(vertices.shape) != 3:
    raise ValueError(
        'vertices must have 3 dimensions (missing batch dimension?)')
  # Append w=1 to every vertex: [batch_size, N, 3] -> [batch_size, N, 4].
  homogeneous_coord = tf.ones(
      [tf.shape(vertices)[0], tf.shape(vertices)[1], 1], dtype=tf.float32)
  vertices_homogeneous = tf.concat([vertices, homogeneous_coord], 2)
  # V * M^T is the row-major form of (M * V^T)^T.
  return tf.matmul(vertices_homogeneous, matrices, transpose_b=True)




def rasterize(world_space_vertices, attributes, triangles, camera_matrices,
              image_width, image_height, background_value):
  """Rasterizes a mesh and computes interpolated vertex attributes.

  Applies projection matrices and then calls rasterize_clip_space().

  Args:
    world_space_vertices: 3-D float32 tensor of xyz positions with shape
      [batch_size, vertex_count, 3].
    attributes: 3-D float32 tensor with shape [batch_size, vertex_count,
      attribute_count]. Each vertex attribute is interpolated across the
      triangle using barycentric interpolation.
    triangles: 2-D int32 tensor with shape [triangle_count, 3]. Each triplet
      should contain vertex indices describing a triangle such that the
      triangle's normal points toward the viewer if the forward order of the
      triplet defines a clockwise winding of the vertices. Gradients with
      respect to this tensor are not available.
    camera_matrices: 3-D float tensor with shape [batch_size, 4, 4] containing
      model-view-perspective projection matrices.
    image_width: int specifying desired output image width in pixels.
    image_height: int specifying desired output image height in pixels.
    background_value: a 1-D float32 tensor with shape [attribute_count]. Pixels
      that lie outside all triangles take this value.

  Returns:
    A 4-D float32 tensor with shape [batch_size, image_height, image_width,
    attribute_count], containing the interpolated vertex attributes at each
    pixel.

  Raises:
    ValueError: An invalid argument to the method is detected.
  """
  # Project world-space xyz into clip-space xyzw before rasterizing.
  clip_space_vertices = camera_utils.transform_homogeneous(
      camera_matrices, world_space_vertices)
  return rasterize_clip_space(clip_space_vertices, attributes, triangles,
                              image_width, image_height, background_value)


def rasterize_clip_space(clip_space_vertices, attributes, triangles,
                         image_width, image_height, background_value):
  """Rasterizes the input mesh expressed in clip-space (xyzw) coordinates.

  Interpolates vertex attributes using perspective-correct interpolation and
  clips triangles that lie outside the viewing frustum.

  Args:
    clip_space_vertices: 3-D float32 tensor of homogeneous vertices (xyzw) with
      shape [batch_size, vertex_count, 4].
    attributes: 3-D float32 tensor with shape [batch_size, vertex_count,
      attribute_count]. Each vertex attribute is interpolated across the
      triangle using barycentric interpolation.
    triangles: 2-D int32 tensor with shape [triangle_count, 3]. Each triplet
      should contain vertex indices describing a triangle such that the
      triangle's normal points toward the viewer if the forward order of the
      triplet defines a clockwise winding of the vertices. Gradients with
      respect to this tensor are not available.
    image_width: int specifying desired output image width in pixels.
    image_height: int specifying desired output image height in pixels.
    background_value: a 1-D float32 tensor with shape [attribute_count]. Pixels
      that lie outside all triangles take this value.

  Returns:
    A 4-D float32 tensor with shape [batch_size, image_height, image_width,
    attribute_count], containing the interpolated vertex attributes at
    each pixel.

  Raises:
    ValueError: An invalid argument to the method is detected.
  """
  # NOTE(review): the implementation is not shown in this excerpt; only the
  # documented contract is reproduced here.


c++ API



namespace tf_mesh_renderer {
// Copied from tensorflow/core/platform/default/integral_types.h
// to avoid making this file depend on tensorflow.
typedef int int32;
typedef long long int64;
// Computes the triangle id, barycentric coordinates, and z-buffer at each pixel
// in the image.
//
// vertices: A flattened 2D array with 4*vertex_count elements.
//     Each contiguous group of four floats is the XYZW location of the vertex
//     with that group's id. The coordinates are assumed to be OpenGL-style
//     clip-space (i.e., post-projection, pre-divide), where X points right,
//     Y points up, Z points away.
// triangles: A flattened 2D array with 3*triangle_count elements.
//     Each contiguous triplet is the three vertex ids indexing into vertices
//     describing one triangle with clockwise winding.
// triangle_count: The number of triangles stored in the array triangles.
// image_width: The output image width in pixels.
// image_height: The output image height in pixels.
// triangle_ids: A flattened 2D array with image_height*image_width elements.
//     At return, each pixel contains a triangle id in the range
//     [0, triangle_count). The id value is also 0 if there is no triangle
//     at the pixel. The barycentric_coordinates must be checked to
//     distinguish the two cases.
// barycentric_coordinates: A flattened 3D array with
//     image_height*image_width*3 elements. At return, contains the triplet of
//     barycentric coordinates at each pixel in the same vertex ordering as
//     triangles. If no triangle is present, all coordinates are 0.
// z_buffer: A flattened 2D array with image_height*image_width elements. At
//     return, contains the normalized device Z coordinates of the rendered
//     triangles.
//
// NOTE(review): the "0 if no triangle" guarantees above presumably rely on the
// caller pre-initializing the output buffers (and z_buffer to the far value)
// before the call -- confirm against the calling code.
void RasterizeTrianglesImpl(const float* vertices, const int32* triangles,
                            int32 triangle_count, int32 image_width,
                            int32 image_height, int32* triangle_ids,
                            float* barycentric_coordinates, float* z_buffer);
}  // namespace tf_mesh_renderer


#include <algorithm>
#include <cmath>

#include "rasterize_triangles_impl.h"

namespace tf_mesh_renderer {
namespace {
// Takes the minimum of a, b, and c, rounds down, and converts to an integer
// in the range [low, high].
inline int ClampedIntegerMin(float a, float b, float c, int low, int high) {
  return std::min(
      std::max(static_cast<int>(std::floor(std::min(std::min(a, b), c))), low),
      high);
}

// Takes the maximum of a, b, and c, rounds up, and converts to an integer
// in the range [low, high].
inline int ClampedIntegerMax(float a, float b, float c, int low, int high) {
  return std::min(
      std::max(static_cast<int>(std::ceil(std::max(std::max(a, b), c))), low),
      high);
}

// Computes a 3x3 matrix inverse without dividing by the determinant.
// Instead, makes an unnormalized matrix inverse with the correct sign
// by flipping the sign of the matrix if the determinant is negative.
// By leaving out determinant division, the rows of M^-1 only depend on two out
// of three of the columns of M; i.e., the first row of M^-1 only depends on the
// second and third columns of M, the second only depends on the first and
// third, etc. This means we can compute edge functions for two neighboring
// triangles independently and produce exactly the same numerical result up to
// the sign. This in turn means we can avoid cracks in rasterization without
// using fixed-point arithmetic.
// See http://mathworld.wolfram.com/MatrixInverse.html
//
// a11..a33: entries of M, where aRC is the entry at row R, column C.
// m_inv: output; the 9 entries of the unnormalized inverse in row-major order.
void ComputeUnnormalizedMatrixInverse(const float a11, const float a12,
                                      const float a13, const float a21,
                                      const float a22, const float a23,
                                      const float a31, const float a32,
                                      const float a33, float m_inv[9]) {
  // Adjugate of M (transposed cofactor matrix).
  m_inv[0] = a22 * a33 - a32 * a23;
  m_inv[1] = a13 * a32 - a33 * a12;
  m_inv[2] = a12 * a23 - a22 * a13;
  m_inv[3] = a23 * a31 - a33 * a21;
  m_inv[4] = a11 * a33 - a31 * a13;
  m_inv[5] = a13 * a21 - a23 * a11;
  m_inv[6] = a21 * a32 - a31 * a22;
  m_inv[7] = a12 * a31 - a32 * a11;
  m_inv[8] = a11 * a22 - a21 * a12;

  // The first column of the unnormalized M^-1 contains intermediate values for
  // det(M).
  const float det = a11 * m_inv[0] + a12 * m_inv[3] + a13 * m_inv[6];

  // Transfer the sign of the determinant.
  if (det < 0.0f) {
    for (int i = 0; i < 9; ++i) {
      m_inv[i] = -m_inv[i];
    }
  }
}

// Computes the edge functions from M^-1 as described by Olano and Greer,
// "Triangle Scan Conversion using 2D Homogeneous Coordinates."
// This function combines equations (3) and (4). It first computes
// [a b c] = u_i * M^-1, where u_0 = [1 0 0], u_1 = [0 1 0], etc.,
// then computes edge_i = aX + bY + c
//
// px, py: the point at which the edge functions are evaluated.
// m_inv: the 9 entries of M^-1 in row-major order; row i gives [a b c] for
//     edge i.
// values: output; the three edge function values at (px, py).
void ComputeEdgeFunctions(const float px, const float py, const float m_inv[9],
                          float values[3]) {
  for (int i = 0; i < 3; ++i) {
    const float a = m_inv[3 * i + 0];
    const float b = m_inv[3 * i + 1];
    const float c = m_inv[3 * i + 2];
    values[i] = a * px + b * py + c;
  }
}

// Determines whether the point p lies inside a front-facing triangle.
// Counts pixels exactly on an edge as inside the triangle, as long as the
// triangle is not degenerate. Degenerate (zero-area) triangles always fail the
// inside test.
//
// edge_values: the three edge function values evaluated at p.
bool PixelIsInsideTriangle(const float edge_values[3]) {
  // Check that the edge values are all non-negative and that at least one is
  // positive (triangle is non-degenerate).
  return (edge_values[0] >= 0 && edge_values[1] >= 0 && edge_values[2] >= 0) &&
         (edge_values[0] > 0 || edge_values[1] > 0 || edge_values[2] > 0);
}
}  // namespace

// Rasterizes clip-space triangles into per-pixel triangle ids, barycentric
// coordinates, and a z-buffer. Triangles are processed in order; a pixel is
// overwritten only when the new fragment passes the inside test, lies within
// NDC z in [-1, 1], and is not farther than the value already in z_buffer.
// Pixels that no triangle covers are left untouched.
void RasterizeTrianglesImpl(const float* vertices, const int32* triangles,
                            int32 triangle_count, int32 image_width,
                            int32 image_height, int32* triangle_ids,
                            float* barycentric_coordinates, float* z_buffer) {
  const float half_image_width = 0.5 * image_width;
  const float half_image_height = 0.5 * image_height;
  float unnormalized_matrix_inverse[9];
  float b_over_w[3];

  for (int32 triangle_id = 0; triangle_id < triangle_count; ++triangle_id) {
    // Offset of each vertex's X component in the flattened XYZW array.
    const int32 v0_x_id = 4 * triangles[3 * triangle_id];
    const int32 v1_x_id = 4 * triangles[3 * triangle_id + 1];
    const int32 v2_x_id = 4 * triangles[3 * triangle_id + 2];

    // Clip-space W of each vertex (stored at offset +3 from X).
    const float v0w = vertices[v0_x_id + 3];
    const float v1w = vertices[v1_x_id + 3];
    const float v2w = vertices[v2_x_id + 3];
    // Early exit: if all w < 0, triangle is entirely behind the eye.
    if (v0w < 0 && v1w < 0 && v2w < 0) {
      continue;
    }

    const float v0x = vertices[v0_x_id];
    const float v0y = vertices[v0_x_id + 1];
    const float v1x = vertices[v1_x_id];
    const float v1y = vertices[v1_x_id + 1];
    const float v2x = vertices[v2_x_id];
    const float v2y = vertices[v2_x_id + 1];

    ComputeUnnormalizedMatrixInverse(v0x, v1x, v2x, v0y, v1y, v2y, v0w, v1w,
                                     v2w, unnormalized_matrix_inverse);

    // Initialize the bounding box to the entire screen.
    int left = 0, right = image_width, bottom = 0, top = image_height;
    // If the triangle is entirely inside the screen, project the vertices to
    // pixel coordinates and find the triangle bounding box enlarged to the
    // nearest integer and clamped to the image boundaries.
    if (v0w > 0 && v1w > 0 && v2w > 0) {
      // Perspective divide, then map x and y from [-1, 1] to pixel units.
      const float p0x = (v0x / v0w + 1.0) * half_image_width;
      const float p1x = (v1x / v1w + 1.0) * half_image_width;
      const float p2x = (v2x / v2w + 1.0) * half_image_width;
      const float p0y = (v0y / v0w + 1.0) * half_image_height;
      const float p1y = (v1y / v1w + 1.0) * half_image_height;
      const float p2y = (v2y / v2w + 1.0) * half_image_height;
      left = ClampedIntegerMin(p0x, p1x, p2x, 0, image_width);
      right = ClampedIntegerMax(p0x, p1x, p2x, 0, image_width);
      bottom = ClampedIntegerMin(p0y, p1y, p2y, 0, image_height);
      top = ClampedIntegerMax(p0y, p1y, p2y, 0, image_height);
    }

    // Iterate over each pixel in the bounding box.
    for (int iy = bottom; iy < top; ++iy) {
      for (int ix = left; ix < right; ++ix) {
        // Pixel center mapped back to [-1, 1] coordinates.
        const float px = ((ix + 0.5) / half_image_width) - 1.0;
        const float py = ((iy + 0.5) / half_image_height) - 1.0;
        const int pixel_idx = iy * image_width + ix;

        ComputeEdgeFunctions(px, py, unnormalized_matrix_inverse, b_over_w);
        if (!PixelIsInsideTriangle(b_over_w)) {
          continue;
        }

        // Normalize the edge values so the barycentric coordinates sum to 1.
        const float one_over_w = b_over_w[0] + b_over_w[1] + b_over_w[2];
        const float b0 = b_over_w[0] / one_over_w;
        const float b1 = b_over_w[1] / one_over_w;
        const float b2 = b_over_w[2] / one_over_w;

        const float v0z = vertices[v0_x_id + 2];
        const float v1z = vertices[v1_x_id + 2];
        const float v2z = vertices[v2_x_id + 2];
        // Since we computed an unnormalized w above, we need to recompute
        // a properly scaled clip-space w value and then divide clip-space z
        // by that.
        const float clip_z = b0 * v0z + b1 * v1z + b2 * v2z;
        const float clip_w = b0 * v0w + b1 * v1w + b2 * v2w;
        const float z = clip_z / clip_w;

        // Skip the pixel if it is farther than the current z-buffer pixel or
        // beyond the near or far clipping plane.
        if (z < -1.0 || z > 1.0 || z > z_buffer[pixel_idx]) {
          continue;
        }

        triangle_ids[pixel_idx] = triangle_id;
        z_buffer[pixel_idx] = z;
        barycentric_coordinates[3 * pixel_idx + 0] = b0;
        barycentric_coordinates[3 * pixel_idx + 1] = b1;
        barycentric_coordinates[3 * pixel_idx + 2] = b2;
      }
    }
  }
}

}  // namespace tf_mesh_renderer




在openGL中可以调用gluLookAt(eye, center, up)函数,相机位置在视角eye,up在相机同一个平面上,视线指向center,这与mesh_renderer一致。

相机位置C=eye(Cx, Cy, Cz)

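镜头朝向的单位向量N=(center - eye)/|center - eye|,分量为(Nx, Ny, Nz)

相机向上的向量V由up给出;注意up是世界坐标系中向上的方向向量(而不是一个点),因此不需要减去eye

相机坐标系x轴方向U= N x V(并归一化为单位向量);由于up不一定与N垂直,最后再用U x N重新计算相机真正的向上方向V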



现在的目标就是将世界坐标系下的点p(objx, objy, objz)转化为相机坐标系下的坐标(obju, objv, objn)。p在uvn坐标系下的坐标,就是从相机位置指向p点的向量在u、v、n三个轴上的投影分量;而向量在某个单位向量上的投影,可以通过点乘求取。也就是说,先求出从相机位置eye(Cx, Cy, Cz)指向p点的向量(p - C),再与单位向量u、v、n分别点乘即可。


(obju, objv, objn, 1) = (objx, objy, objz, 1) * \begin{bmatrix} 1 & 0 & 0 & 0\\ 0& 1& 0 & 0\\ 0& 0& 1 & 0\\ -Cx& -Cy & -Cz & 1 \end{bmatrix}*\begin{bmatrix} ux & vx & nx & 0\\ uy& vy& ny & 0\\ uz& vz& nz & 0\\ 0& 0 & 0 & 1 \end{bmatrix}




Unsupervised Training for 3D Morphable Model Regression. Kyle Genova, Forrester Cole, Aaron Maschinot, Aaron Sarna, Daniel Vlasic, and William T. Freeman. CVPR 2018, pp. 8377-8386.

Triangle Scan Conversion using 2D Homogeneous Coordinates. Marc Olano and Trey Greer. HWWS 1997.





