
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# A PyTorch implementation of Neural 3D Mesh Renderer
#
# Copyright (c) 2017 Hiroharu Kato
# Copyright (c) 2018 Nikos Kolotouros
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from typing import Optional

import torch
import torch.nn.functional as F

import kaolin as kal
from .DifferentiableRenderer import DifferentiableRenderer
from .Lighting import compute_ambient_light
from .Lighting import compute_directional_light


class NeuralMeshRenderer(DifferentiableRenderer):
    r"""A class implementing the Neural Mesh Renderer from the following
    CVPR 2018 paper:

        Neural 3D Mesh Renderer
        Hiroharu Kato, Yoshitaka Ushiku, and Tatsuya Harada
        Link: https://arxiv.org/abs/1711.07566

    """

    def __init__(self,
                 image_size: int = 256,
                 anti_aliasing: bool = True,
                 bg_color: torch.Tensor = torch.zeros(3),
                 fill_back: bool = True,
                 camera_mode: str = 'projection',
                 K=None,
                 rmat=None,
                 tvec=None,
                 perspective_distort: bool = True,
                 viewing_angle: float = 30.,
                 camera_direction: torch.Tensor = torch.FloatTensor([0, 0, 1]),
                 near: float = 0.1,
                 far: float = 100,
                 light_intensity_ambient: float = 0.5,
                 light_intensity_directional: float = 0.5,
                 light_color_ambient: torch.Tensor = torch.ones(3),
                 light_color_directional: torch.Tensor = torch.ones(3),
                 light_direction: torch.Tensor = torch.FloatTensor([0, 1, 0]),
                 device: str = 'cpu'):
        r"""Initialize the NeuralMeshRenderer object.

        NOTE: NeuralMeshRenderer works only in GPU mode!

        Args:
            image_size (int): Size of the (square) image to be rendered
                (default: 256).
            anti_aliasing (bool): Whether or not to perform anti-aliasing
                (default: True).
            bg_color (torch.Tensor): Background color of the rendered image
                (shape: :math:`3`, default: :math:`\left[0, 0, 0\right]`).
            fill_back (bool): Whether or not to also fill color into the back
                side of each triangle (sometimes helps, when the triangles
                in the mesh are not consistently oriented) (default: True).
            camera_mode (str): Choose from among 'projection', 'look', and
                'look_at'. In the 'projection' mode, the camera is at the
                origin, and its optical axis is aligned with the positive
                Z-axis. In the 'look_at' mode, the object (not the camera)
                is placed at the origin. The camera "looks at" the object
                from a predefined "eye" location, which is computed from
                the `viewing_angle` (another input to this function). In
                the 'look' mode, only the direction in which the camera
                needs to look is specified. It does not necessarily look
                towards the origin, as it allows the specification of a
                custom "upwards" direction (default: 'projection').
            K (torch.Tensor): Camera intrinsics matrix. Note that, unlike
                standard notation, K here is a 4 x 4 matrix (with the last
                row and last column drawn from the 4 x 4 identity matrix)
                (default: None).
            rmat (torch.Tensor): Rotation matrix (again, 4 x 4, as opposed
                to the usual 3 x 3 convention) (default: None).
            tvec (torch.Tensor): Translation vector (3 x 1). Note that the
                (negative of the) translation is applied before rotation,
                to be consistent with the projective geometry convention of
                transforming a 3D point X as torch.matmul(R.transpose(),
                X - t) (default: None).
            perspective_distort (bool): Whether or not to perform perspective
                distortion (to simulate field-of-view based distortion
                effects) (default: True).
            viewing_angle (float): Angle at which the object is to be viewed
                (assumed to be in degrees!) (default: 30.).
            camera_direction (torch.Tensor): Direction in which the camera
                is facing (used only in the 'look' and 'look_at' modes)
                (default: :math:`[0, 0, 1]`).
            near (float): Near clipping plane (for depth values)
                (default: 0.1).
            far (float): Far clipping plane (for depth values)
                (default: 100).
            light_intensity_ambient (float): Intensity of the ambient light
                (in the range :math:`\left[0, 1\right]`) (default: 0.5).
            light_intensity_directional (float): Intensity of the directional
                light (in the range :math:`\left[0, 1\right]`)
                (default: 0.5).
            light_color_ambient (torch.Tensor): Color of the ambient light
                (default: :math:`\left[1, 1, 1\right]`).
            light_color_directional (torch.Tensor): Color of the directional
                light (default: :math:`\left[1, 1, 1\right]`).
            light_direction (torch.Tensor): Light direction, for the
                directional light (default: :math:`[0, 1, 0]`).
            device (str): Device on which all tensors are stored. NOTE:
                Although the default device is set to 'cpu', at the moment,
                rendering will work only if the device is CUDA enabled
                (eg. 'cuda:0').
        """
        super(NeuralMeshRenderer, self).__init__()

        # Size of the image to be generated.
        self.image_size = image_size
        # Whether or not to enable anti-aliasing. If enabled, we render an
        # image that is twice as large as the required size, and then
        # downsample it.
        self.anti_aliasing = anti_aliasing
        # Background color of the rendered image.
        self.bg_color = bg_color
        # Whether or not to fill in color to the back faces of each triangle.
        # Usually helps, especially when some of the triangles in the mesh
        # have improper orientation specifications.
        self.fill_back = fill_back
        # Device on which tensors of the class reside. At present, rendering
        # only works when the device is CUDA enabled, such as a GPU.
        self.device = device
        # camera_mode specifies how the scene is to be set up.
        self.camera_mode = camera_mode

        # If the mode is 'projection', use the input camera intrinsics and
        # extrinsics.
        if self.camera_mode == 'projection':
            self.K = K
            self.rmat = rmat
            self.tvec = tvec
        # If the mode is 'look' or 'look_at', use the viewing angle to
        # determine perspective distortion and the camera position and
        # orientation.
        elif self.camera_mode in ['look', 'look_at']:
            # Whether or not to perform perspective distortion.
            self.perspective_distort = perspective_distort
            # Viewing angle (in degrees); it determines how far the "eye"
            # is placed from the origin along the -Z axis.
            self.viewing_angle = viewing_angle
            # TODO: use kal.deg2rad instead of the inline conversion.
            self.eye = torch.FloatTensor([0, 0, -(
                1. / torch.tan(kal.mathutils.pi * self.viewing_angle / 180)
                + 1)]).to(self.device)
            # Direction in which the camera's optical axis is facing.
            self.camera_direction = torch.FloatTensor([0, 0, 1]).to(
                self.device)

        # Near and far clipping planes.
        self.near = near
        self.far = far

        # Ambient and directional lighting parameters.
        self.light_intensity_ambient = light_intensity_ambient
        self.light_intensity_directional = light_intensity_directional
        self.light_color_ambient = light_color_ambient.to(device)
        self.light_color_directional = light_color_directional.to(device)
        self.light_direction = light_direction.to(device)

        # Numerical epsilon used by the rasterizer when testing pixel
        # coverage near triangle edges.
        self.rasterizer_eps = 1e-3
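    # Example usage (a minimal sketch, not part of the class API; assumes a
    # CUDA-capable device, since the underlying rasterizer is GPU-only, and
    # tensors `vertices`, `faces`, `textures` with the shapes documented in
    # `render` below):
    #
    #   >>> renderer = NeuralMeshRenderer(camera_mode='look_at',
    #   ...                               device='cuda:0')
    #   >>> rgb, depth, alpha = renderer(vertices, faces, textures=textures)
    #
    # In 'projection' mode, also pass the 4 x 4 intrinsics `K`, the 4 x 4
    # rotation `rmat`, and the translation `tvec` described above.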
    def forward(self, vertices, faces, textures=None, mode=None,
                K=None, rmat=None, tvec=None):
        r"""Renders the mesh. See :func:`render` for documentation of each
        argument.
        """
        return self.render(vertices, faces, textures, mode, K, rmat, tvec)

    def render(self, vertices, faces, textures=None, mode=None,
               K=None, rmat=None, tvec=None):
        r"""Renders the RGB, depth, and alpha channels.

        Args:
            vertices (torch.Tensor): Vertices of the mesh (shape: :math:`B
                \times V \times 3`), where :math:`B` is the batchsize, and
                :math:`V` is the number of vertices in the mesh.
            faces (torch.Tensor): Faces of the mesh (shape: :math:`B \times
                F \times 3`), where :math:`B` is the batchsize, and :math:`F`
                is the number of faces in the mesh.
            textures (torch.Tensor): Mesh texture (shape: :math:`B \times F
                \times 4 \times 4 \times 4 \times 3`).
            mode (str): Renderer mode (choices: 'rgb', 'silhouette',
                'depth', None) (default: None). If the mode is None, the
                rgb, depth, and alpha channels are all rendered. In the
                'rgb' mode, only the rgb image channels are rendered. In
                the 'silhouette' mode, only a silhouette image is rendered.
                In the 'depth' mode, only a depth image is rendered.
            K (torch.Tensor): Camera intrinsics (default: None) (shape:
                :math:`B \times 4 \times 4` or :math:`4 \times 4`).
            rmat (torch.Tensor): Rotation matrix (default: None) (shape:
                :math:`B \times 4 \times 4` or :math:`4 \times 4`).
            tvec (torch.Tensor): Translation vector (default: None) (shape:
                :math:`B \times 3` or :math:`3`).

        Returns:
            (torch.Tensor): rendered RGB image channels
            (torch.Tensor): rendered depth channel
            (torch.Tensor): rendered alpha channel

            Each of the channels is of shape
            `self.image_size` x `self.image_size`. If `mode` is not None,
            only the channel corresponding to that mode is returned.
        """
        # Fill the back faces of each triangle, if needed.
        if self.fill_back:
            faces = torch.cat((faces, faces[:, :, list(reversed(range(
                faces.shape[-1])))]), dim=1)
            if textures is not None:
                textures = torch.cat(
                    (textures, textures.permute(0, 1, 4, 3, 2, 5)), dim=1)

        # Lighting (not needed when we are rendering only depth/silhouette
        # images).
        if mode not in ['depth', 'silhouette']:
            textures = self.lighting(vertices, faces, textures)

        # Transform the vertices to the camera frame.
        vertices = self.transform_to_camera_frame(vertices)

        # Project the vertices from the camera coordinate frame to the image.
        vertices = self.project_to_image(vertices)

        # Rasterization.
        if mode is None:
            # Render the rgb, depth, and alpha channels.
            out = self.rasterize(vertices, faces, textures)
            return out['rgb'], out['depth'], out['alpha']
        return self.rasterize(vertices, faces, textures, mode)
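    # The renderer modes, sketched (hedged example; tensors as documented in
    # `render` above):
    #
    #   >>> rgb, depth, alpha = renderer.render(vertices, faces, textures)
    #   >>> sil = renderer.render(vertices, faces, textures,
    #   ...                       mode='silhouette')
    #   >>> d = renderer.render(vertices, faces, mode='depth')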
    def lighting(self, vertices, faces, textures):
        r"""Applies ambient and directional lighting to the mesh textures.

        The lit texture is (ambient + directional) * textures, where each
        lighting term is an intensity-weighted color.
        """
        faces_lighting = self.vertices_to_faces(vertices, faces)
        ambient_lighting = compute_ambient_light(
            faces_lighting, textures, self.light_intensity_ambient,
            self.light_color_ambient)
        directional_lighting = compute_directional_light(
            faces_lighting, textures, self.light_intensity_directional,
            self.light_color_directional)
        return ambient_lighting * textures + directional_lighting * textures

    def shading(self):
        r"""Does nothing.
        """
        pass

    def transform_to_camera_frame(self, vertices):
        r"""Transforms the mesh vertices to the camera frame, based on the
        camera mode to be used.

        Args:
            vertices (torch.Tensor): Mesh vertices (shape: :math:`B \times
                V \times 3`), where `B` is the batchsize, and `V` is the
                number of mesh vertices.

        Returns:
            vertices (torch.Tensor): Vertices transformed into the camera
                coordinate frame (shape: :math:`B \times V \times 3`).

        """
        if self.camera_mode == 'look_at':
            vertices = self.look_at(vertices, self.eye)
        elif self.camera_mode == 'look':
            vertices = self.look(vertices, self.eye, self.camera_direction)
        elif self.camera_mode == 'projection':
            # In 'projection' mode, the extrinsic transform is folded into
            # the projection step (see `project_to_image`), which uses
            # `self.K`, `self.rmat`, and `self.tvec`.
            pass
        return vertices
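    # `transform_to_camera_frame` and `project_to_image` (below) together
    # implement the usual pinhole pipeline: a rigid transform into camera
    # coordinates, followed by projection onto the image plane. A sketch,
    # in 'look_at' mode:
    #
    #   >>> verts_cam = renderer.transform_to_camera_frame(vertices)
    #   >>> verts_img = renderer.project_to_image(verts_cam)
    #   >>> verts_img.shape  # (B, V, 3): (u, v) plus a depth channel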
""" if self.camera_mode == 'look_at': vertices = self.look_at(vertices, self.eye) # # Perspective distortion # if self.perspective_distort: # vertices = perspective_distort(vertices, angle=self.viewing_angle) elif self.camera_mode == 'look': vertices = self.look(vertices, self.eye, self.camera_direction) # # Perspective distortion # if self.perspective_distort: # vertices = perspective_distort(vertices, angle=self.viewing_angle) elif self.camera_mode == 'projection': if K is None: K = self.K if rmat is None: rmat = self.rmat if tvec is None: tvec = self.tvec # vertices = perspective_projection(vertices, K, rmat, tvec) return vertices def project_to_image(self, vertices): r"""Projects the mesh vertices from the camera coordinate frame down to the image. Args: vertices (torch.Tensor): Mesh vertices (shape: :math:`B \times V \times 3`), where `B` is the batchsize, and `V` is the number of mesh vertices. Returns: vertices (torch.Tensor): Projected image coordinates (u, v) for each vertex, with an appended depth channel. (shape: :math:`B \times V \times 3`), where :math:`B` is the batchsize and :math:`V` is the number of vertices. """ # TODO: Replace all of these by perspective_projection. Use different # rmat, tvec combinations, based on the mode, but use a consistent # projection function across all modes. Helps avoid redundancy. if self.camera_mode == 'look_at': vertices = self.perspective_distortion(vertices, angle=self.viewing_angle) elif self.camera_mode == 'look': vertices = self.perspective_distortion(vertices, angle=self.viewing_angle) elif self.camera_mode == 'projection': vertices = perspective_projection(vertices, K, rmat, tvec) return vertices def rasterize(self, vertices, faces, textures, mode=None): r"""Performs rasterization, i.e., conversion of triangles to pixels. Args: vertices (torch.Tensor): Vertices of the mesh (shape: :math:`B \times V \times 3`), where :math:`B` is the batchsize, and :math:`V` is the number of vertices in the mesh. faces (torch.Tensor): Faces of the mesh (shape: :math:`B \times F \times 3`), where :math:`B` is the batchsize, and :math:`F` is the number of faces in the mesh. textures (torch.Tensor): Mesh texture (shape: :math:`B \times F \times 4 \times 4 \times 4 \times 3`) """ faces = self.vertices_to_faces(vertices, faces) # If mode is unspecified, render rgb, depth, and alpha channels if mode is None: out = kal.graphics.nmr.rasterizer.rasterize_rgbad(faces, textures, self.image_size, self.anti_aliasing, self.near, self.far, self.rasterizer_eps, self.bg_color) return out['rgb'], out['depth'], out['alpha'] # Render RGB channels only elif mode == 'rgb': images = kal.graphics.nmr.rasterize(faces, textures, self.image_size, self.anti_aliasing, self.near, self.far, self.rasterizer_eps, self.background_color) return images # Render depth image elif mode == 'depth': images = kal.graphics.nmr.rasterize_silhouettes(faces, self.image_size, self.anti_aliasing) # Render only a silhouette, without RGB colors elif mode == 'silhouette': depth = kal.graphics.nmr.rasterize_depth(faces, self.image_size, self.anti_aliasing) return depth else: raise ValueError('Mode {0} not implemented.'.format(mode)) def look_at(self, vertices, eye, at=torch.FloatTensor([0, 0, 0]), up=torch.FloatTensor([0, 1, 0])): r"""Camera "looks at" an object whose center is at the tensor represented by "at". And "up" is the upwards direction. 
""" import torch.nn.functional as F device = vertices.device eye = eye.to(device) at = at.to(device) up = up.to(device) batchsize = vertices.shape[0] if eye.dim() == 1: eye = eye[None, :].repeat(batchsize, 1) if at.dim() == 1: at = at[None, :].repeat(batchsize, 1) if up.dim() == 1: up = up[None, :].repeat(batchsize, 1) # Create new axes # eps is chosen as 1e-5 because that's what the authors use # in their (Chainer) implementation z_axis = F.normalize(at - eye, eps=1e-5) x_axis = F.normalize(torch.cross(up, z_axis), eps=1e-5) y_axis = F.normalize(torch.cross(z_axis, x_axis), eps=1e-5) # Create rotation matrices R = torch.cat((x_axis[:, None, :], y_axis[:, None, :], z_axis[:, None, :]), dim=1) # Apply # [B, V, 3] -> [B, V, 3] -> [B, V, 3] if vertices.shape != eye.shape: eye = eye[:, None, :] vertices = vertices - eye vertices = torch.matmul(vertices, R.transpose(1, 2)) return vertices def look(self, vertices, eye, direction=torch.FloatTensor([0, 1, 0]), up=None): r"""Apply the "look" transformation to the vertices. """ import torch.nn.functional as F device = vertices.device direction = direction.to(device) if up is None: up = torch.FloatTensor([0, 1, 0]).to(device) if eye.dim() == 1: eye = eye[None, :] if direction.dim() == 1: direction = direction[None, :] if up.dim() == 1: up = up[None, :] # Create new axes z_axis = F.normalize(direction, eps=1e-5) x_axis = F.normalize(torch.cross(up, z_axis), eps=1e-5) y_axis = F.normalize(torch.cross(z_axis, x_axis), eps=1e-5) # Create rotation matrix (B x 3 x 3) R = torch.cat((x_axis[:, None, :], y_axis[:, None, :], z_axis[:, None, :]), dim=1) # Apply if vertices.shape != eye.shape: eye = eye[:, None, :] vertices = vertices - eye vertices = torch.matmul(vertices, R.transpose(1, 2)) return vertices def perspective_distortion(self, vertices, angle=30.): r"""Compute perspective distortion from a given viewing angle. """ device = vertices.device angle = torch.FloatTensor([angle * 180 / kal.mathutils.pi]).to(device) width = torch.tan(angle) width = width[:, None] z = vertices[:, :, 2] x = vertices[:, :, 0] / (z * width) y = vertices[:, :, 1] / (z * width) vertices = torch.stack((x, y, z), dim=2) return vertices def vertices_to_faces(self, vertices, faces): r""" vertices (torch.Tensor): shape: math:`B \times V \times 3` faces (torch.Tensor): shape: math:`B \times F \times 3` """ B = vertices.shape[0] V = vertices.shape[1] # print(vertices.dim(), faces.dim()) # print(vertices.shape[0], faces.shape[0]) # print(vertices.shape[2], faces.shape[2]) device = vertices.device faces = faces + (torch.arange(B).to(device) * V)[:, None, None] vertices = vertices.reshape(B * V, 3) return vertices[faces]
if __name__ == '__main__':

    import os

    import tqdm

    import kaolin
    from kaolin.graphics.Transformations import get_eye_from_spherical_coords

    filename_input = 'examples/renderers/test/data/banana.obj'
    camera_distance = torch.Tensor([2])
    elevation = torch.Tensor([30])

    mesh = kaolin.rep.TriangleMesh.from_obj(filename_input)
    vertices = mesh.vertices
    faces = mesh.faces.long()
    face_textures = faces.clone()

    vertices = vertices[None, :, :].cuda()
    faces = faces[None, :, :].cuda()
    face_textures = face_textures[None, :, :].cuda()

    # Center the mesh at the origin and scale it up.
    vertices_max = vertices.max()
    vertices_min = vertices.min()
    vertices_middle = (vertices_max + vertices_min) / 2.
    vertices = vertices - vertices_middle
    coef = 5
    vertices = vertices * coef

    textures = torch.ones(1, faces.shape[1], 2, 2, 2, 3,
                          dtype=torch.float32).cuda()

    renderer = NeuralMeshRenderer(camera_mode='look_at')
    renderer.eye = get_eye_from_spherical_coords(
        camera_distance, elevation, torch.Tensor([0.]))

    images, _, _ = renderer(vertices, faces, textures=textures)

    # loop = tqdm.tqdm(list(range(0, 360, 4)))
    # loop.set_description('Drawing NMR')
    # # writer = imageio.get_writer(
    # #     os.path.join(output_directory_nmr, 'rotation.gif'), mode='I')
    # for num, azimuth in enumerate(loop):
    #     renderer.eye = get_eye_from_spherical_coords(
    #         camera_distance, elevation, torch.Tensor([azimuth]))
    #     images, _, _ = renderer.render(vertices, faces, textures=textures)
    #     image = images.detach().cpu().numpy()[0].transpose((1, 2, 0))
    #     # writer.append_data((255 * image).astype(np.uint8))
    # # writer.close()