I am working with the ShapeNetCore dataset, which consists of 3D models with texture and color information. Link to the ShapeNetCore repository: https://shapenet.org/
I am trying to render RGB and depth images of these .obj files in Blender from multiple predefined camera viewpoints. The idea is to capture view-dependent partial RGB-D observations of the object to use for 3D perception tasks later.
I am following the steps from this repository: https://github.com/yinyunie/depth_renderer.
The camera viewpoints are placed at the vertices of a regular dodecahedron, as follows:
import math

phi = (1 + math.sqrt(5)) / 2.  # golden ratio
circumradius = math.sqrt(3)
distance = circumradius * 1.2

# the 20 vertices of a regular dodecahedron
dodecahedron = [[-1, -1, -1],
                [ 1, -1, -1],
                [ 1,  1, -1],
                [-1,  1, -1],
                [-1, -1,  1],
                [ 1, -1,  1],
                [ 1,  1,  1],
                [-1,  1,  1],
                [0, -phi, -1 / phi],
                [0, -phi,  1 / phi],
                [0,  phi, -1 / phi],
                [0,  phi,  1 / phi],
                [-1 / phi, 0, -phi],
                [-1 / phi, 0,  phi],
                [ 1 / phi, 0, -phi],
                [ 1 / phi, 0,  phi],
                [-phi, -1 / phi, 0],
                [-phi,  1 / phi, 0],
                [ phi, -1 / phi, 0],
                [ phi,  1 / phi, 0]]

# get azimuth and elevation angles for each vertex
# azimuth varies from -pi to pi, elevation from -pi/2 to pi/2
view_points = open('./view_points.txt', 'w+')
for vertice in dodecahedron:
    elevation = math.asin(vertice[2] / circumradius)
    azimuth = math.atan2(vertice[1], vertice[0])
    view_points.write('%f %f %f %f\n' % (azimuth, elevation, 0., distance))
view_points.close()
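For context, the camera_location and camera_rot_XYZEuler helpers used in the rendering code below come from the repo and turn these (azimuth, elevation, tilt, distance) tuples into a camera pose. A minimal sketch of what I expect the spherical-to-Cartesian part to look like (my own reconstruction, not the repo's exact code):

import math

def camera_location(azimuth, elevation, distance):
    # spherical to Cartesian, matching the convention used when writing
    # view_points.txt: azimuth in the x-y plane, elevation towards +z
    # (sketch of what camera_location is expected to do)
    x = distance * math.cos(elevation) * math.cos(azimuth)
    y = distance * math.cos(elevation) * math.sin(azimuth)
    z = distance * math.sin(elevation)
    return x, y, z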
After this, I render the RGB as a .png file and the depth as an .exr file. This is the per-viewpoint rendering code:
vp = viewpoint
cam_location = camera_location(vp.azimuth, vp.elevation, vp.distance)
cam_rot = camera_rot_XYZEuler(vp.azimuth, vp.elevation, vp.tilt)

cam_obj = bpy.data.objects['Camera']
cam_obj.location[0] = cam_location[0]
cam_obj.location[1] = cam_location[1]
cam_obj.location[2] = cam_location[2]
cam_obj.rotation_euler[0] = cam_rot[0]
cam_obj.rotation_euler[1] = cam_rot[1]
cam_obj.rotation_euler[2] = cam_rot[2]

if g_background_image_path == 'TRANSPARENT':
    bpy.context.scene.render.alpha_mode = g_background_image_path
else:
    background_images = os.listdir(g_background_image_path)
    image_name = random.choice(background_images)
    image_path = os.path.join(g_background_image_path, image_name)
    image_node = bpy.context.scene.node_tree.nodes[0]
    image_node.image = bpy.data.images.load(image_path)

img_file_output_node = bpy.context.scene.node_tree.nodes[4]
img_file_output_node.file_slots[0].path = 'color_###.png'  # blender placeholder #
depth_file_output_node = bpy.context.scene.node_tree.nodes[5]
depth_file_output_node.file_slots[0].path = 'depth_###.exr'  # blender placeholder #

# start rendering
bpy.context.scene.frame_set(viewpoint_id + 1)
bpy.ops.render.render(write_still=True)

# write camera info
cam_K_file = os.path.join(cam_K_path, 'cam_K.txt')
if (not os.path.isfile(cam_K_file)) or (len(os.listdir(cam_RT_path)) < total_view_nums):
    K, RT = get_3x4_P_matrix_from_blender(cam_obj)
    np.savetxt(cam_K_file, K)
    np.savetxt(os.path.join(cam_RT_path, 'cam_RT_{0:03d}.txt'.format(viewpoint_id + 1)), RT)
    print('Camera parameters written.')
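For context, this snippet runs once per viewpoint, inside a loop over the entries parsed from view_points.txt. A rough sketch of that outer loop (the namedtuple is just my shorthand; the repo organizes this slightly differently):

from collections import namedtuple

ViewPoint = namedtuple('ViewPoint', ['azimuth', 'elevation', 'tilt', 'distance'])

# parse the viewpoints written earlier: one "azimuth elevation tilt distance" line per view
viewpoints = []
with open('./view_points.txt') as f:
    for line in f:
        azimuth, elevation, tilt, distance = map(float, line.split())
        viewpoints.append(ViewPoint(azimuth, elevation, tilt, distance))

for viewpoint_id, viewpoint in enumerate(viewpoints):
    # ... the per-viewpoint rendering snippet above runs here ...
    pass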
In this code, the object is rendered from the 20 views that I specified.
The code below shows how the camera projection matrix P is constructed and stored from Blender's camera data. Basically, it is composed of the 3x4 extrinsic matrix [R|t] and the 3x3 intrinsic matrix K, with P = K [R|t].
'''Build intrinsic camera parameters from Blender camera data
See notes on this in
blender.stackexchange.com/questions/15102/what-is-blenders-camera-projection-matrix-model
as well as
https://blender.stackexchange.com/a/120063/3581
'''
def get_calibration_matrix_K_from_blender(camd):
    if camd.type != 'PERSP':
        raise ValueError('Non-perspective cameras not supported')
    scene = bpy.context.scene
    f_in_mm = camd.lens
    scale = scene.render.resolution_percentage / 100
    resolution_x_in_px = scale * scene.render.resolution_x
    resolution_y_in_px = scale * scene.render.resolution_y
    sensor_size_in_mm = get_sensor_size(camd.sensor_fit, camd.sensor_width, camd.sensor_height)
    sensor_fit = get_sensor_fit(
        camd.sensor_fit,
        scene.render.pixel_aspect_x * resolution_x_in_px,
        scene.render.pixel_aspect_y * resolution_y_in_px
    )
    pixel_aspect_ratio = scene.render.pixel_aspect_y / scene.render.pixel_aspect_x
    if sensor_fit == 'HORIZONTAL':
        view_fac_in_px = resolution_x_in_px
    else:
        view_fac_in_px = pixel_aspect_ratio * resolution_y_in_px
    pixel_size_mm_per_px = sensor_size_in_mm / f_in_mm / view_fac_in_px
    s_u = 1 / pixel_size_mm_per_px
    s_v = 1 / pixel_size_mm_per_px / pixel_aspect_ratio

    # Parameters of intrinsic calibration matrix K
    u_0 = resolution_x_in_px / 2 - camd.shift_x * view_fac_in_px
    v_0 = resolution_y_in_px / 2 + camd.shift_y * view_fac_in_px / pixel_aspect_ratio
    skew = 0  # only use rectangular pixels

    K = Matrix(
        ((s_u, skew, u_0),
         (  0,  s_v, v_0),
         (  0,    0,   1)))
    return K
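As a quick sanity check on the formula above, here is the arithmetic for a purely hypothetical configuration (these numbers are illustrative, not the repo's actual render settings): a 35 mm lens on a 32 mm-wide sensor, horizontal sensor fit, square pixels, and a 640x480 render.

# hypothetical example, not the repo's actual render settings
f_in_mm = 35.0
sensor_size_in_mm = 32.0                      # horizontal sensor fit, square pixels
resolution_x_in_px, resolution_y_in_px = 640, 480
view_fac_in_px = resolution_x_in_px

s_u = s_v = f_in_mm * view_fac_in_px / sensor_size_in_mm   # = 700.0 px
u_0 = resolution_x_in_px / 2                               # = 320.0 px (no lens shift)
v_0 = resolution_y_in_px / 2                               # = 240.0 px

# K = [[700,   0, 320],
#      [  0, 700, 240],
#      [  0,   0,   1]]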
'''
Returns camera rotation and translation matrices from Blender.
There are 3 coordinate systems involved:
    1. The World coordinates: "world"
       - right-handed
    2. The Blender camera coordinates: "bcam"
       - x is horizontal
       - y is up
       - right-handed: negative z look-at direction
    3. The desired computer vision camera coordinates: "cv"
       - x is horizontal
       - y is down (to align to the actual pixel coordinates
         used in digital images)
       - right-handed: positive z look-at direction
'''
def get_3x4_RT_matrix_from_blender(cam):
    # bcam stands for blender camera
    R_blender2shapenet = Matrix(
        ((1, 0,  0),
         (0, 0, -1),
         (0, 1,  0)))

    R_bcam2cv = Matrix(
        ((1, 0,  0),
         (0, 1,  0),
         (0, 0, -1)))

    # Transpose since the rotation is object rotation,
    # and we want coordinate rotation
    # R_world2bcam = cam.rotation_euler.to_matrix().transposed()
    # T_world2bcam = -1*R_world2bcam * location
    #
    # Use matrix_world instead to account for all constraints
    location, rotation = cam.matrix_world.decompose()[0:2]
    R_world2bcam = rotation.to_matrix().transposed()

    # Convert camera location to translation vector used in coordinate changes
    # T_world2bcam = -1*R_world2bcam*cam.location
    # Use location from matrix_world to account for constraints:
    T_world2bcam = -1*R_world2bcam * location

    # Build the coordinate transform matrix from world to computer vision camera
    R_world2cv = R_bcam2cv*R_world2bcam*R_blender2shapenet
    T_world2cv = R_bcam2cv*T_world2bcam

    # put into 3x4 matrix
    RT = Matrix((
        R_world2cv[0][:] + (T_world2cv[0],),
        R_world2cv[1][:] + (T_world2cv[1],),
        R_world2cv[2][:] + (T_world2cv[2],)
        ))
    return RT
def get_3x4_P_matrix_from_blender(cam):
    K = get_calibration_matrix_K_from_blender(cam.data)
    RT = get_3x4_RT_matrix_from_blender(cam)
    return K, RT
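To make the relationship between K, RT and the projection matrix P concrete, here is a small numpy sketch of my own (not part of the repo) that assembles P = K [R|t] and projects world points into pixel coordinates:

import numpy as np

def project_points(K, RT, points_world):
    # K  : 3x3 intrinsic matrix (e.g. loaded from cam_K.txt)
    # RT : 3x4 extrinsic matrix (e.g. loaded from one of the cam_RT_*.txt files)
    P = np.asarray(K) @ np.asarray(RT)                                   # 3x4 projection matrix
    pts_h = np.hstack([points_world, np.ones((len(points_world), 1))])   # Nx4 homogeneous points
    proj = (P @ pts_h.T).T                                               # Nx3
    uv = proj[:, :2] / proj[:, 2:3]                                      # perspective divide -> pixel coords
    return uv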
Full code is in this file - https://github.com/yinyunie/depth_renderer/blob/main/render_all.py
I am now trying to load and register the partial point clouds from the multiple viewpoints in the world coordinate frame; in the ideal case, all of them should line up.
I create each point cloud by back-projecting the depth image with the camera intrinsics (cam_K, a 3x3 matrix) and coloring it with the RGB image. These point clouds live in their respective camera frames, so I use the view-dependent cam_RT matrix that Blender generated to apply the inverse transformation and bring them from the camera frame into the world coordinate frame. My code is below:
import numpy as np
import open3d as o3d

def get_point_cloud(depth_map, cam_K, cam_RT, rgb_img):
    '''
    get point cloud from a depth map
    :param depth_map: depth map
    :param cam_K: corresponding camera intrinsics
    :param cam_RT: corresponding camera rotation and translation
    :param rgb_img: corresponding rgb image
    :return: point cloud in the camera frame and in the canonical (world) frame, with color intensities.
    '''
    u, v = np.meshgrid(range(depth_map.shape[1]), range(depth_map.shape[0]))
    u = u.reshape([1, -1])[0]
    v = v.reshape([1, -1])[0]
    z = depth_map[v, u]

    # remove infinite (background) pixels
    non_inf_indices = np.argwhere(z < np.inf).T[0]
    color_indices = rgb_img[v, u][non_inf_indices]
    z = z[non_inf_indices]
    u = u[non_inf_indices]
    v = v[non_inf_indices]

    # back-project pixels into the camera frame
    x = (u - cam_K[0][2]) * z / cam_K[0][0]
    y = (v - cam_K[1][2]) * z / cam_K[1][1]
    point_cam = np.vstack([x, y, z]).T

    # inverse extrinsic transform: camera frame -> canonical (world) frame
    point_canonical = (point_cam - cam_RT[:, -1]).dot(cam_RT[:, :-1])
    cam_pos = - cam_RT[:, -1].dot(cam_RT[:, :-1])
    focal_point = ([0, 0, 1] - cam_RT[:, -1]).dot(cam_RT[:, :-1])
    up = np.array([0, -1, 0]).dot(cam_RT[:, :-1])
    cam_pos = {'pos': cam_pos, 'fp': focal_point, 'up': up}

    # point cloud in the camera frame
    pcd_cam = o3d.geometry.PointCloud()
    pcd_cam.points = o3d.utility.Vector3dVector(point_cam)
    pcd_cam.colors = o3d.utility.Vector3dVector(color_indices / 255.0)

    # point cloud in the canonical frame
    pcd_can = o3d.geometry.PointCloud()
    pcd_can.points = o3d.utility.Vector3dVector(point_canonical)
    pcd_can.colors = o3d.utility.Vector3dVector(color_indices / 255.0)

    return pcd_cam, pcd_can
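This is roughly how I overlay two views in the canonical frame; depth_map_1, rgb_img_1 and friends are placeholders for the per-view arrays loaded from the rendered .exr/.png files:

# minimal overlay sketch; depth_map_*/rgb_img_* are placeholders for loaded arrays
cam_K = np.loadtxt('cam_K/cam_K.txt')
cam_RT_1 = np.loadtxt('cam_RT/cam_RT_001.txt')
cam_RT_2 = np.loadtxt('cam_RT/cam_RT_002.txt')

_, pcd_can_1 = get_point_cloud(depth_map_1, cam_K, cam_RT_1, rgb_img_1)
_, pcd_can_2 = get_point_cloud(depth_map_2, cam_K, cam_RT_2, rgb_img_2)

# in the ideal case the two canonical-frame clouds should overlap
o3d.visualization.draw_geometries([pcd_can_1, pcd_can_2])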
However, I notice a slight misalignment between the registered point clouds from different viewpoints.
I also sample the 3D mesh into a point cloud (which I believe should be in the world frame as well) using
mesh = o3d.io.read_triangle_mesh(obj_path).sample_points_uniformly(10000).farthest_point_down_sample(5000)
and when I register the two view-dependent partial point clouds with this object point cloud (in the world frame), all three of them are slightly misaligned.
Previously, when I worked with real RGB-D scans recorded with an Intel RealSense camera, I never faced these issues and the point clouds registered together. The backend I am using to visualize the point clouds is, I believe, OpenGL based (a Jupyter notebook running in the browser).
I am still not clear what the issue is: 1) is there an additional transform that I need to apply? 2) do I need mean centering to fix the translation? 3) is there a problem with how the depth is recorded?
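To quantify the misalignment instead of just eyeballing it in the viewer, I compute nearest-neighbour distances with Open3D between one partial cloud and the cloud sampled from the mesh (a small diagnostic sketch, assuming pcd_can and the sampled cloud mesh from above are already loaded):

# diagnostic sketch: how far is each back-projected point from the sampled mesh?
import numpy as np

dists = np.asarray(pcd_can.compute_point_cloud_distance(mesh))
print('mean / max distance to mesh samples: %.4f / %.4f' % (dists.mean(), dists.max()))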
I have also attached the .obj model and the RGB and depth renders along with the camera matrices here.
I finally got the back-projected point clouds to register together in the canonical frame. I was reading the .exr files with custom code, and that was the reason for the precision loss in the depth values. Here is the code with which all the point clouds register perfectly in the canonical frame.
import os
import sys
import random

import imageio
import numpy as np
import open3d as o3d

sys.path.append('../')
from data_config import *
from tools.read_and_write import *


def read_exr(exr_file_list):
    '''
    read exr files and output a matrix.
    :param exr_file_list: a single .exr path or a list of paths
    :return: array of per-view maps read from the .exr files
    '''
    print(f'The exr file list is : {exr_file_list}')
    if isinstance(exr_file_list, str):
        exr_file_list = [exr_file_list]
    im_list = []
    for exr_file in exr_file_list:
        if not exr_file.endswith('exr'):
            raise TypeError('file is not with the format of .exr.')
        im_list.append(imageio.imread(exr_file, format='exr')[:, :, 0])
    return np.array(im_list)
# get the point clouds given the rgb images and the depth maps
def get_point_cloud(depth_maps, cam_Ks, cam_RTs, rgb_imgs=None):
    '''
    get point cloud from depth maps
    :param depth_maps: depth map list
    :param cam_Ks: corresponding camera intrinsics
    :param cam_RTs: corresponding camera rotations and translations
    :param rgb_imgs: corresponding rgb images
    :return: aligned point clouds in the canonical system with color intensities.
    '''
    point_list_canonical = []
    camera_positions = []
    color_intensities = []

    if not isinstance(rgb_imgs, np.ndarray):
        rgb_imgs = 32 * np.ones([depth_maps.shape[0], depth_maps.shape[1], depth_maps.shape[2], 3], dtype=np.uint8)

    for depth_map, rgb_img, cam_K, cam_RT in zip(depth_maps, rgb_imgs, cam_Ks, cam_RTs):
        u, v = np.meshgrid(range(depth_map.shape[1]), range(depth_map.shape[0]))
        u = u.reshape([1, -1])[0]
        v = v.reshape([1, -1])[0]
        z = depth_map[v, u]

        # remove infinite (background) pixels
        non_inf_indices = np.argwhere(z < np.inf).T[0]
        color_indices = rgb_img[v, u][non_inf_indices]
        z = z[non_inf_indices]
        u = u[non_inf_indices]
        v = v[non_inf_indices]

        # back-project pixels into the camera frame
        x = (u - cam_K[0][2]) * z / cam_K[0][0]
        y = (v - cam_K[1][2]) * z / cam_K[1][1]
        point_cam = np.vstack([x, y, z]).T

        # camera frame -> canonical (world) frame
        point_canonical = (point_cam - cam_RT[:, -1]).dot(cam_RT[:, :-1])
        cam_pos = - cam_RT[:, -1].dot(cam_RT[:, :-1])
        focal_point = ([0, 0, 1] - cam_RT[:, -1]).dot(cam_RT[:, :-1])
        up = np.array([0, -1, 0]).dot(cam_RT[:, :-1])
        cam_pos = {'pos': cam_pos, 'fp': focal_point, 'up': up}

        # point_clouds.append(np.expand_dims(point_canonical, axis=0))
        point_list_canonical.append(point_canonical)
        camera_positions.append(cam_pos)
        color_intensities.append(color_indices)

    return point_list_canonical, camera_positions, color_intensities
depth_sample_dir = '02818832/e91c2df09de0d4b1ed4d676215f46734'
n_views = 20
assert n_views <= total_view_nums  # there are 20 views in total surrounding an object
view_ids = range(1, n_views + 1)

shapenet_rendering_path = '../datasets/shapenet_sample_renderings'
metadata_dir = os.path.join(shapenet_rendering_path, depth_sample_dir)
print(f'metadata_dir : {metadata_dir}')

dist_map_dir = [os.path.join(metadata_dir, 'depth_{0:03d}.exr'.format(view_id)) for view_id in view_ids]
print(f'dist map dir : {dist_map_dir}')

camera_path = '../datasets/camera_settings'
cam_RT_dir = [os.path.join(camera_path, 'cam_RT', 'cam_RT_{0:03d}.txt'.format(view_id)) for view_id in view_ids]

dist_maps = read_exr(dist_map_dir)
cam_K = np.loadtxt(os.path.join(camera_path, 'cam_K/cam_K.txt'))
cam_RTs = read_txt(cam_RT_dir)

# convert the rendered distance maps to z-depth maps
depth_maps = np.float32(dist_to_dep(dist_maps, [cam_K]*len(view_ids)))

rgb_images = [os.path.join(metadata_dir, 'color_{0:03d}.png'.format(view_id)) for view_id in view_ids]
point_list_canonical, camera_positions, color_intensities = get_point_cloud(depth_maps, [cam_K]*len(view_ids), cam_RTs, rgb_imgs=read_image(rgb_images))

# visualize the merged point cloud from all views
pcd_index = random.randint(0, len(point_list_canonical) - 1)

pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(np.vstack(point_list_canonical))
pcd.colors = o3d.utility.Vector3dVector(np.vstack(color_intensities)/255.0)

# pcd_combined = o3d.geometry.PointCloud()
# pcd_combined.points = o3d.utility.Vector3dVector(np.vstack(point_list_canonical))
o3d.visualization.draw_geometries([pcd])
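One detail worth highlighting is the dist_to_dep step above: as far as I understand, the rendered .exr stores the Euclidean distance from the camera center along each pixel's ray rather than the planar z-depth that the pinhole back-projection expects, and the repo's dist_to_dep helper converts between the two. A sketch of that conversion under this assumption (not the repo's exact code):

import numpy as np

def dist_to_dep_sketch(dist_map, cam_K):
    # convert per-pixel ray distance to planar z-depth
    # (my understanding of what dist_to_dep does; treat as a sketch)
    h, w = dist_map.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    # un-normalized direction of each pixel's ray in the camera frame
    x = (u - cam_K[0][2]) / cam_K[0][0]
    y = (v - cam_K[1][2]) / cam_K[1][1]
    ray_norm = np.sqrt(x**2 + y**2 + 1.0)
    # dividing by the ray length turns distance-along-ray into z-depth
    return dist_map / ray_norm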