diff --git a/SMIL_torch_batch.py b/SMIL_torch_batch.py new file mode 100644 index 0000000..fdeb69c --- /dev/null +++ b/SMIL_torch_batch.py @@ -0,0 +1,249 @@ +# https://github.com/CalciferZh/SMPL +import torch +import torch.nn as nn +import pickle +import numpy as np +import scipy.sparse + + +class SMIL(nn.Module): + def with_zeros(self, x): + """ + Append a [0, 0, 0, 1] vector to a batch of [3, 4] matrices. + + Parameter: + --------- + x: Tensor to be appended of shape [N, 3, 4] + + Return: + ------ + Tensor after appending of shape [N, 4, 4] + + """ + ret = torch.cat([x, self.e4.expand(x.shape[0], 1, -1)], dim=1) + return ret + + def pack(self, x): + """ + Append zero tensors of shape [4, 3] to a batch of [4, 1] shape tensors. + + Parameter: + ---------- + x: A tensor of shape [batch_size, 4, 1] + + Return: + ------ + A tensor of shape [batch_size, 4, 4] after appending. + + """ + ret = torch.cat( + (torch.zeros((x.shape[0], x.shape[1], 4, 3), dtype=x.dtype, device=x.device), x), + dim=3 + ) + return ret + + def rodrigues(self, r): + """ + Rodrigues' rotation formula that turns axis-angle tensor into rotation + matrix in a batch-ed manner. + + Parameter: + ---------- + r: Axis-angle rotation tensor of shape [N, 1, 3]. + + Return: + ------- + Rotation matrix of shape [N, 3, 3]. + """ + theta = torch.norm(r, dim=(1, 2), keepdim=True) + # avoid division by zero + torch.max(theta, theta.new_full((1,), torch.finfo(theta.dtype).tiny), out=theta) + #The .tiny has to be uploaded to GPU, but self.regress_joints is such a big bottleneck it is not felt. + + r_hat = r / theta + z_stick = torch.zeros_like(r_hat[:, 0, 0]) + m = torch.stack( + (z_stick, -r_hat[:, 0, 2], r_hat[:, 0, 1], + r_hat[:, 0, 2], z_stick, -r_hat[:, 0, 0], + -r_hat[:, 0, 1], r_hat[:, 0, 0], z_stick), dim=1) + m = m.reshape(-1, 3, 3) + + dot = torch.bmm(r_hat.transpose(1, 2), r_hat) # Batched outer product. + # torch.matmul or torch.stack([torch.ger(r, r) for r in r_hat.squeeze(1)] works too. + cos = theta.cos() + R = cos * self.eye + (1 - cos) * dot + theta.sin() * m + return R + + def __init__(self, model_path='./model.pkl', sparse=True): + super().__init__() + + self.parent = None + self.model_path = None + if model_path is not None: + with open(model_path, 'rb') as f: + self.model_path = model_path + params = pickle.load(f) + # The first three can be added simply: + registerbuffer = lambda name: self.register_buffer(name, + torch.as_tensor(params[name])) + registerbuffer('weights') + registerbuffer('posedirs') + registerbuffer('v_template') + registerbuffer('shapedirs') + + # Now for the more difficult...: + # We have to convert f from uint32 to int32. (This is the indexbuffer) + self.register_buffer('f', torch.as_tensor(params['f'].astype(np.int32))) + self.register_buffer('kintree_table', torch.as_tensor(params['kintree_table'].astype(np.int32))) + + # J_regressor is a sparse tensor. This is (experimentally) supported in PyTorch. + J_regressor = params['J_regressor'] + if scipy.sparse.issparse(J_regressor): + # If tensor is sparse (Which it is with SMPL/SMIL) + J_regressor = J_regressor.tocoo() + J_regressor = torch.sparse_coo_tensor([J_regressor.row, J_regressor.col], + J_regressor.data, + J_regressor.shape) + if not sparse: + J_regressor = J_regressor.to_dense() + else: + J_regressor = torch.as_tensor(J_regressor) + self.register_buffer('J_regressor', J_regressor) + + self.register_buffer('e4', self.posedirs.new_tensor([0, 0, 0, 1])) # Cache this. (Saves a lot of time) + self.register_buffer('eye', torch.eye(3, dtype=self.e4.dtype, device=self.e4.device)) # And this. + self.set_parent() + + # Make sure the tree map is reconstructed if/when model is loaded. + self._register_state_dict_hook(self.set_parent) + + def set_parent(self, *args, **kwargs): + # Get kintree_table from state dict. + # Make kinematic tree relations. + id_to_col = {self.kintree_table[1, i].item(): i for i in range(self.kintree_table.shape[1])} + self.parent = { + i: id_to_col[self.kintree_table[0, i].item()] + for i in range(1, self.kintree_table.shape[1]) + } + # Must return None, since only a state dict or None return value is permitted for state dict hooks. + + + def save_obj(self, verts, obj_mesh_name): + with open(obj_mesh_name, 'w') as fp: + for v in verts: + fp.write('v %f %f %f\n' % (v[0], v[1], v[2])) + + for f in self.f: # Faces are 1-based, not 0-based in obj files + fp.write('f %d %d %d\n' % (f[0] + 1, f[1] + 1, f[2] + 1)) + + def regress_joints(self, vertices): + """The J_regressor matrix transforms vertices to joints.""" + # Given the template + pose blend shapes. + batch_size = vertices.shape[0] + + # We could get the result as torch.matmul(self.J_regressor, vertices) or + # torch.stack([self.J_regressor.mm(verts) for verts in vertices]) in case J_regressor is sparse. + # But turns out there is a solution faster than both of the above: + batch_vertices = vertices.transpose(0, 1).reshape(self.J_regressor.shape[1], -1) + batch_results = self.J_regressor.mm(batch_vertices) + batch_results = batch_results.reshape(self.J_regressor.shape[0], batch_size, -1).transpose(0, 1) + return batch_results + + def rotate_translate(self, rotation_matrix, translation): + transform = torch.cat((rotation_matrix, translation.unsqueeze(2)), 2) + return self.with_zeros(transform) + + def forward(self, beta, pose, trans=None, simplify=False): + """This module takes betas and poses in a batched manner. + A pose is 3 * K + 3 (= self.kintree_table.shape[1] * 3) parameters, where K is the number of joints. + A beta is a vector of size self.shapedirs.shape[2], that parameterizes the body shape. + Since this is batched, multiple betas and poses should be concatenated along zeroth dimension. + See http://files.is.tue.mpg.de/black/papers/SMPL2015.pdf for more info. + """ + batch_size = beta.shape[0] # Size of zeroth dimension. + + # The body shape is decomposed with principal component analysis from many subjects, + # where self.v_template is the average value. Then shapedirs is a subset of the orthogonal directions, and + # a the betas are the values when the subject is projected onto these. v_shaped is the "restored" subject. + v_shaped = torch.tensordot(beta, self.shapedirs, dims=([1], [2])) + self.v_template + + # We turn the rotation vectors into rotation matrices. + R_cube = self.rodrigues(pose.reshape(-1, 1, 3)).reshape(batch_size, -1, 3, 3) + J = self.regress_joints(v_shaped) # Joints in T-pose (for limb lengths) + + if not simplify: + # Add pose blend shapes. (How joint angles morphs the surface) + # Now calculate how joints affects the body shape. + lrotmin = R_cube[:, 1:] - self.eye + lrotmin = lrotmin.reshape(batch_size, -1) + v_shaped += torch.tensordot(lrotmin, self.posedirs, dims=([1], [2])) + + # Now we have the un-posed body shape. Convert to homogeneous coordinates. + rest_shape_h = torch.cat((v_shaped, v_shaped.new_ones(1).expand(*v_shaped.shape[:-1], 1)), 2) + + G = [self.rotate_translate(R_cube[:, 0], J[:, 0])] + for i in range(1, self.kintree_table.shape[1]): + G.append( + torch.bmm( + G[self.parent[i]], + self.rotate_translate(R_cube[:, i], J[:, i] - J[:, self.parent[i]]))) + G = torch.stack(G, 1) + G = G - self.pack(torch.matmul(G, torch.cat([J, J.new_zeros(1).expand(*J.shape[:2], 1)], dim=2).unsqueeze(-1))) + + # T = torch.tensordot(self.weights, G, dims=([1], [1])) + # v = T.reshape(-1, 4, 4).bmm(rest_shape_h.reshape(-1, 4, 1)).reshape(batch_size, -1, 4) + + # Two next lines are a memory bottleneck. + T = torch.tensordot(G, self.weights, dims=([1], [1])).permute(0, 3, 1, 2) + + v = torch.matmul(T, torch.reshape(rest_shape_h, (batch_size, -1, 4, 1))).reshape(batch_size, -1, 4) + Jtr = self.regress_joints(v) + + if trans is not None: + trans = trans.unsqueeze(1) + v[..., :3] += trans + Jtr[..., :3] += trans + + return v, Jtr + +def time_numpy(body_model, poses): + return [body_model.set_params(pose.pose, pose.beta) for pose in poses] + +def time_pytorch(body_model, betas, poses): + for i in range(100): + v = body_model(vbeta, vpose) + +if __name__ == '__main__': + from smil_np import SMILModel + import timeit + + # The best configurations are: + # device = cuda, dtype = half, sparse = false + # device = cuda, dtype = float, sparse = true + + device = torch.device('cuda') # torch.device('cuda') + dtype = torch.float + sparse = True if dtype is not torch.half else False # sparse Half Tensors are not supported (yet). + + SMILNP = SMILModel('./model.pkl') + SMILPY = SMIL('./model.pkl', sparse=sparse).to(device) + SMILPY = SMILPY.to(device, dtype, non_blocking=True) + + from file_utils import * + poses = find_mini_rgbd(os.path.join('MINI-RGBD_web')) + poses = [MicroRGBD(pose) for pose in poses[:4000]] # If there is a memory error reduce size here. + + vbeta = torch.tensor(np.array([pose.beta for pose in poses])).to(device, dtype, non_blocking=True) + vpose = torch.tensor(np.array([pose.pose for pose in poses])).to(device, dtype, non_blocking=True) + vtrans = torch.tensor(np.array([pose.trans for pose in poses])).to(device, dtype, non_blocking=True) + v, Jtr = SMILPY(vbeta, vpose, vtrans) # Do the thing. + time_pytorch(SMILPY, vbeta, vpose) + + # SMILNP.set_params(poses[i].pose, poses[i].beta, poses[i].trans) + # print(Jtr.cpu()[i].numpy() - SMILNP.Jtr, Jtr[i].shape, SMILNP.Jtr.shape) # See if there are any rounding errors. + + #with torch.cuda.profiler.profile() as prof: + # SMILPY(vbeta, vpose) # Warmup CUDA memory allocator and profiler + # with torch.autograd.profiler.emit_nvtx(): + # SMILPY(vbeta, vpose) +