Anonymous
Map bilinear-sampled features back to voxels
Post by Anonymous » 25 Sep 2024, 13:14
I am currently working on a project in which I am developing a class specifically for bilinear sampling. My goal is to map the features extracted during the bilinear sampling step to their corresponding locations in a voxel grid that I create.
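For reference, here is a tiny, self-contained example of the bilinear sampling step the class is built around; all tensors and shapes in it are dummies, purely for illustration.

Code:
import torch
import torch.nn.functional as F

# Dummy camera feature map: [B, C, H_in, W_in]
feat = torch.arange(2 * 4 * 5, dtype=torch.float).view(1, 2, 4, 5)

# Sampling locations in normalized coordinates, shape [B, H_out, W_out, 2]:
# grid[..., 0] is x (width) and grid[..., 1] is y (height), both in [-1, 1].
grid = torch.tensor([[[[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]]]])  # [1, 1, 3, 2]

sampled = F.grid_sample(feat, grid, mode='bilinear', align_corners=False)
print(sampled.shape)  # torch.Size([1, 2, 1, 3]) -> [B, C, H_out, W_out]

The full class I am working on is below.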
Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt


class BilinearSamplingTransform(nn.Module):
    def __init__(self, model_cfg):
        super().__init__()
        self.model_cfg = model_cfg
        self.norm = self.model_cfg.NORMALIZER
        in_channel = self.model_cfg.IN_CHANNEL
        out_channel = self.model_cfg.OUT_CHANNEL
        self.image_size = self.model_cfg.IMAGE_SIZE
        self.feature_size = self.model_cfg.FEATURE_SIZE
        self.xbound = self.model_cfg.XBOUND
        self.ybound = self.model_cfg.YBOUND
        self.zbound = self.model_cfg.ZBOUND
        self.dbound = self.model_cfg.DBOUND
        downsample = self.model_cfg.DOWNSAMPLE
        self.C = out_channel
        self.grid = self.create_voxel_grid()  # [D, H, W, 3]
        self.D = self.grid.shape[0]
    def create_voxel_grid(self):
        """
        Create a voxel grid based on image size, feature size, and depth.
        Returns:
            voxel_grid (torch.Tensor): A 3D voxel grid containing the voxel coordinates [D, H, W, 3].
        """
        fZ, fX, fY = self.zbound[2], self.xbound[2], self.ybound[2]
        # Width (X) values: linearly spaced positions along xbound, laid out on the last axis
        # (note: the .view calls below assume xbound[2] == ybound[2])
        xs = torch.linspace(self.xbound[0], self.xbound[1], self.xbound[2], dtype=torch.float).view(1, 1, fY).expand(fZ, fX, fY)
        # Height (Y) values: linearly spaced positions along ybound, laid out on the middle axis
        ys = torch.linspace(self.ybound[0], self.ybound[1], self.ybound[2], dtype=torch.float).view(1, fX, 1).expand(fZ, fX, fY)
        # Depth (Z) values: linearly spaced positions along zbound, laid out on the first axis
        zs = torch.linspace(self.zbound[0], self.zbound[1], self.zbound[2], dtype=torch.float).view(fZ, 1, 1).expand(fZ, fX, fY)
        # Stack the (x, y, z) coordinates of each voxel centre into the last dimension
        voxel_grid = torch.stack((xs, ys, zs), -1)  # Shape: [D, H, W, 3]
        return nn.Parameter(voxel_grid, requires_grad=False)
    def align_grid_with_pt(self, **kwargs):
        points = self.grid
        B, D, W, H = 1, points.size(0), points.size(1), points.size(2)
        # Handle extra rotations (lidar augmentation)
        if "extra_rots" in kwargs:
            extra_rots = kwargs["extra_rots"]
            points = points.view(B, -1, 3)  # Flatten the spatial dimensions
            points = torch.bmm(points, extra_rots.transpose(1, 2))  # Batched matrix multiplication
            points = points.view(B, D, W, H, 3)  # Reshape back using the known dimensions
        # Handle extra translations (lidar augmentation)
        if "extra_trans" in kwargs:
            extra_trans = kwargs["extra_trans"]
            points += extra_trans.view(B, 1, 1, 1, 3)  # Broadcast over the spatial dimensions
        # (Optional visualization hook goes here)
        return points
    def get_geometry(self, points_gt, camera2lidar_rots, camera2lidar_trans, intrins, post_rots, post_trans, **kwargs):
        camera2lidar_rots = camera2lidar_rots.to(torch.float)
        camera2lidar_trans = camera2lidar_trans.to(torch.float)
        intrins = intrins.to(torch.float)
        post_rots = post_rots.to(torch.float)
        post_trans = post_trans.to(torch.float)
        B, N, _ = camera2lidar_trans.shape
        points = points_gt  # assumed [B, N, D, H, W, 3] image-plane points (u, v, depth)
        # cam_to_lidar: scale (u, v) by depth, then unproject with K^-1 and the camera->lidar extrinsics
        points = torch.cat((points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], points[:, :, :, :, :, 2:3]), 5)
        combine = camera2lidar_rots.matmul(torch.inverse(intrins))
        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1)
        points += camera2lidar_trans.view(B, N, 1, 1, 1, 3)
        if "extra_rots" in kwargs:
            extra_rots = kwargs["extra_rots"]
            points = extra_rots.view(B, 1, 1, 1, 1, 3, 3).repeat(1, N, 1, 1, 1, 1, 1) \
                .matmul(points.unsqueeze(-1)).squeeze(-1)
        if "extra_trans" in kwargs:
            extra_trans = kwargs["extra_trans"]
            points += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(1, N, 1, 1, 1, 1)
        return points
    def forward(self, batch_dict):
        img_org = batch_dict['camera_imgs']
        x = batch_dict['image_fpn']
        x = x[0]
        BN, C, H, W = x.size()
        img = x.view(int(BN / 6), 6, C, H, W)  # [B, N, C, H, W]
        points = batch_dict['points']  # only used for visualisation
        camera_intrinsics = batch_dict['camera_intrinsics']  # intrinsics
        camera2lidar = batch_dict['camera2lidar']  # extrinsics
        img_aug_matrix = batch_dict['img_aug_matrix']
        lidar_aug_matrix = batch_dict['lidar_aug_matrix']
        lidar2image = batch_dict['lidar2image']  # extrinsics and intrinsics
        lidar2camera = batch_dict['lidar2camera']  # extrinsics only
        intrins = camera_intrinsics[..., :3, :3]
        post_rots = img_aug_matrix[..., :3, :3]
        post_trans = img_aug_matrix[..., :3, 3]
        camera2lidar_rots = camera2lidar[..., :3, :3]
        camera2lidar_trans = camera2lidar[..., :3, 3]
        extra_rots = lidar_aug_matrix[..., :3, :3]
        extra_trans = lidar_aug_matrix[..., :3, 3]
        batch_size = BN // 6
        all_projected_points = []
        TransMode = 'lidar2image'  # or 'lidar2camera'
        N = 6
        plot_project = False
        if plot_project and N == 6:
            fig, axs = plt.subplots(2, 3, figsize=(35, 20))
            # fig2, axs2 = plt.subplots(2, 3, figsize=(35, 20))
        for b in range(batch_size):
            # Unlike DepthLSS we do not undo the image augmentation here,
            # because the voxel grid is already defined in world (lidar) coordinates.
            D, H, W, _ = self.grid.shape
            cur_img_aug_matrix = img_aug_matrix[b]
            cur_lidar_aug_matrix = lidar_aug_matrix[b]
            cur_lidar2image = lidar2image[b]
            cur_lidar2camera = lidar2camera[b]

            # ----- lidar points (visualisation only) -----
            batch_mask = points[:, 0] == b
            cur_coords = points[batch_mask][:, 1:4]
            cur_coords_before = cur_coords.clone()  # ground truth / start vector

            # ----- voxel grid, aligned with the lidar augmentation -----
            grid_transformed = self.align_grid_with_pt(extra_rots=extra_rots, extra_trans=extra_trans)
            cur_coords_voxel = grid_transformed.view(D * H * W, 3)  # [M = D*H*W, 3], here 163840

            def inverse_aug(cur_coords_c, TransformationMode="lidar2camera"):
                # Undo the lidar augmentation, then apply the requested transform.
                cur_coords_c -= cur_lidar_aug_matrix[:3, 3]
                cur_coords_c = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
                    cur_coords_c.transpose(1, 0)
                )
                if TransformationMode == "lidar2camera":
                    cur_coords_c = cur_lidar2camera[:, :3, :3].matmul(cur_coords_c)
                    cur_coords_c += cur_lidar2camera[:, :3, 3].reshape(-1, 3, 1)
                elif TransformationMode == "lidar2image":
                    cur_coords_c = cur_lidar2image[:, :3, :3].matmul(cur_coords_c)
                    cur_coords_c += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
                return cur_coords_c

            cur_coords_before = inverse_aug(cur_coords_before, "None")  # original points
            # cur_coords = inverse_aug(cur_coords, TransMode)  # point-based transformation
            cur_coords = inverse_aug(cur_coords_voxel, TransMode)  # voxel-based transformation, [N, 3, M]

            # visualize_lidar_to_camera(cur_coords_before.permute(1, 0), cur_coords[0, :, :].permute(1, 0), cur_coords_voxel[0, :, :].permute(1, 0))

            # Keep only points in front of the cameras
            depth = cur_coords[:, 2, :]  # Z-coordinates in camera space
            valid_depth_mask = depth >= 0.1  # [N, M]
            cur_coords = cur_coords[:, :, valid_depth_mask.any(axis=0)]  # [N, 3, M']

            cur_coords[:, 2, :] = torch.clamp(cur_coords[:, 2, :], 1e-5, 1e5)
            cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]

            # Apply the image augmentation
            cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
            cur_coords = cur_coords[:, :2, :].transpose(1, 2)
            # Reorder to (row, column) = (y, x)
            cur_coords = cur_coords[..., [1, 0]]
            camera_view = img_org[b, :, :, :, :].permute(0, 2, 3, 1).cpu().detach().numpy()
            camera_view_tensor = img_org[b, :, :, :, :].permute(0, 2, 3, 1)

            # Keep only points that fall inside the images
            N, H_img, W_img, C = camera_view_tensor.shape
            on_img = (
                (cur_coords[..., 0] < self.image_size[0])
                & (cur_coords[..., 0] >= 0)
                & (cur_coords[..., 1] < self.image_size[1])
                & (cur_coords[..., 1] >= 0)
            )
            valid_points = cur_coords[:, on_img[1]]  # mask of camera 1 applied to all cameras

            # Normalize the valid points to [-1, 1] for grid_sample
            normalized_points = torch.zeros_like(valid_points)
            normalized_points[:, :, 0] = 2.0 * (valid_points[:, :, 0] / (H_img - 1)) - 1.0  # y-coordinates
            normalized_points[:, :, 1] = 2.0 * (valid_points[:, :, 1] / (W_img - 1)) - 1.0  # x-coordinates; [N, M', 2]
            grid = normalized_points.unsqueeze(1).cuda()  # [N, 1, M', 2] = [N, H_out, W_out, 2]

            features_list = []
            for i in range(0, N):
                img_s = camera_view_tensor[i].unsqueeze(0).permute(0, 3, 1, 2)  # [1, C, H_in, W_in]
                grid_s = grid[i].unsqueeze(0)  # [1, 1, M', 2]
                features_points = F.grid_sample(img_s, grid_s, mode='bilinear',
                                                align_corners=False)  # [1, C, 1, M']
                # Sampled features together with their (normalized) image locations
                features_with_location = torch.cat([features_points, grid_s.permute(0, 3, 1, 2)], dim=1)  # [1, C + 2, 1, M']
                features_list.append(features_points)
            features_points = torch.stack(features_list, dim=1)  # [1, N = 6, C = 3, 1, M'], here M' = 22965
Goal:
Mapping features to voxels: what steps are needed to place the extracted features at the correct locations in the voxel grid?
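To make the question concrete, this is the kind of mapping I have in mind, written as a minimal hypothetical sketch: the names (voxel_idx, voxel_features) and the random masks are mine, standing in for the depth and in-image masks computed in forward().

Code:
import torch

# Toy sizes; in the real code D, H, W come from self.grid and C from the image features.
D, H, W, C, N = 4, 8, 8, 3, 6
M = D * H * W

# Flat index of every voxel, carried through the same masks used in forward().
voxel_idx = torch.arange(M)                        # [M]
valid_depth_mask = torch.rand(N, M) > 0.3          # stand-in for (depth >= 0.1), [N, M]
keep = valid_depth_mask.any(dim=0)
voxel_idx = voxel_idx[keep]                        # [M'] voxels kept after the depth filter

on_img = torch.rand(N, voxel_idx.numel()) > 0.3    # stand-in for the in-image mask, [N, M']
cam = 1                                            # the camera whose mask selects valid_points
voxel_idx = voxel_idx[on_img[cam]]                 # [M''] voxels that project into the image

# Features sampled for that camera, e.g. features_points[0, cam, :, 0, :] -> [C, M'']
sampled = torch.rand(C, voxel_idx.numel())

# Scatter the sampled features back to their voxels.
voxel_features = torch.zeros(M, C)
voxel_features[voxel_idx] = sampled.t()            # each column goes back to its original voxel
voxel_features = voxel_features.view(D, H, W, C)   # dense voxel volume [D, H, W, C]

I am not sure whether tracking flat voxel indices through the masks like this is the right approach, or how it should be handled when the per-camera masks differ.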
More details here:
https://stackoverflow.com/questions/79022319/map-bilinear-sampled-features-to-voxel-back