【BEV 视图变换】Fast-Ray(2): 代码复现+画图解释基于查找表LUT、多视角到单个三维体素转换

paper：Fast-BEV: A Fast and Strong Bird’s-Eye View Perception Baseline
code：https://github.com/Sense-GVT/Fast-BEV
致谢: 感谢我司傅同学提供的复现源码

一、完整复现代码(可一键运行)和效果图

Fast-Ray pipeline：
1.创建uv coord + semantic channels (2d image + 语义通道64)
2.构建离线LUT表(voxel coord -> uv coord, 3d->2d)（基于深度均匀假设，没做深度估计，多个3d点对应一个2d图像点）
(1) 逆体素化过程：根据世界范围划分和刻度实现 voxel coord -> world coord
(2)逆视锥化过程：使用投影矩阵(相机内外参)实现 world coord -> uv coord
(3)构建LUT表
3.对LUT表进行在线查询(uv coord -> voxel coord, 2d->3d)
4.pool(voxel coord -> bev coord)(去掉Z轴)

在这里插入图片描述

import numpy as np
import matplotlib.pyplot as plt# 查找表的特点：# 1.它不依赖于与数据相关的深度信息，因为摄像头位置和它们的内外参参数在感知系统构建时就已确定，并且对于每次输入都是相同的。# 2.因此，无需为每次输入重新计算投影索引，而是可以预先计算固定的投影索引并将其作为静态查找表存储起来。# 3.在推理（inference）过程中，可以通过查询这个查找表来获得投影索引，这是一个低成本的操作。# 4.无论是处理单帧还是多帧图像，都可以轻松预先计算相机的内外参数，并根据这些参数预先对齐到当前帧。
def build_LUT_CPU(n_voxels, voxel_size, orgin, projection, n_images, height, width):
    """Build a static look-up table (LUT) mapping each voxel to one image pixel.

    Every voxel is converted to a world coordinate, projected into each
    camera in turn, and the first camera whose image contains the projection
    (at positive depth) is recorded. No depth estimation is involved, so the
    table depends only on the calibration and can be computed offline.

    Args:
        n_voxels: Number of voxels along (x, y, z), e.g. [100, 100, 4].
        voxel_size: Voxel edge lengths along (x, y, z), e.g. [0.5, 0.5, 1.5].
        orgin: World coordinate that voxel index n/2 maps to on each axis,
            e.g. [0, 0, 0].
        projection: Camera projection matrices; projection[img_id] is (3, 4).
        n_images: Number of cameras / images.
        height: Image (feature map) height in pixels.
        width: Image (feature map) width in pixels.

    Returns:
        LUT: int32 array of shape (nx * ny * nz, 2). Row i is
            [img_id, v * width + u] for voxel i, or [-1, -1] when no camera
            sees the voxel. Voxels are flattened with x varying fastest,
            then y, then z.
        valid: int32 array of shape (nx * ny * nz,); 1 where the voxel has a
            LUT entry, 0 otherwise.
    """
    n_x_voxels, n_y_voxels, n_z_voxels = n_voxels
    size_x, size_y, size_z = voxel_size
    orgin_x, orgin_y, orgin_z = orgin

    nrof_voxels = n_x_voxels * n_y_voxels * n_z_voxels
    LUT = np.full((nrof_voxels, 2), -1, dtype=np.int32)
    valid = np.zeros(nrof_voxels, dtype=np.int32)

    offset = 0  # flat voxel index into LUT / valid
    for zi in range(n_z_voxels):
        for yi in range(n_y_voxels):
            for xi in range(n_x_voxels):
                # Inverse voxelisation: voxel index -> homogeneous world point.
                # The point does not depend on the camera, so build it once.
                pt = np.array(
                    [
                        (xi - n_x_voxels / 2) * size_x + orgin_x,
                        (yi - n_y_voxels / 2) * size_y + orgin_y,
                        (zi - n_z_voxels / 2) * size_z + orgin_z,
                        1,  # homogeneous coordinate
                    ],
                )
                for img_id in range(n_images):
                    # Inverse frustum step: world -> image plane, (3, 4) @ (4,).
                    ar = projection[img_id] @ pt
                    # Read the depth BEFORE the perspective divide; after the
                    # divide the third component is always 1 and would no
                    # longer reject points behind the camera.
                    d = ar[2]
                    if not d > 0:
                        # Behind the camera, on the camera plane, or NaN.
                        continue
                    u = int(round(ar[0] / d))
                    v = int(round(ar[1] / d))
                    if 0 <= u < width and 0 <= v < height:
                        # Store the camera id and the flattened pixel index.
                        LUT[offset] = [img_id, v * width + u]
                        valid[offset] = 1
                        # A voxel may be visible in several images; no fusion
                        # is done, the first camera that sees it wins.
                        break
                offset += 1
    return LUT, valid


# features + LUT -> volume (uv space + LUT -> voxel space)
def backproject_LUT_CPU(features, LUT, volume, n_voxels):
    """Copy image features into the voxel volume using a prebuilt LUT.

    Args:
        features: Image features of shape (n_images, height, width, n_channels).
        LUT: Look-up table of shape (nx * ny * nz, 2); each row is
            [img_id, v * width + u], with img_id < 0 meaning "no pixel".
        volume: Output buffer of shape (nx * ny * nz, n_channels); it is
            written in place.
        n_voxels: Number of voxels along (x, y, z), e.g. [100, 100, 4].

    Returns:
        The same ``volume`` array after the in-place update.
    """
    n_x_voxels, n_y_voxels, n_z_voxels = n_voxels
    nrof_voxels = n_x_voxels * n_y_voxels * n_z_voxels

    # The LUT stores flat pixel indices as v * width + u, so decoding needs
    # the feature-map width. Take it from ``features`` instead of relying on
    # a module-level ``width`` global that only exists when run as a script.
    # NOTE(review): assumes the LUT was built for this feature-map width.
    width = features.shape[2]

    cnt = 0
    for voxel_id in range(nrof_voxels):
        img_id = LUT[voxel_id, 0]  # which image this voxel samples from
        pixel_index = LUT[voxel_id, 1]  # flattened pixel index v * width + u
        if img_id >= 0:
            row = pixel_index // width  # v
            col = pixel_index % width  # u
            # Copy the full channel vector of that pixel into the voxel.
            volume[voxel_id] = features[img_id, row, col]
            # NOTE(review): the copied feature is halved, as in the original
            # code; the reason is not evident from this file - confirm intent.
            volume[voxel_id] /= 2
            cnt += 1
    print("repeat cnt:", cnt)
    return volume


def plot_uvd_frustum(frustum):
    """Show a 3D scatter of the first three entries along the last axis.

    Entries 0, 1 and 2 of the last axis are used as the x, y and z
    coordinates of each point; the third one also sets the colour.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    x = frustum[..., 0].flatten()
    y = frustum[..., 1].flatten()
    d = frustum[..., 2].flatten()

    ax.scatter(x, y, d, c=d, cmap='viridis', marker='o')
    ax.set_xlabel('u')
    ax.set_ylabel('v')
    ax.set_zlabel('semantic channels')
    plt.show()


def plot_voxel(voxel_feat):
    """Show a 3D scatter of a voxel grid, coloured by the value of each cell."""
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # One point per grid cell, placed at its integer index.
    x, y, z = np.indices(voxel_feat.shape)
    ax.scatter(
        x.flatten(),
        y.flatten(),
        z.flatten(),
        c=voxel_feat.flatten(),
        cmap='viridis',
    )
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()


def plot_bev(bev_feat):
    """Show a 2D scatter of a BEV grid, coloured by the value of each cell."""
    fig = plt.figure()
    ax = fig.add_subplot()

    # One point per grid cell, placed at its integer index.
    x, y = np.indices(bev_feat.shape)
    ax.scatter(x.flatten(), y.flatten(), c=bev_feat.flatten(), cmap='viridis')
    ax.set_xlabel('u')
    ax.set_ylabel('v')
    plt.show()


# Fast-Ray: LUT-based transform from multiple camera views to a single 3D voxel volume
if __name__ == "__main__":
    # Voxel grid configuration.
    n_voxels = [100, 100, 4]
    voxel_size = [0.5, 0.5, 1.5]
    orgin = [0, 0, 0]

    # Camera / feature-map configuration. These stay module-level names on
    # purpose: the demo functions may look some of them up as globals.
    n_images = 6
    height = 232
    width = 400
    n_channels = 64

    # Random stand-in for the real camera projection matrices.
    projection = np.random.rand(n_images, 3, 4)

    # Offline step: LUT is (40000, 2), valid is (40000,).
    LUT, valid = build_LUT_CPU(n_voxels, voxel_size, orgin, projection, n_images, height, width)

    # Random stand-in for the image features: (n_images, height, width, 64).
    features = np.random.rand(n_images, height, width, n_channels)
    plot_uvd_frustum(features[0])

    # Flat voxel volume: (40000, 64).
    volume = np.zeros((n_voxels[0] * n_voxels[1] * n_voxels[2], n_channels), dtype=np.float32)

    # Online step: gather image features into the voxel volume.
    voxel_feat = backproject_LUT_CPU(features, LUT, volume, n_voxels)

    # build_LUT_CPU enumerates voxels with z outermost and x innermost, so the
    # flat axis unpacks as (z, y, x); reorder to (x, y, z, channels) afterwards.
    voxel_feat = voxel_feat.reshape(n_voxels[2], n_voxels[1], n_voxels[0], n_channels)
    voxel_feat = voxel_feat.transpose(2, 1, 0, 3)

    print('volume shape:', volume.shape)  # (40000, 64)
    print('volume:', volume)
    print('dst:', voxel_feat.shape)  # (100, 100, 4, 64)

    # Collapse the channel axis for plotting: (100, 100, 4, 64) -> (100, 100, 4).
    voxel_feat = voxel_feat.sum(axis=3)
    plot_voxel(voxel_feat)

    # Pool away the z axis: (100, 100, 4) -> (100, 100).
    bev_feat = voxel_feat.sum(axis=2)
    plot_bev(bev_feat)

二、逐步代码讲解+图解

完整流程：
1.创建uv coord + semantic channels (2d image + 语义通道64)
2.构建离线LUT表(voxel coord -> uv coord, 3d->2d)
(1) 逆体素化过程：根据世界范围划分和刻度实现 voxel coord -> world coord
(2)逆视锥化过程：使用投影矩阵(相机内外参)实现 world coord -> uv coord
(3)构建LUT表
3.对LUT表进行在线查询(uv coord -> voxel coord, 2d->3d)
4.pool(voxel coord -> bev coord)(去掉Z轴)

1.创建uv coord+ semantic channels (2d image + 语义通道64)

    # Create (random) image features: (n_images, height, width, n_channels).
    features = np.random.rand(n_images, height, width, n_channels)
    plot_uvd_frustum(features[0])

在这里插入图片描述

注意坐标范围，u,v范围代表模型输入尺寸，semantic channels为64。

2.构建离线LUT表(voxel coord -> uv coord)

# 查找表的特点：# 1.它不依赖于与数据相关的深度信息，因为摄像头位置和它们的内外参参数在感知系统构建时就已确定，并且对于每次输入都是相同的。# 2.因此，无需为每次输入重新计算投影索引，而是可以预先计算固定的投影索引并将其作为静态查找表存储起来。# 3.在推理（inference）过程中，可以通过查询这个查找表来获得投影索引，这是一个低成本的操作。# 4.无论是处理单帧还是多帧图像，都可以轻松预先计算相机的内外参数，并根据这些参数预先对齐到当前帧。
def build_LUT_CPU(n_voxels, voxel_size, orgin, projection, n_images, height, width):
    """Build a static look-up table (LUT) mapping each voxel to one image pixel.

    Every voxel is converted to a world coordinate, projected into each
    camera in turn, and the first camera whose image contains the projection
    (at positive depth) is recorded. No depth estimation is involved, so the
    table depends only on the calibration and can be computed offline.

    Args:
        n_voxels: Number of voxels along (x, y, z), e.g. [100, 100, 4].
        voxel_size: Voxel edge lengths along (x, y, z), e.g. [0.5, 0.5, 1.5].
        orgin: World coordinate that voxel index n/2 maps to on each axis,
            e.g. [0, 0, 0].
        projection: Camera projection matrices; projection[img_id] is (3, 4).
        n_images: Number of cameras / images.
        height: Image (feature map) height in pixels.
        width: Image (feature map) width in pixels.

    Returns:
        LUT: int32 array of shape (nx * ny * nz, 2). Row i is
            [img_id, v * width + u] for voxel i, or [-1, -1] when no camera
            sees the voxel. Voxels are flattened with x varying fastest,
            then y, then z.
        valid: int32 array of shape (nx * ny * nz,); 1 where the voxel has a
            LUT entry, 0 otherwise.
    """
    n_x_voxels, n_y_voxels, n_z_voxels = n_voxels
    size_x, size_y, size_z = voxel_size
    orgin_x, orgin_y, orgin_z = orgin

    nrof_voxels = n_x_voxels * n_y_voxels * n_z_voxels
    LUT = np.full((nrof_voxels, 2), -1, dtype=np.int32)
    valid = np.zeros(nrof_voxels, dtype=np.int32)

    offset = 0  # flat voxel index into LUT / valid
    for zi in range(n_z_voxels):
        for yi in range(n_y_voxels):
            for xi in range(n_x_voxels):
                # Inverse voxelisation: voxel index -> homogeneous world point.
                # The point does not depend on the camera, so build it once.
                pt = np.array(
                    [
                        (xi - n_x_voxels / 2) * size_x + orgin_x,
                        (yi - n_y_voxels / 2) * size_y + orgin_y,
                        (zi - n_z_voxels / 2) * size_z + orgin_z,
                        1,  # homogeneous coordinate
                    ],
                )
                for img_id in range(n_images):
                    # Inverse frustum step: world -> image plane, (3, 4) @ (4,).
                    ar = projection[img_id] @ pt
                    # Read the depth BEFORE the perspective divide; after the
                    # divide the third component is always 1 and would no
                    # longer reject points behind the camera.
                    d = ar[2]
                    if not d > 0:
                        # Behind the camera, on the camera plane, or NaN.
                        continue
                    u = int(round(ar[0] / d))
                    v = int(round(ar[1] / d))
                    if 0 <= u < width and 0 <= v < height:
                        # Store the camera id and the flattened pixel index.
                        LUT[offset] = [img_id, v * width + u]
                        valid[offset] = 1
                        # A voxel may be visible in several images; no fusion
                        # is done, the first camera that sees it wins.
                        break
                offset += 1
    return LUT, valid

（1）逆体素化过程：根据世界范围划分和刻度实现 voxel coord -> world coord

# Inverse voxelisation: shift the voxel index so the grid is centred, scale it
# by the voxel size and add the BEV origin to get the world coordinate.
pt = np.array(
    [
        (xi - n_x_voxels / 2) * size_x + orgin_x,
        (yi - n_y_voxels / 2) * size_y + orgin_y,
        (zi - n_z_voxels / 2) * size_z + orgin_z,
        1,  # homogeneous coordinate
    ],
)

（2）逆视锥化过程：使用投影矩阵(相机内外参)实现 world coord -> uv coord

# Inverse frustum step: use the projection matrix (camera intrinsics and
# extrinsics) to map the point from world coord -> uv coord.
ar = projection[img_id] @ pt  # (3, 4) @ (4,)
# Read the depth before the perspective divide; after the divide the third
# component is always 1 and a later `d > 0` test could never fail.
d = ar[2]
ar = ar[:3] / ar[2]  # perspective divide -> (u, v, 1)
u = int(round(ar[0]))  # pixel column, checked against the image bounds later
v = int(round(ar[1]))  # pixel row

（3）构建LUT表

                    # 建立查找表if(0 <= u <width and 0 <= v <height and d>0):# LUT(look-up table)中第offset的值 代表 img中(u,v)点LUT[offset] = [img_id, v*width + u] # 2valid[offset] = 1count += 1break# 一个voxel对应的多个image的uv,这里没有处理，直接覆盖了offset += 1

基于深度均匀假设：没有做深度估计，同一条相机射线上的多个3D点（体素）对应同一个2D图像点。
在这里插入图片描述

3.对LUT表进行在线查询(uv coord -> voxel coord, 2d->3d)

在这里插入图片描述

def backproject_LUT_CPU(features, LUT, volume, n_voxels):
    """Copy image features into the voxel volume using a prebuilt LUT.

    Args:
        features: Image features of shape (n_images, height, width, n_channels).
        LUT: Look-up table of shape (nx * ny * nz, 2); each row is
            [img_id, v * width + u], with img_id < 0 meaning "no pixel".
        volume: Output buffer of shape (nx * ny * nz, n_channels); it is
            written in place.
        n_voxels: Number of voxels along (x, y, z), e.g. [100, 100, 4].

    Returns:
        The same ``volume`` array after the in-place update.
    """
    n_x_voxels, n_y_voxels, n_z_voxels = n_voxels
    nrof_voxels = n_x_voxels * n_y_voxels * n_z_voxels

    # The LUT stores flat pixel indices as v * width + u, so decoding needs
    # the feature-map width. Take it from ``features`` instead of relying on
    # a module-level ``width`` global that only exists when run as a script.
    # NOTE(review): assumes the LUT was built for this feature-map width.
    width = features.shape[2]

    cnt = 0
    for voxel_id in range(nrof_voxels):
        img_id = LUT[voxel_id, 0]  # which image this voxel samples from
        target = LUT[voxel_id, 1]  # flattened pixel index v * width + u
        if img_id >= 0:
            row = target // width  # v
            col = target % width  # u
            # Copy the full channel vector of that pixel into the voxel.
            volume[voxel_id] = features[img_id, row, col]
            # NOTE(review): the copied feature is halved, as in the original
            # code; the reason is not evident from this file - confirm intent.
            volume[voxel_id] /= 2
            cnt += 1
    print("repeat cnt:", cnt)
    return volume

4.pool(voxel coord -> bev coord)(去掉Z轴)

# Collapse the channel axis for plotting: [100,100,4,64] -> [100,100,4]
voxel_feat = np.sum(voxel_feat, axis=3)
plot_voxel(voxel_feat)

# Pool away the z axis to get the BEV map: [100,100,4] -> [100,100]
bev_feat = np.sum(voxel_feat, axis=2)
plot_bev(bev_feat)