【计算机视觉】3DGS(3D Gaussian Splatting)代码解读（五）之scene/cameras.py和utils/camera_utils.py

import torch
from torch import nn
import numpy as np
from utils.graphics_utils import getWorld2View2, getProjectionMatrix
from utils.general_utils import PILtoTorch
import cv2


class Camera(nn.Module):
    """One training/evaluation view: image, masks, optional inverse depth and transforms.

    Args:
        resolution: Target ``(width, height)``; forwarded to ``PILtoTorch`` and
            ``cv2.resize``.
        colmap_id: Camera identifier coming from the reconstruction; stored as-is.
        R: Rotation passed to ``getWorld2View2``; stored as-is.
            (presumably a 3x3 numpy array -- confirm against the caller)
        T: Translation passed to ``getWorld2View2``; stored as-is.
        FoVx: Horizontal field of view handed to ``getProjectionMatrix``.
        FoVy: Vertical field of view handed to ``getProjectionMatrix``.
        depth_params: ``None`` or a mapping with ``"scale"``, ``"med_scale"`` and
            ``"offset"`` used to calibrate the inverse depth map.
        image: Source image, converted with ``PILtoTorch``.
        invdepthmap: ``None`` or a numpy inverse-depth array.
        image_name: Name of the image; stored as-is.
        uid: Identifier of this camera; stored as-is.
        trans: Scene translation forwarded to ``getWorld2View2``.
        scale: Scene scale forwarded to ``getWorld2View2``.
        data_device: Device spec for the image, masks and depth tensors.
        train_test_exp: Together with ``is_test_view``, enables zeroing one
            half of the alpha mask.
        is_test_dataset: Selects which half of the alpha mask is zeroed.
        is_test_view: See ``train_test_exp``.

    Note:
        The transform matrices are always moved with ``.cuda()``, independent of
        ``data_device``.
    """

    def __init__(self, resolution, colmap_id, R, T, FoVx, FoVy, depth_params, image, invdepthmap,
                 image_name, uid,
                 trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device="cuda",
                 train_test_exp=False, is_test_dataset=False, is_test_view=False):
        super().__init__()

        # ---- identifiers, pose and intrinsics (stored unchanged) ----
        self.uid = uid
        self.colmap_id = colmap_id
        self.image_name = image_name
        self.R = R
        self.T = T
        self.FoVx = FoVx
        self.FoVy = FoVy

        # ---- storage device, falling back to "cuda" on an invalid spec ----
        try:
            device = torch.device(data_device)
        except Exception as e:
            print(e)
            print(f"[Warning] Custom device {data_device} failed, fallback to default cuda device" )
            device = torch.device("cuda")
        self.data_device = device

        # ---- image and alpha mask ----
        # The tensor is treated as channel-first below: dim 0 is indexed as
        # channels, dims 1 and 2 are read as height and width.
        image_tensor = PILtoTorch(image, resolution)

        if image_tensor.shape[0] == 4:
            # A fourth channel is used directly as the alpha mask.
            alpha = image_tensor[3:4, ...].to(device)
        else:
            # No alpha channel: every pixel counts as valid.
            alpha = torch.ones_like(image_tensor[0:1, ...], device=device)

        if train_test_exp and is_test_view:
            # Zero one half of the mask along the last dimension.
            half = alpha.shape[-1] // 2
            if is_test_dataset:
                alpha[..., :half] = 0
            else:
                alpha[..., half:] = 0
        self.alpha_mask = alpha

        self.original_image = image_tensor[:3, ...].clamp(0.0, 1.0).to(device)
        self.image_height, self.image_width = self.original_image.shape[1:3]

        # ---- optional inverse depth ----
        self.invdepthmap = None
        self.depth_reliable = False
        if invdepthmap is not None:
            depth_mask = torch.ones_like(alpha)
            inv = cv2.resize(invdepthmap, resolution)
            inv[inv < 0] = 0  # negative inverse depth is clamped to zero
            reliable = True

            if depth_params is not None:
                depth_scale = depth_params["scale"]
                median_scale = depth_params["med_scale"]
                below = depth_scale < 0.2 * median_scale
                above = depth_scale > 5 * median_scale
                if below or above:
                    # Scale far from the median: flag the depth as unusable.
                    reliable = False
                    depth_mask *= 0

                if depth_scale > 0:
                    inv = inv * depth_scale + depth_params["offset"]

            if inv.ndim != 2:
                # Keep only the first channel of a multi-channel map.
                inv = inv[..., 0]

            self.depth_mask = depth_mask
            self.depth_reliable = reliable
            # Prepend a leading axis before converting to a tensor.
            self.invdepthmap = torch.from_numpy(inv[None]).to(device)

        # ---- clipping planes and scene-level transform ----
        self.zfar = 100.0
        self.znear = 0.01
        self.trans = trans
        self.scale = scale

        # ---- precomputed matrices (stored transposed) ----
        view = torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda()
        projection = getProjectionMatrix(
            znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
        ).transpose(0, 1).cuda()

        self.world_view_transform = view
        self.projection_matrix = projection
        # Batched matmul on a singleton batch: view x projection.
        self.full_proj_transform = torch.bmm(view.unsqueeze(0), projection.unsqueeze(0)).squeeze(0)
        # Row 3 of the inverted (transposed) view matrix is used as the camera centre.
        self.camera_center = torch.inverse(view)[3, :3]


class MiniCam:
    """Lightweight camera holding precomputed transforms instead of image data."""

    def __init__(self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform):
        self.image_width = width
        self.image_height = height
        self.FoVy = fovy
        self.FoVx = fovx
        self.znear = znear
        self.zfar = zfar
        self.world_view_transform = world_view_transform
        self.full_proj_transform = full_proj_transform
        # Same convention as Camera: row 3 of the inverse view matrix.
        self.camera_center = self.world_view_transform.inverse()[3, :3]

#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import os

from scene.cameras import Camera
import numpy as np
from utils.graphics_utils import fov2focal
from PIL import Image
import cv2

# Set once the "large input image" notice has been printed.
WARNED = False


def _read_invdepth(depth_path, is_nerf_synthetic):
    """Read a depth image from disk and return it as a scaled float32 array.

    ``cv2.imread`` signals failure by returning ``None`` rather than raising, so
    the result is checked explicitly and turned into a proper exception.

    Raises:
        FileNotFoundError: ``depth_path`` does not exist.
        IOError: the file exists but could not be decoded.
    """
    raw = cv2.imread(depth_path, -1)
    if raw is None:
        if not os.path.exists(depth_path):
            raise FileNotFoundError(f"Depth file not found: {depth_path}")
        raise IOError(f"Could not decode depth file: {depth_path}")

    # Synthetic data is divided by 512, everything else by 2**16.
    divisor = 512 if is_nerf_synthetic else float(2**16)
    return raw.astype(np.float32) / divisor


def loadCam(args, id, cam_info, resolution_scale, is_nerf_synthetic, is_test_dataset):
    """Build a ``Camera`` from one camera-info record.

    Opens the image, optionally reads the inverse depth map, derives the target
    resolution from ``args.resolution`` and ``resolution_scale``, and constructs
    the ``Camera``.

    Args:
        args: Configuration object; ``resolution``, ``data_device`` and
            ``train_test_exp`` are read from it.
        id: Index of this camera, passed to ``Camera`` as ``uid``.
        cam_info: Record providing ``image_path``, ``depth_path``, ``uid``, ``R``,
            ``T``, ``FovX``, ``FovY``, ``depth_params``, ``image_name`` and
            ``is_test``.
        resolution_scale: Extra down-scaling factor applied on top of
            ``args.resolution``.
        is_nerf_synthetic: Selects the depth divisor (512 instead of 2**16).
        is_test_dataset: Forwarded to ``Camera``.

    Returns:
        The constructed ``Camera``.

    Raises:
        FileNotFoundError: the depth file does not exist.
        IOError: the depth file exists but cannot be decoded.
    """
    global WARNED

    image = Image.open(cam_info.image_path)

    if cam_info.depth_path != "":
        try:
            invdepthmap = _read_invdepth(cam_info.depth_path, is_nerf_synthetic)
        except FileNotFoundError:
            print(f"Error: The depth file at path '{cam_info.depth_path}' was not found.")
            raise
        except IOError:
            print(f"Error: Unable to open the image file '{cam_info.depth_path}'. It may be corrupted or an unsupported format.")
            raise
        except Exception as e:
            print(f"An unexpected error occurred when trying to read depth at {cam_info.depth_path}: {e}")
            raise
    else:
        invdepthmap = None

    # PIL reports size as (width, height).
    orig_w, orig_h = image.size

    if args.resolution in [1, 2, 4, 8]:
        # Fixed down-sampling factor, combined with the extra scale.
        divisor = resolution_scale * args.resolution
        resolution = round(orig_w / divisor), round(orig_h / divisor)
    else:  # should be a type that converts to float
        if args.resolution == -1:
            # Automatic mode: cap the width at 1600 pixels.
            if orig_w > 1600:
                if not WARNED:
                    print("[ INFO ] Encountered quite large input images (>1.6K pixels width), rescaling to 1.6K.\n "
                          "If this is not desired, please explicitly specify '--resolution/-r' as 1")
                    WARNED = True
                global_down = orig_w / 1600
            else:
                global_down = 1
        else:
            # Any other value is interpreted as the desired width in pixels.
            global_down = orig_w / args.resolution

        scale = float(global_down) * float(resolution_scale)
        resolution = (int(orig_w / scale), int(orig_h / scale))

    return Camera(resolution, colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T,
                  FoVx=cam_info.FovX, FoVy=cam_info.FovY, depth_params=cam_info.depth_params,
                  image=image, invdepthmap=invdepthmap,
                  image_name=cam_info.image_name, uid=id, data_device=args.data_device,
                  train_test_exp=args.train_test_exp, is_test_dataset=is_test_dataset,
                  is_test_view=cam_info.is_test)


def cameraList_from_camInfos(cam_infos, resolution_scale, args, is_nerf_synthetic, is_test_dataset):
    """Convert every record in ``cam_infos`` with ``loadCam``.

    The position of each record in ``cam_infos`` is used as its ``id``.

    Returns:
        A list of ``Camera`` objects in the same order as ``cam_infos``.
    """
    return [
        loadCam(args, index, info, resolution_scale, is_nerf_synthetic, is_test_dataset)
        for index, info in enumerate(cam_infos)
    ]


def camera_to_JSON(id, camera : Camera):
    """Return a JSON-serialisable dict describing ``camera``.

    Args:
        id: Value stored under the ``'id'`` key.
        camera: Object providing ``R``, ``T``, ``image_name``, ``width``,
            ``height``, ``FovX`` and ``FovY``.
            NOTE(review): the annotation says ``Camera``, but these attribute
            names differ from the ones ``scene.cameras.Camera`` sets
            (``image_width``, ``FoVx``, ...); presumably callers pass a
            camera-info record -- confirm against the call sites.

    Returns:
        A dict with keys ``id``, ``img_name``, ``width``, ``height``,
        ``position``, ``rotation``, ``fy`` and ``fx``.
    """
    # Assemble the 4x4 matrix from the transposed R and from T
    # (presumably the world-to-camera transform -- confirm), then invert it.
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = camera.R.transpose()
    Rt[:3, 3] = camera.T
    Rt[3, 3] = 1.0

    inverse_rt = np.linalg.inv(Rt)
    pos = inverse_rt[:3, 3]
    rot = inverse_rt[:3, :3]

    # numpy arrays are not JSON-serialisable; convert to nested lists.
    serializable_array_2d = rot.tolist()

    camera_entry = {
        'id' : id,
        'img_name' : camera.image_name,
        'width' : camera.width,
        'height' : camera.height,
        'position': pos.tolist(),
        'rotation': serializable_array_2d,
        'fy' : fov2focal(camera.FovY, camera.height),
        'fx' : fov2focal(camera.FovX, camera.width)
    }
    return camera_entry

企业官网建设流程全解析

热门文章

文章分类

标签云

需要专业的网站建设服务？

企业官网建设流程全解析

热门文章

文章分类

标签云

相关文章

智慧政务大数据整体解决方案全解析｜架构设计、建设内容、落地实践与价值复盘

0529-鸿蒙技术上课

Cortex-M调试与追踪参数DEBUG_LVL和TRACE_LVL详解

需要专业的网站建设服务？