nvidia-cosmos · atmguille · Mar 2, 2025 · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025
diff --git a/assets/example1_depth.mp4 b/assets/example1_depth.mp4
diff --git a/assets/example1_edge.mp4 b/assets/example1_edge.mp4
diff --git a/assets/example1_input_video.mp4 b/assets/example1_input_video.mp4
diff --git a/assets/example1_seg.mp4 b/assets/example1_seg.mp4
diff --git a/assets/example1_single_control_edge.mp4 b/assets/example1_single_control_edge.mp4
diff --git a/assets/example1_single_control_edge_prompt_upsampler_result.mp4 b/assets/example1_single_control_edge_prompt_upsampler_result.mp4
diff --git a/assets/example1_spatiotemporal_weights.mp4 b/assets/example1_spatiotemporal_weights.mp4
diff --git a/assets/example1_spatiotemporal_weights_mask.mp4 b/assets/example1_spatiotemporal_weights_mask.mp4
diff --git a/assets/example1_uniform_weights.mp4 b/assets/example1_uniform_weights.mp4
diff --git a/assets/example1_vis.mp4 b/assets/example1_vis.mp4
diff --git a/assets/inference_depth_output.mp4 b/assets/inference_depth_output.mp4
diff --git a/assets/inference_keypoint_input_video.mp4 b/assets/inference_keypoint_input_video.mp4
diff --git a/assets/inference_keypoint_output.mp4 b/assets/inference_keypoint_output.mp4
diff --git a/assets/inference_upscaler_input_video.mp4 b/assets/inference_upscaler_input_video.mp4
diff --git a/assets/inference_upscaler_output.mp4 b/assets/inference_upscaler_output.mp4
diff --git a/assets/robot_sample_input.mp4 b/assets/robot_sample_input.mp4
diff --git a/assets/robot_sample_output.mp4 b/assets/robot_sample_output.mp4
diff --git a/assets/robot_sample_seg.mp4 b/assets/robot_sample_seg.mp4
diff --git a/assets/sample_av_multi_control_input_hdmap.mp4 b/assets/sample_av_multi_control_input_hdmap.mp4
diff --git a/assets/sample_av_multi_control_input_lidar.mp4 b/assets/sample_av_multi_control_input_lidar.mp4
diff --git a/assets/sample_av_multi_control_input_video.mp4 b/assets/sample_av_multi_control_input_video.mp4
diff --git a/assets/sample_av_multi_control_output.mp4 b/assets/sample_av_multi_control_output.mp4
diff --git a/assets/sample_av_multi_control_spec_with_input_video.json b/assets/sample_av_multi_control_spec_with_input_video.json
@@ -0,0 +1,11 @@
+{
+    "input_video_path" : "assets/sample_av_multi_control_input_video.mp4",
+    "hdmap": {
+        "control_weight": 0.3,
+        "input_control": "assets/sample_av_multi_control_input_hdmap.mp4"
+    },
+    "lidar": {
+        "control_weight": 0.7,
+        "input_control": "assets/sample_av_multi_control_input_lidar.mp4"
+    }
+}
diff --git a/cosmos_transfer1/auxiliary/depth_anything/assets/input_video.mp4 b/cosmos_transfer1/auxiliary/depth_anything/assets/input_video.mp4
diff --git a/cosmos_transfer1/auxiliary/sam2/assets/input_video.mp4 b/cosmos_transfer1/auxiliary/sam2/assets/input_video.mp4
diff --git a/cosmos_transfer1/auxiliary/tokenizer/test_data/video.mp4 b/cosmos_transfer1/auxiliary/tokenizer/test_data/video.mp4
diff --git a/cosmos_transfer1/diffusion/datasets/augmentors/control_input.py b/cosmos_transfer1/diffusion/datasets/augmentors/control_input.py
@@ -535,8 +535,16 @@ def __call__(self, data_dict: dict) -> dict:
         if "control_input_edge" in data_dict:
             # already processed
             return data_dict
-        key_img = self.input_keys[1]
+
+        key_img = self.input_keys[1]  # typically 'video'
         key_out = self.output_keys[0]
+
+        # In some situations (e.g. warm-up frames) the caller may not provide
+        # RGB frames.  In that case we simply skip edge computation and leave
+        # the dict unchanged so the pipeline can proceed without this hint.
+        if key_img not in data_dict:
+            return data_dict
+
         frames = data_dict[key_img]
         # Get lower and upper threshold for canny edge detection.
         if self.use_random:  # always on for training, always off for inference
@@ -556,6 +564,11 @@ def __call__(self, data_dict: dict) -> dict:
                 t_lower, t_upper = 300, 400
             else:
                 raise ValueError(f"Preset {self.preset_strength} not recognized.")
+
+        # If frames is a torch tensor (potentially on GPU), move to CPU and convert
+        # to numpy so that subsequent OpenCV operations work correctly.
+        if torch.is_tensor(frames):
+            frames = frames.detach().cpu().numpy()
         frames = np.array(frames)
         is_image = len(frames.shape) < 4
 
@@ -571,6 +584,38 @@ def __call__(self, data_dict: dict) -> dict:
         edge_maps = torch.from_numpy(edge_maps).expand(3, -1, -1, -1)
         if is_image:
             edge_maps = edge_maps[:, 0]
+
+        # ------------------------------------------------------------------
+        # DEBUG: Save a side-by-side sample (RGB | edges) every time an edge map
+        #        is computed (the `if True:` guard below never skips).  This helps
+        #        verify the edge input looks sensible in the regular pipeline.
+        # ------------------------------------------------------------------
+        try:
+            if True:
+                import os, uuid
+
+                if is_image:
+                    rgb_frame = frames  # HWC uint8
+                    edge_vis = edge_maps[0].numpy()  # HxW uint8
+                else:
+                    # Take first temporal slice
+                    rgb_frame = frames[:, 0].transpose(1, 2, 0)  # HWC
+                    edge_vis = edge_maps[0, 0].numpy()  # HxW
+
+                edge_vis_rgb = cv2.cvtColor(edge_vis, cv2.COLOR_GRAY2BGR)
+                rgb_frame_bgr = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
+                canvas = np.concatenate([rgb_frame_bgr, edge_vis_rgb], axis=1)
+
+                out_dir = "/home/lab/mapodaca/cosmos-transfer1-github/edge_debug/"
+                os.makedirs(out_dir, exist_ok=True)
+                fname = os.path.join(out_dir, f"edge_debug_{uuid.uuid4().hex[:8]}.png")
+                cv2.imwrite(fname, canvas)
+                log.info(f"Saved edge debug frame to {fname}")
+                self._debug_saved = True
+        except Exception as _e:
+            # Don't crash the pipeline if debug save fails
+            log.warning(f"Edge debug save failed: {_e}")
+
         data_dict[key_out] = edge_maps
         return data_dict
 

diff --git a/cosmos_transfer1/diffusion/diffusion/modules/res_sampler.py b/cosmos_transfer1/diffusion/diffusion/modules/res_sampler.py
@@ -147,7 +147,11 @@ def float64_x0_fn(x_B_StateShape: torch.Tensor, t_B: torch.Tensor) -> torch.Tens
         timestamps_cfg = SolverTimestampConfig(nfe=num_steps, t_min=sigma_min, t_max=sigma_max, order=rho)
         sampler_cfg = SamplerConfig(solver=solver_cfg, timestamps=timestamps_cfg, sample_clean=True)
 
-        return self._forward_impl(float64_x0_fn, x_sigma_max, sampler_cfg).to(in_dtype)
+        output, intermediates = self._forward_impl(float64_x0_fn, x_sigma_max, sampler_cfg)
+        intermediate_outputs = []
+        for intermediate in intermediates:
+            intermediate_outputs.append(intermediate.to(in_dtype))
+        return output.to(in_dtype), intermediate_outputs
 
     @torch.no_grad()
     def _forward_impl(
@@ -156,7 +160,7 @@ def _forward_impl(
         noisy_input_B_StateShape: torch.Tensor,
         sampler_cfg: Optional[SamplerConfig] = None,
         callback_fns: Optional[List[Callable]] = None,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
         """
         Internal implementation of the forward pass.
 
@@ -177,7 +181,7 @@ def _forward_impl(
             sampler_cfg.timestamps.t_min, sampler_cfg.timestamps.t_max, num_timestamps, sampler_cfg.timestamps.order
         ).to(noisy_input_B_StateShape.device)
 
-        denoised_output = differential_equation_solver(
+        denoised_output, intermediates = differential_equation_solver(
             denoiser_fn, sigmas_L, sampler_cfg.solver, callback_fns=callback_fns
         )(noisy_input_B_StateShape)
 
@@ -186,7 +190,7 @@ def _forward_impl(
             ones = torch.ones(denoised_output.size(0), device=denoised_output.device, dtype=denoised_output.dtype)
             denoised_output = denoiser_fn(denoised_output, sigmas_L[-1] * ones)
 
-        return denoised_output
+        return denoised_output, intermediates
 
 
 def fori_loop(lower: int, upper: int, body_fun: Callable[[int, Any], Any], init_val: Any) -> Any:
@@ -203,9 +207,12 @@ def fori_loop(lower: int, upper: int, body_fun: Callable[[int, Any], Any], init_
         The final result after all iterations.
     """
     val = init_val
+    intermediates = []
     for i in range(lower, upper):
         val = body_fun(i, val)
-    return val
+        intermediates.append(val[0])
+
+    return val[0], intermediates
 
 
 def differential_equation_solver(
@@ -277,7 +284,7 @@ def step_fn(
 
             return output_x_B_StateShape, x0_preds
 
-        x_at_eps, _ = fori_loop(0, num_step, step_fn, [input_xT_B_StateShape, None])
-        return x_at_eps
+        x_at_eps, intermediates = fori_loop(0, num_step, step_fn, [input_xT_B_StateShape, None])
+        return x_at_eps, intermediates
 
     return sample_fn