Merged

26 commits
690b717
add guided generation pipline
guopengf May 8, 2025
b06bf94
h5 batch inference support
guopengf May 8, 2025
59ddcb9
improve format
guopengf May 9, 2025
723be4b
prepare test case
guopengf May 10, 2025
b53778e
lint
guopengf May 10, 2025
dc66ae8
update test case
guopengf May 12, 2025
0941b69
format
guopengf May 12, 2025
1494470
update test cases
guopengf May 14, 2025
f5800f4
add readme
guopengf May 14, 2025
6d50d78
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
guopengf May 14, 2025
1d85d0c
add readme figure
guopengf May 14, 2025
c980304
fix link
guopengf May 14, 2025
c757b45
update dependency install and fix issues
guopengf May 15, 2025
220535b
remve change for install_deps.py
guopengf May 15, 2025
0ebe16f
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
guopengf May 15, 2025
979e0ab
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
KumoLiu May 19, 2025
389d52c
fix minnor comments
guopengf May 20, 2025
c181a9d
Update workflows/robotic_ultrasound/tests/test_simulation/test_integr…
guopengf May 21, 2025
3c299e7
Update workflows/robotic_ultrasound/tests/test_simulation/test_integr…
guopengf May 21, 2025
6976388
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
mingxin-zheng May 22, 2025
164aa5d
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
mingxin-zheng May 23, 2025
67e1e0b
fix
KumoLiu May 23, 2025
01128e3
increase timeout for cosmos-transfer test
KumoLiu May 25, 2025
6efb7c3
Merge remote-tracking branch 'origin/main' into pengfeig/cosmos-trans…
KumoLiu May 25, 2025
3bfc8e8
skip if not enough gpu
KumoLiu May 26, 2025
ad6cb9f
increase pi0 eval timeout
KumoLiu May 27, 2025
Binary file added docs/source/cosmos_transfer_result.png
5 changes: 4 additions & 1 deletion tools/env_setup/install_cosmos_transfer1.sh
@@ -24,7 +24,10 @@ PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. && pwd)"
# Allow setting the python in PYTHON_EXECUTABLE
PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python}

-COSMOS_TRANSFER_DIR=${1:-$$PROJECT_ROOT/third_party/cosmos-transfer1}
+# Install cuDNN
+bash "$PROJECT_ROOT/tools/env_setup/install_cudnn.sh"
+
+COSMOS_TRANSFER_DIR=${1:-$PROJECT_ROOT/third_party/cosmos-transfer1}

if [ -d "$COSMOS_TRANSFER_DIR" ]; then
echo "Cosmos Transfer directory already exists at $COSMOS_TRANSFER_DIR. Skipping clone."
15 changes: 14 additions & 1 deletion tools/run_all_tests.py
@@ -42,7 +42,7 @@ def _run_test_process(cmd, env, test_path):

    try:
        process = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        stdout, stderr = process.communicate(timeout=600)
+        stdout, stderr = process.communicate(timeout=1200)
        # Filter out extension loading messages
        filtered_stdout = "\n".join(
            [line for line in stdout.split("\n") if not ("[ext:" in line and "startup" in line)]
@@ -77,6 +77,17 @@ def _setup_test_env(project_root, tests_dir):
    return env


+def _setup_test_cosmos_transfer1_env(project_root, workflow_root, tests_dir):
+    """Helper function to setup test environment for cosmos-transfer1"""
+    env = _setup_test_env(workflow_root, tests_dir)
+    pythonpath = [
+        os.path.join(project_root, "third_party", "cosmos-transfer1"),
+    ]
+    env["PYTHONPATH"] = ":".join(pythonpath) + ":" + env["PYTHONPATH"]
+    env["DEBUG_GENERATION"] = "1"
+    return env


def run_tests_with_coverage(workflow_name, skip_xvfb):
    """Run all unittest cases with coverage reporting"""
    print(f"Running tests with xvfb skipped: {skip_xvfb}")
@@ -173,6 +184,8 @@ def run_integration_tests(workflow_name):
            "unittest",
            test_path,
        ]
+        if "cosmos_transfer1" in test_path:
+            env = _setup_test_cosmos_transfer1_env(os.getcwd(), project_root, tests_dir)

        if not _run_test_process(cmd, env, test_path):
            all_tests_passed = False
74 changes: 74 additions & 0 deletions workflows/robotic_ultrasound/scripts/simulation/README.md
@@ -7,6 +7,7 @@
- [PI Zero Policy Evaluation](#pi-zero-policy-evaluation)
- [Policy Evaluation w/ DDS](#policy-evaluation-w-dds)
- [Liver Scan State Machine](#liver-scan-state-machine)
- [Cosmos-transfer1 Integration](#cosmos-transfer1-integration)
- [Teleoperation](#teleoperation)
- [Ultrasound Raytracing Simulation](#ultrasound-raytracing-simulation)

@@ -180,6 +181,79 @@ Replace `/path/to/your/hdf5_data_directory` with the actual path to the directory

> **Note:** Additional common Isaac Lab arguments (like `--device`) can also be used.


### Cosmos-transfer1 Integration

[Cosmos-Transfer1](https://github.com/nvidia-cosmos/cosmos-transfer1) is a world-to-world transfer model designed to bridge the perceptual divide between simulated and real-world environments.
We introduce a training-free guided generation method on top of Cosmos-Transfer1 to address its poor results on unseen healthcare simulation assets.
Directly applying Cosmos-Transfer1 with various control inputs produces unsatisfactory outputs for the human phantom and robotic arm (see the bottom of the figure). In contrast, our guided generation method preserves the appearance of the phantom and robotic arm while generating diverse backgrounds.
<img src="../../../../docs/source/cosmos_transfer_result.png" width="512" height="600" />

This training-free guided generation approach works by encoding simulation videos into the latent space and applying spatial masking to guide the generation process. The trade-off between realism and faithfulness can be controlled by adjusting the number of guided denoising steps. In addition, our generation pipeline supports multi-view video generation: we first leverage the camera information to warp the generated room view to the wrist view, then use it as guidance for the wrist-view generation.
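To make the mechanism concrete, below is a minimal sketch of how such mask-guided denoising can be structured. It is illustrative only: the `denoise_step` callable and the exact re-noising rule are assumptions, not the pipeline's actual implementation.

```python
import torch

def guided_denoise(x, sim_latent, mask, sigmas, sigma_threshold, denoise_step):
    """Illustrative mask-guided denoising loop (a sketch, not the real pipeline).

    x: noisy latent being denoised, shape (C, T, H, W)
    sim_latent: latent of the encoded simulation video, same shape
    mask: 1.0 where appearance should be preserved (phantom, robot arm), else 0.0
    denoise_step: callable implementing the model's usual denoising update
    """
    for sigma in sigmas:  # descending noise levels
        x = denoise_step(x, sigma)
        if sigma > sigma_threshold:
            # Guided steps: re-noise the simulation latent to the current noise
            # level and paste it into the foreground region, steering the
            # sample to keep the original appearance there.
            noised_sim = sim_latent + sigma * torch.randn_like(sim_latent)
            x = mask * noised_sim + (1.0 - mask) * x
    return x
```

In this picture, lowering `--sigma_threshold` keeps the guidance active for more denoising steps (more faithful foreground, less diverse background), while raising it stops the guidance earlier.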

#### Download Cosmos-transfer1 Checkpoints
Please install the cosmos-transfer1 dependencies and change to the third-party `cosmos-transfer1` folder. The following command downloads the checkpoints:
```sh
conda activate cosmos-transfer1
CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python scripts/download_checkpoints.py --output_dir checkpoints/
```
#### Video Prompt Generation
We follow the idea in [lucidsim](https://github.com/lucidsim/lucidsim): first generate batches of meta prompts, each containing a very concise description of the potential scene, then instruct an LLM (e.g., [gemma-3-27b-it](https://build.nvidia.com/google/gemma-3-27b-it)) to upsample each meta prompt into a detailed description.
We provide example prompts in [`generated_prompts_two_seperate_views.json`](./environments/cosmos_transfer1/config/generated_prompts_two_seperate_views.json).
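The sketch below illustrates the two-stage scheme; the `call_llm` hook and the instruction template are placeholders, not the exact prompts used to produce the shipped JSON.

```python
import json

# Hypothetical upsampling instruction; the actual wording may differ.
UPSAMPLE_TEMPLATE = (
    "Expand this concise scene description into a detailed, static, clinical "
    "prompt for video generation: {meta}"
)

def upsample_meta_prompt(meta_prompt: str, call_llm) -> dict:
    """call_llm is any callable that sends text to an LLM and returns text."""
    return {
        "top_view_prompt": call_llm(UPSAMPLE_TEMPLATE.format(meta=meta_prompt)),
        "bottom_view_prompt": (
            "The close-up perspective that is focused directly on the clean "
            "surface of the table with uniform texture."
        ),
    }

metas = ["Sterile lab, gleaming metal, focused robotic precision."]
prompts = {m: upsample_meta_prompt(m, call_llm=lambda s: s) for m in metas}
print(json.dumps(prompts, indent=2))  # same structure as the example JSON
```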

#### Running Cosmos-transfer1 + Guided Generation
Please change to this [`simulation` folder](./) and execute the following command to start the generation pipeline:
```sh
export CHECKPOINT_DIR="path to downloaded cosmos-transfer1 checkpoints"
# Set project root path
export PROJECT_ROOT="{your path}/i4h-workflows"
# Set PYTHONPATH
export PYTHONPATH="$PROJECT_ROOT/third_party/cosmos-transfer1:$PROJECT_ROOT/workflows/robotic_ultrasound/scripts"
# run batch inference for the generation pipeline
CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$PYTHONPATH python \
-m environments.cosmos_transfer1.transfer \
--checkpoint_dir $CHECKPOINT_DIR \
--source_data_dir "Path to source dir of h5 files" \
--output_data_dir "Path to output dir of h5 files" \
--offload_text_encoder_model
```
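Both `--source_data_dir` and `--output_data_dir` hold h5 episode files, so a generic h5py walk is a quick way to sanity-check inputs and outputs (the file name below is hypothetical):

```python
import h5py

def summarize_h5(path: str) -> None:
    """Print every dataset in an episode file with its shape and dtype."""
    def visit(name, obj):
        if isinstance(obj, h5py.Dataset):
            print(f"{name}: shape={obj.shape}, dtype={obj.dtype}")
    with h5py.File(path, "r") as f:
        f.visititems(visit)

summarize_h5("data_0.hdf5")  # hypothetical file name
```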
#### Command Line Arguments

| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `--prompt` | str | "" | Prompt on which the sampled video is conditioned |
| `--negative_prompt` | str | "The video captures a game playing, ..." | Negative prompt on which the sampled video is conditioned |
| `--input_video_path` | str | "" | Optional input RGB video path |
| `--num_input_frames` | int | 1 | Number of conditional frames for long video generation |
| `--sigma_max` | float | 80 | sigma_max for partial denoising |
| `--blur_strength` | str | "medium" | Blur strength applied to input |
| `--canny_threshold` | str | "medium" | Canny threshold applied to input. Lower means less blur or more detected edges, which means higher fidelity to input |
| `--controlnet_specs` | str | "inference_cosmos_transfer1_two_views.json" | Path to JSON file specifying multicontrolnet configurations |
| `--checkpoint_dir` | str | "checkpoints" | Base directory containing model checkpoints |
| `--tokenizer_dir` | str | "Cosmos-Tokenize1-CV8x8x8-720p" | Tokenizer weights directory relative to checkpoint_dir |
| `--video_save_folder` | str | "outputs/" | Output folder for generating a batch of videos |
| `--num_steps` | int | 35 | Number of diffusion sampling steps |
| `--guidance` | float | 5.0 | Classifier-free guidance scale value |
| `--fps` | int | 30 | FPS of the output video |
| `--height` | int | 224 | Height of video to sample |
| `--width` | int | 224 | Width of video to sample |
| `--seed` | int | 1 | Random seed |
| `--num_gpus` | int | 1 | Number of GPUs used to run context parallel inference. |
| `--offload_diffusion_transformer` | bool | False | Offload DiT after inference |
| `--offload_text_encoder_model` | bool | False | Offload text encoder model after inference |
| `--offload_guardrail_models` | bool | True | Offload guardrail models after inference |
| `--upsample_prompt` | bool | False | Upsample prompt using Pixtral upsampler model |
| `--offload_prompt_upsampler` | bool | False | Offload prompt upsampler model after inference |
| `--source_data_dir` | str | "" | Path to source data directory for batch inference. It contains h5 files generated from the state machine. |
| `--output_data_dir` | str | "" | Path to output data directory for batch inference. |
| `--save_name_offset` | int | 0 | Offset for the video save name. |
| `--foreground_label` | str | "3,4" | Comma-separated list of segmentation labels that define the foreground mask during guided generation; the foreground is the object whose appearance should stay unchanged (see the sketch below this table) |
| `--sigma_threshold` | float | 1.2866 | Controls how many guided denoising steps are performed during generation. Smaller values mean more steps, larger values mean fewer steps. |
| `--concat_video_second_view` | bool | True | Whether to concatenate the first and second view videos during generation |
| `--fill_missing_pixels` | bool | True | Whether to fill missing pixels in the warped second view video |
| `--model_config_file` | str | "environments/cosmos_transfer1/config/transfer/config.py" | Relative path to the model config file |
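
To illustrate how `--foreground_label` is interpreted, a foreground mask can be derived from an integer segmentation map roughly as follows. This is a pixel-space sketch for intuition; the pipeline's actual masking is applied in latent space.

```python
import numpy as np

def foreground_mask(seg: np.ndarray, labels=(3, 4)) -> np.ndarray:
    """True wherever the segmentation holds a preserved label, i.e. the
    objects whose appearance should stay unchanged during generation."""
    return np.isin(seg, labels)

seg = np.random.randint(0, 6, size=(224, 224))  # toy segmentation map
mask = foreground_mask(seg)
print(mask.mean())  # fraction of pixels treated as foreground
```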


### Teleoperation

The teleoperation interface allows direct control of the robotic arm using various input devices. It supports keyboard, SpaceMouse, and gamepad controls for precise manipulation of the ultrasound probe.
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,14 @@
{
    "Sterile lab, gleaming metal, focused robotic precision.": {
        "top_view_prompt": "A meticulously organized laboratory environment. Gleaming stainless steel instruments rest on polished surfaces, reflecting the cool, static overhead lighting. A robotic arm, brushed aluminum and articulated with precision, methodically scans over a non-reflective, light-grey laminate tabletop. The scene is sterile and clinical, evoking a sense of advanced medical research or quality control. Cables are neatly managed, and background equipment \u2013 monitors displaying complex waveforms, and sealed containers \u2013 remain static and out of focus, suggesting a continuous, automated process. The overall atmosphere is one of quiet, focused efficiency.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of light-grey laminate table with uniform texture."
    },
    "Bright office, scattered papers, automated scanning process.": {
        "top_view_prompt": "A brightly lit, modern office space transitions into a laboratory setting. Papers are casually scattered across a large, non-reflective laminate tabletop, suggesting a busy workflow. A robotic arm, sleek and white, methodically scans an unseen object positioned centrally on the table. The scene is static, with consistent, even illumination. Focus is on the robotic arm's precise movements and the organized chaos of the workspace, hinting at automated data collection or analysis. The overall impression is one of efficient, clinical precision within a functional, lived-in environment.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of the laminate table with uniform texture."
    },
    "Clinical white room, equipment hums, methodical search.": {
        "top_view_prompt": "A sterile, clinical laboratory environment. A robotic arm, precise and deliberate in its movements, systematically scans across a non-reflective, matte grey table. The room is filled with the subtle hum of unseen machinery \u2013 diagnostic equipment, ventilation systems, and power supplies. Cables are neatly managed, running along the ceiling and walls. The overall aesthetic is minimalist and functional, emphasizing cleanliness and precision. The scene is static, with a consistent, diffused overhead lighting creating soft shadows. The focus is on the methodical nature of the robotic scan, suggesting a detailed analysis or quality control process.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of the matte grey table with uniform texture."
    }
}
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cosmos_transfer1.checkpoints import BASE_7B_CHECKPOINT_AV_SAMPLE_PATH
from cosmos_transfer1.diffusion.config.transfer.conditioner import CTRL_HINT_KEYS_COMB
from cosmos_transfer1.diffusion.model.model_ctrl import VideoDiffusionT2VModelWithCtrl
from cosmos_transfer1.diffusion.networks.general_dit_video_conditioned import VideoExtendGeneralDIT
from cosmos_transfer1.utils.lazy_config import LazyCall as L
from cosmos_transfer1.utils.lazy_config import LazyDict
from hydra.core.config_store import ConfigStore
from simulation.environments.cosmos_transfer1.model.model_ctrl import VideoDiffusionModelWithCtrlAndGuidance

cs = ConfigStore.instance()

# Base configuration for 7B model
Base_7B_Config = LazyDict(
    dict(
        defaults=[
            {"override /net": "faditv2_7b"},
            {"override /conditioner": "add_fps_image_size_padding_mask"},
            {"override /tokenizer": "cosmos_diffusion_tokenizer_res720_comp8x8x8_t121_ver092624"},
            "_self_",
        ],
        model=dict(
            latent_shape=[16, 16, 88, 160],
            net=dict(
                rope_h_extrapolation_ratio=1,
                rope_w_extrapolation_ratio=1,
                rope_t_extrapolation_ratio=2,
            ),
        ),
        job=dict(
            group="Control2World",
            name="Base_7B_Config",
        ),
    )
)


def make_ctrlnet_config_7b(
    hint_key: str = "control_input_seg",
    num_control_blocks: int = 3,
) -> LazyDict:
    """
    Make a ControlNet config for 7B model
    Args:
        hint_key: The key to use for the control input.
        num_control_blocks: The number of ViT blocks to use for the ControlNet.
    Returns:
        A LazyDict containing the control net config.
    """
    hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key])

    return LazyDict(
        dict(
            defaults=[
                "/experiment/Base_7B_Config",
                {"override /hint_key": hint_key},
                {"override /net_ctrl": "faditv2_7b"},
                {"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"},
            ],
            job=dict(
                group="CTRL_7Bv1_lvg",
                name=f"CTRL_7Bv1pt3_lvg_tp_121frames_{hint_key}_block{num_control_blocks}",
                project="cosmos_transfer1",
            ),
            model=dict(
                hint_mask=hint_mask,
                hint_dropout_rate=0.3,
                conditioner=dict(video_cond_bool=dict()),
                net=L(VideoExtendGeneralDIT)(
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
                net_ctrl=dict(
                    in_channels=17,
                    hint_channels=128,
                    num_blocks=28,
                    layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)],
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
            ),
            model_obj=L(VideoDiffusionModelWithCtrlAndGuidance)(),
        )
    )


def make_ctrlnet_config_7b_t2v(
    hint_key: str = "control_input_seg",
    num_control_blocks: int = 3,
) -> LazyDict:
    """
    Make a ControlNet config for 7B text-to-video model
    Args:
        hint_key: The key to use for the control input.
        num_control_blocks: The number of ViT blocks to use for the ControlNet.
    Returns:
        A LazyDict containing the ControlNet config.
    """
    hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key])

    return LazyDict(
        dict(
            defaults=[
                "/experiment/Base_7B_Config",
                {"override /hint_key": hint_key},
                {"override /net_ctrl": "faditv2_7b"},
                {"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"},
            ],
            job=dict(
                group="CTRL_7Bv1_t2v",
                name=f"CTRL_7Bv1pt3_t2v_121frames_{hint_key}_block{num_control_blocks}",
                project="cosmos_ctrlnet1",
            ),
            model=dict(
                base_load_from=dict(
                    load_path=f"checkpoints/{BASE_7B_CHECKPOINT_AV_SAMPLE_PATH}",
                ),
                hint_mask=hint_mask,
                hint_dropout_rate=0.3,
                net=dict(
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
                net_ctrl=dict(
                    in_channels=16,
                    hint_channels=16,
                    num_blocks=28,
                    layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)],
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
            ),
            model_obj=L(VideoDiffusionT2VModelWithCtrl)(),
        )
    )


# Register base configs
cs.store(group="experiment", package="_global_", name=Base_7B_Config["job"]["name"], node=Base_7B_Config)
# Register all control configurations
num_control_blocks = 3
for key in CTRL_HINT_KEYS_COMB.keys():
    # Register 7B configurations
    config_7b = make_ctrlnet_config_7b(hint_key=key, num_control_blocks=num_control_blocks)
    cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b)

# Register t2v based control net
num_control_blocks = 3
for key in ["control_input_hdmap", "control_input_lidar"]:
    # Register 7B configurations
    config_7b = make_ctrlnet_config_7b_t2v(hint_key=key, num_control_blocks=num_control_blocks)
    cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b)
@@ -0,0 +1,15 @@
{
    "prompt": "environments/cosmos_transfer1/config/generated_prompts_two_seperate_views.json",
    "input_video_path": "placeholder_not_needed.mp4",
    "edge": {
        "control_weight": 0.5
    },
    "depth": {
        "control_weight": 0.75,
        "input_control": "placeholder_not_needed.mp4"
    },
    "seg": {
        "control_weight": 0.75,
        "input_control": "placeholder_not_needed.mp4"
    }
}