Merged

26 commits
690b717
add guided generation pipline
guopengf May 8, 2025
b06bf94
h5 batch inference support
guopengf May 8, 2025
59ddcb9
improve format
guopengf May 9, 2025
723be4b
prepare test case
guopengf May 10, 2025
b53778e
lint
guopengf May 10, 2025
dc66ae8
update test case
guopengf May 12, 2025
0941b69
format
guopengf May 12, 2025
1494470
update test cases
guopengf May 14, 2025
f5800f4
add readme
guopengf May 14, 2025
6d50d78
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
guopengf May 14, 2025
1d85d0c
add readme figure
guopengf May 14, 2025
c980304
fix link
guopengf May 14, 2025
c757b45
update dependency install and fix issues
guopengf May 15, 2025
220535b
remve change for install_deps.py
guopengf May 15, 2025
0ebe16f
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
guopengf May 15, 2025
979e0ab
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
KumoLiu May 19, 2025
389d52c
fix minnor comments
guopengf May 20, 2025
c181a9d
Update workflows/robotic_ultrasound/tests/test_simulation/test_integr…
guopengf May 21, 2025
3c299e7
Update workflows/robotic_ultrasound/tests/test_simulation/test_integr…
guopengf May 21, 2025
6976388
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
mingxin-zheng May 22, 2025
164aa5d
Merge branch 'main' into pengfeig/cosmos-transfer1-integrate
mingxin-zheng May 23, 2025
67e1e0b
fix
KumoLiu May 23, 2025
01128e3
increase timeout for cosmos-transfer test
KumoLiu May 25, 2025
6efb7c3
Merge remote-tracking branch 'origin/main' into pengfeig/cosmos-trans…
KumoLiu May 25, 2025
3bfc8e8
skip if not enough gpu
KumoLiu May 26, 2025
ad6cb9f
increase pi0 eval timeout
KumoLiu May 27, 2025
Binary file added docs/source/cosmos_transfer_result.png
5 changes: 4 additions & 1 deletion tools/env_setup/install_cosmos_transfer1.sh
@@ -24,7 +24,10 @@ PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. && pwd)"
# Allow setting the python in PYTHON_EXECUTABLE
PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python}

-COSMOS_TRANSFER_DIR=${1:-$$PROJECT_ROOT/third_party/cosmos-transfer1}
+# Install cuDNN
+bash "$PROJECT_ROOT/tools/env_setup/install_cudnn.sh"
+
+COSMOS_TRANSFER_DIR=${1:-$PROJECT_ROOT/third_party/cosmos-transfer1}

if [ -d "$COSMOS_TRANSFER_DIR" ]; then
echo "Cosmos Transfer directory already exists at $COSMOS_TRANSFER_DIR. Skipping clone."
15 changes: 14 additions & 1 deletion tools/run_all_tests.py
@@ -42,7 +42,7 @@ def _run_test_process(cmd, env, test_path):

    try:
        process = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        stdout, stderr = process.communicate(timeout=600)
+        stdout, stderr = process.communicate(timeout=1200)
        # Filter out extension loading messages
        filtered_stdout = "\n".join(
            [line for line in stdout.split("\n") if not ("[ext:" in line and "startup" in line)]
@@ -77,6 +77,17 @@ def _setup_test_env(project_root, tests_dir):
    return env


+def _setup_test_cosmos_transfer1_env(project_root, workflow_root, tests_dir):
+    """Helper function to setup test environment for cosmos-transfer1"""
+    env = _setup_test_env(workflow_root, tests_dir)
+    pythonpath = [
+        os.path.join(project_root, "third_party", "cosmos-transfer1"),
+    ]
+    env["PYTHONPATH"] = ":".join(pythonpath) + ":" + env["PYTHONPATH"]
+    env["DEBUG_GENERATION"] = "1"
+    return env


def run_tests_with_coverage(workflow_name, skip_xvfb):
    """Run all unittest cases with coverage reporting"""
    print(f"Running tests with xvfb skipped: {skip_xvfb}")
@@ -173,6 +184,8 @@ def run_integration_tests(workflow_name):
            "unittest",
            test_path,
        ]
+        if "cosmos_transfer1" in test_path:
+            env = _setup_test_cosmos_transfer1_env(os.getcwd(), project_root, tests_dir)

        if not _run_test_process(cmd, env, test_path):
            all_tests_passed = False
74 changes: 74 additions & 0 deletions workflows/robotic_ultrasound/scripts/simulation/README.md
@@ -7,6 +7,7 @@
- [PI Zero Policy Evaluation](#pi-zero-policy-evaluation)
- [Policy Evaluation w/ DDS](#policy-evaluation-w-dds)
- [Liver Scan State Machine](#liver-scan-state-machine)
- [Cosmos-transfer1 Integration](#cosmos-transfer1-integration)
- [Teleoperation](#teleoperation)
- [Ultrasound Raytracing Simulation](#ultrasound-raytracing-simulation)

@@ -180,6 +181,79 @@ Replace `/path/to/your/hdf5_data_directory` with the actual path to the directory

> **Note:** Additional common Isaac Lab arguments (like `--device`) can also be used.


### Cosmos-transfer1 Integration

[Cosmos-Transfer1](https://github.com/nvidia-cosmos/cosmos-transfer1) is a world-to-world transfer model designed to bridge the perceptual divide between simulated and real-world environments.
We introduce a training-free guided generation method on top of Cosmos-Transfer1 to address its poor results on unseen healthcare simulation assets.
Directly applying Cosmos-Transfer1 with various control inputs produces unsatisfactory outputs for the human phantom and robotic arm (see the bottom of the figure). In contrast, our guided generation method preserves the appearance of the phantom and robotic arm while generating diverse backgrounds.
<img src="../../../../docs/source/cosmos_transfer_result.png" width="512" height="600" />

This training-free guided generation approach works by encoding simulation videos into the latent space and applying spatial masking to guide the generation process. The trade-off between realism and faithfulness can be controlled by adjusting the number of guided denoising steps. In addition, our generation pipeline supports multi-view video generation: we first leverage the camera information to warp the generated room view to the wrist view, then use it as guidance for the wrist-view generation.
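To make the mechanism concrete, below is a minimal sketch of how such mask-guided denoising can be structured. It is illustrative only: the `denoise_step` callable and the exact re-noising rule are assumptions, not the pipeline's actual implementation.

```python
import torch

def guided_denoise(x, sim_latent, mask, sigmas, sigma_threshold, denoise_step):
    """Illustrative mask-guided denoising loop (a sketch, not the real pipeline).

    x: noisy latent being denoised, shape (C, T, H, W)
    sim_latent: latent of the encoded simulation video, same shape
    mask: 1.0 where appearance should be preserved (phantom, robot arm), else 0.0
    denoise_step: callable implementing the model's usual denoising update
    """
    for sigma in sigmas:  # descending noise levels
        x = denoise_step(x, sigma)
        if sigma > sigma_threshold:
            # Guided steps: re-noise the simulation latent to the current noise
            # level and paste it into the foreground region, steering the
            # sample to keep the original appearance there.
            noised_sim = sim_latent + sigma * torch.randn_like(sim_latent)
            x = mask * noised_sim + (1.0 - mask) * x
    return x
```

In this picture, lowering `--sigma_threshold` keeps the guidance active for more denoising steps (more faithful foreground, less diverse background), while raising it stops the guidance earlier.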

#### Download Cosmos-transfer1 Checkpoints
Please install the cosmos-transfer1 dependencies and change to the third-party `cosmos-transfer1` folder. The following command downloads the checkpoints:
```sh
conda activate cosmos-transfer1
CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python scripts/download_checkpoints.py --output_dir checkpoints/
```
#### Video Prompt Generation
We follow the idea in [lucidsim](https://github.com/lucidsim/lucidsim): first generate batches of meta prompts, each containing a very concise description of the potential scene, then instruct an LLM (e.g., [gemma-3-27b-it](https://build.nvidia.com/google/gemma-3-27b-it)) to upsample each meta prompt into a detailed description.
We provide example prompts in [`generated_prompts_two_seperate_views.json`](./environments/cosmos_transfer1/config/generated_prompts_two_seperate_views.json).
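The sketch below illustrates the two-stage scheme; the `call_llm` hook and the instruction template are placeholders, not the exact prompts used to produce the shipped JSON.

```python
import json

# Hypothetical upsampling instruction; the actual wording may differ.
UPSAMPLE_TEMPLATE = (
    "Expand this concise scene description into a detailed, static, clinical "
    "prompt for video generation: {meta}"
)

def upsample_meta_prompt(meta_prompt: str, call_llm) -> dict:
    """call_llm is any callable that sends text to an LLM and returns text."""
    return {
        "top_view_prompt": call_llm(UPSAMPLE_TEMPLATE.format(meta=meta_prompt)),
        "bottom_view_prompt": (
            "The close-up perspective that is focused directly on the clean "
            "surface of the table with uniform texture."
        ),
    }

metas = ["Sterile lab, gleaming metal, focused robotic precision."]
prompts = {m: upsample_meta_prompt(m, call_llm=lambda s: s) for m in metas}
print(json.dumps(prompts, indent=2))  # same structure as the example JSON
```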

#### Running Cosmos-transfer1 + Guided Generation
Please change to this [`simulation` folder](./) and execute the following command to start the generation pipeline:
```sh
export CHECKPOINT_DIR="path to downloaded cosmos-transfer1 checkpoints"
# Set project root path
export PROJECT_ROOT="{your path}/i4h-workflows"
# Set PYTHONPATH
export PYTHONPATH="$PROJECT_ROOT/third_party/cosmos-transfer1:$PROJECT_ROOT/workflows/robotic_ultrasound/scripts"
# run batch inference for the generation pipeline
CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$PYTHONPATH python \
-m environments.cosmos_transfer1.transfer \
--checkpoint_dir $CHECKPOINT_DIR \
--source_data_dir "Path to source dir of h5 files" \
--output_data_dir "Path to output dir of h5 files" \
--offload_text_encoder_model
```
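Both `--source_data_dir` and `--output_data_dir` hold h5 episode files, so a generic h5py walk is a quick way to sanity-check inputs and outputs (the file name below is hypothetical):

```python
import h5py

def summarize_h5(path: str) -> None:
    """Print every dataset in an episode file with its shape and dtype."""
    def visit(name, obj):
        if isinstance(obj, h5py.Dataset):
            print(f"{name}: shape={obj.shape}, dtype={obj.dtype}")
    with h5py.File(path, "r") as f:
        f.visititems(visit)

summarize_h5("data_0.hdf5")  # hypothetical file name
```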
#### Command Line Arguments

| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `--prompt` | str | "" | Prompt on which the sampled video is conditioned |
| `--negative_prompt` | str | "The video captures a game playing, ..." | Negative prompt on which the sampled video is conditioned |
| `--input_video_path` | str | "" | Optional input RGB video path |
| `--num_input_frames` | int | 1 | Number of conditional frames for long video generation |
| `--sigma_max` | float | 80 | sigma_max for partial denoising |
| `--blur_strength` | str | "medium" | Blur strength applied to input |
| `--canny_threshold` | str | "medium" | Canny threshold applied to input. Lower means less blur or more detected edges, which means higher fidelity to input |
| `--controlnet_specs` | str | "inference_cosmos_transfer1_two_views.json" | Path to JSON file specifying multicontrolnet configurations |
| `--checkpoint_dir` | str | "checkpoints" | Base directory containing model checkpoints |
| `--tokenizer_dir` | str | "Cosmos-Tokenize1-CV8x8x8-720p" | Tokenizer weights directory relative to checkpoint_dir |
| `--video_save_folder` | str | "outputs/" | Output folder for generating a batch of videos |
| `--num_steps` | int | 35 | Number of diffusion sampling steps |
| `--guidance` | float | 5.0 | Classifier-free guidance scale value |
| `--fps` | int | 30 | FPS of the output video |
| `--height` | int | 224 | Height of video to sample |
| `--width` | int | 224 | Width of video to sample |
| `--seed` | int | 1 | Random seed |
| `--num_gpus` | int | 1 | Number of GPUs used to run context parallel inference. |
| `--offload_diffusion_transformer` | bool | False | Offload DiT after inference |
| `--offload_text_encoder_model` | bool | False | Offload text encoder model after inference |
| `--offload_guardrail_models` | bool | True | Offload guardrail models after inference |
| `--upsample_prompt` | bool | False | Upsample prompt using Pixtral upsampler model |
| `--offload_prompt_upsampler` | bool | False | Offload prompt upsampler model after inference |
| `--source_data_dir` | str | "" | Path to source data directory for batch inference. It contains h5 files generated from the state machine. |
| `--output_data_dir` | str | "" | Path to output data directory for batch inference. |
| `--save_name_offset` | int | 0 | Offset for the video save name. |
| `--foreground_label` | str | "3,4" | Comma-separated list of segmentation labels that define the foreground mask during guided generation; the foreground is the object whose appearance should stay unchanged (see the sketch below this table) |
| `--sigma_threshold` | float | 1.2866 | Controls how many guided denoising steps are performed during generation. Smaller values mean more steps, larger values mean fewer steps. |
| `--concat_video_second_view` | bool | True | Whether to concatenate the first and second view videos during generation |
| `--fill_missing_pixels` | bool | True | Whether to fill missing pixels in the warped second view video |
| `--model_config_file` | str | "environments/cosmos_transfer1/config/transfer/config.py" | Relative path to the model config file |
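
To illustrate how `--foreground_label` is interpreted, a foreground mask can be derived from an integer segmentation map roughly as follows. This is a pixel-space sketch for intuition; the pipeline's actual masking is applied in latent space.

```python
import numpy as np

def foreground_mask(seg: np.ndarray, labels=(3, 4)) -> np.ndarray:
    """True wherever the segmentation holds a preserved label, i.e. the
    objects whose appearance should stay unchanged during generation."""
    return np.isin(seg, labels)

seg = np.random.randint(0, 6, size=(224, 224))  # toy segmentation map
mask = foreground_mask(seg)
print(mask.mean())  # fraction of pixels treated as foreground
```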


### Teleoperation

The teleoperation interface allows direct control of the robotic arm using various input devices. It supports keyboard, SpaceMouse, and gamepad controls for precise manipulation of the ultrasound probe.
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,14 @@
{
    "Sterile lab, gleaming metal, focused robotic precision.": {
        "top_view_prompt": "A meticulously organized laboratory environment. Gleaming stainless steel instruments rest on polished surfaces, reflecting the cool, static overhead lighting. A robotic arm, brushed aluminum and articulated with precision, methodically scans over a non-reflective, light-grey laminate tabletop. The scene is sterile and clinical, evoking a sense of advanced medical research or quality control. Cables are neatly managed, and background equipment \u2013 monitors displaying complex waveforms, and sealed containers \u2013 remain static and out of focus, suggesting a continuous, automated process. The overall atmosphere is one of quiet, focused efficiency.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of light-grey laminate table with uniform texture."
    },
    "Bright office, scattered papers, automated scanning process.": {
        "top_view_prompt": "A brightly lit, modern office space transitions into a laboratory setting. Papers are casually scattered across a large, non-reflective laminate tabletop, suggesting a busy workflow. A robotic arm, sleek and white, methodically scans an unseen object positioned centrally on the table. The scene is static, with consistent, even illumination. Focus is on the robotic arm's precise movements and the organized chaos of the workspace, hinting at automated data collection or analysis. The overall impression is one of efficient, clinical precision within a functional, lived-in environment.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of the laminate table with uniform texture."
    },
    "Clinical white room, equipment hums, methodical search.": {
        "top_view_prompt": "A sterile, clinical laboratory environment. A robotic arm, precise and deliberate in its movements, systematically scans across a non-reflective, matte grey table. The room is filled with the subtle hum of unseen machinery \u2013 diagnostic equipment, ventilation systems, and power supplies. Cables are neatly managed, running along the ceiling and walls. The overall aesthetic is minimalist and functional, emphasizing cleanliness and precision. The scene is static, with a consistent, diffused overhead lighting creating soft shadows. The focus is on the methodical nature of the robotic scan, suggesting a detailed analysis or quality control process.",
        "bottom_view_prompt": "The close-up perspective that is focused directly on the clean surface of the matte grey table with uniform texture."
    }
}
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cosmos_transfer1.checkpoints import BASE_7B_CHECKPOINT_AV_SAMPLE_PATH
from cosmos_transfer1.diffusion.config.transfer.conditioner import CTRL_HINT_KEYS_COMB
from cosmos_transfer1.diffusion.model.model_ctrl import VideoDiffusionT2VModelWithCtrl
from cosmos_transfer1.diffusion.networks.general_dit_video_conditioned import VideoExtendGeneralDIT
from cosmos_transfer1.utils.lazy_config import LazyCall as L
from cosmos_transfer1.utils.lazy_config import LazyDict
from hydra.core.config_store import ConfigStore
from simulation.environments.cosmos_transfer1.model.model_ctrl import VideoDiffusionModelWithCtrlAndGuidance

cs = ConfigStore.instance()

# Base configuration for 7B model
Base_7B_Config = LazyDict(
    dict(
        defaults=[
            {"override /net": "faditv2_7b"},
            {"override /conditioner": "add_fps_image_size_padding_mask"},
            {"override /tokenizer": "cosmos_diffusion_tokenizer_res720_comp8x8x8_t121_ver092624"},
            "_self_",
        ],
        model=dict(
            latent_shape=[16, 16, 88, 160],
            net=dict(
                rope_h_extrapolation_ratio=1,
                rope_w_extrapolation_ratio=1,
                rope_t_extrapolation_ratio=2,
            ),
        ),
        job=dict(
            group="Control2World",
            name="Base_7B_Config",
        ),
    )
)


def make_ctrlnet_config_7b(
    hint_key: str = "control_input_seg",
    num_control_blocks: int = 3,
) -> LazyDict:
    """
    Make a ControlNet config for 7B model
    Args:
        hint_key: The key to use for the control input.
        num_control_blocks: The number of ViT blocks to use for the ControlNet.
    Returns:
        A LazyDict containing the control net config.
    """
    hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key])

    return LazyDict(
        dict(
            defaults=[
                "/experiment/Base_7B_Config",
                {"override /hint_key": hint_key},
                {"override /net_ctrl": "faditv2_7b"},
                {"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"},
            ],
            job=dict(
                group="CTRL_7Bv1_lvg",
                name=f"CTRL_7Bv1pt3_lvg_tp_121frames_{hint_key}_block{num_control_blocks}",
                project="cosmos_transfer1",
            ),
            model=dict(
                hint_mask=hint_mask,
                hint_dropout_rate=0.3,
                conditioner=dict(video_cond_bool=dict()),
                net=L(VideoExtendGeneralDIT)(
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
                net_ctrl=dict(
                    in_channels=17,
                    hint_channels=128,
                    num_blocks=28,
                    layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)],
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
            ),
            model_obj=L(VideoDiffusionModelWithCtrlAndGuidance)(),
        )
    )


def make_ctrlnet_config_7b_t2v(
    hint_key: str = "control_input_seg",
    num_control_blocks: int = 3,
) -> LazyDict:
    """
    Make a ControlNet config for 7B text-to-video model
    Args:
        hint_key: The key to use for the control input.
        num_control_blocks: The number of ViT blocks to use for the ControlNet.
    Returns:
        A LazyDict containing the ControlNet config.
    """
    hint_mask = [True] * len(CTRL_HINT_KEYS_COMB[hint_key])

    return LazyDict(
        dict(
            defaults=[
                "/experiment/Base_7B_Config",
                {"override /hint_key": hint_key},
                {"override /net_ctrl": "faditv2_7b"},
                {"override /conditioner": "ctrlnet_add_fps_image_size_padding_mask"},
            ],
            job=dict(
                group="CTRL_7Bv1_t2v",
                name=f"CTRL_7Bv1pt3_t2v_121frames_{hint_key}_block{num_control_blocks}",
                project="cosmos_ctrlnet1",
            ),
            model=dict(
                base_load_from=dict(
                    load_path=f"checkpoints/{BASE_7B_CHECKPOINT_AV_SAMPLE_PATH}",
                ),
                hint_mask=hint_mask,
                hint_dropout_rate=0.3,
                net=dict(
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
                net_ctrl=dict(
                    in_channels=16,
                    hint_channels=16,
                    num_blocks=28,
                    layer_mask=[True if (i >= num_control_blocks) else False for i in range(28)],
                    extra_per_block_abs_pos_emb=True,
                    pos_emb_learnable=True,
                    extra_per_block_abs_pos_emb_type="learnable",
                ),
            ),
            model_obj=L(VideoDiffusionT2VModelWithCtrl)(),
        )
    )


# Register base configs
cs.store(group="experiment", package="_global_", name=Base_7B_Config["job"]["name"], node=Base_7B_Config)
# Register all control configurations
num_control_blocks = 3
for key in CTRL_HINT_KEYS_COMB.keys():
    # Register 7B configurations
    config_7b = make_ctrlnet_config_7b(hint_key=key, num_control_blocks=num_control_blocks)
    cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b)

# Register t2v based control net
num_control_blocks = 3
for key in ["control_input_hdmap", "control_input_lidar"]:
    # Register 7B configurations
    config_7b = make_ctrlnet_config_7b_t2v(hint_key=key, num_control_blocks=num_control_blocks)
    cs.store(group="experiment", package="_global_", name=config_7b["job"]["name"], node=config_7b)
@@ -0,0 +1,15 @@
{
    "prompt": "environments/cosmos_transfer1/config/generated_prompts_two_seperate_views.json",
    "input_video_path": "placeholder_not_needed.mp4",
    "edge": {
        "control_weight": 0.5
    },
    "depth": {
        "control_weight": 0.75,
        "input_control": "placeholder_not_needed.mp4"
    },
    "seg": {
        "control_weight": 0.75,
        "input_control": "placeholder_not_needed.mp4"
    }
}