Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
d3dd39c
Initial commit
Mar 2, 2025
97facc6
initial commit
HannaMao Mar 18, 2025
04d4b0f
fix broken videos
HannaMao Mar 18, 2025
2e9a36e
update website link
HannaMao Mar 18, 2025
b786096
Update Readme
codeJRV Mar 19, 2025
cfb0e19
chore: update inference_utils.py
eltociear Mar 24, 2025
a364090
Merge pull request #5 from eltociear/patch-1
jwgu Mar 26, 2025
2824ccd
Improvements and Bug Fixes
Mar 26, 2025
fb0ce40
Improvements and Bug Fixes
Mar 26, 2025
78438cf
Improvements and Bug Fixes
Mar 27, 2025
2e7d324
Update README and add transfer1 architecture diagram
mingyuliutw Mar 29, 2025
8ed39c3
Update README.md
mingyuliutw Mar 29, 2025
8eceb73
Minor fixes on typos and grammars
zhe-thoughts Mar 31, 2025
9bd7946
Merge pull request #11 from zhe-thoughts/patch-1
mingyuliutw Mar 31, 2025
9f1ec0c
Fix sample_av_hdmap_spec.json input video path (#18)
leungjch Apr 1, 2025
2ca75b9
Improvements and Bug Fixes
Apr 2, 2025
9f6588a
[feat] Add License Header (#15)
codeJRV Apr 2, 2025
cbf56b3
Allow user to specify between all, 7b and 7b_av models (#16)
codeJRV Apr 2, 2025
672fba9
Remove third_party submodules (#23)
codeJRV Apr 2, 2025
c450ef4
fix cp issue (#26)
jwgu Apr 4, 2025
35c2599
Update README.md with workflow section and model descriptions
mingyuliutw Apr 6, 2025
d57d460
update examples README to include instructions on batch inference (#28)
tiffanycai6 Apr 7, 2025
fb3de85
Update guardrails to its own model (#27)
codeJRV Apr 7, 2025
7d6e203
fix bug for spatiotemporal control weight (#33)
tcwang0509 Apr 14, 2025
007ac54
Update Readme (#30)
codeJRV Apr 14, 2025
d90ba4f
fix cp for t2v transfer model (#36)
arieling Apr 15, 2025
679302d
Fix input_control path in sample_av_hdmap_spec.json (#39)
yifanlu0227 Apr 15, 2025
d6a576b
feat: add post-training and custom-training support (#31)
qianlim Apr 17, 2025
661149c
Update README to use torchrun for all examples for consistency and av…
gyhandy Apr 21, 2025
7e1f88c
[fead-cicd]: add-linting, formatting and check video tags (#55)
hchodhary Apr 22, 2025
66940a6
fix + enhancement: ckpt loading for post-train and TP checkpoint merg…
qianlim Apr 22, 2025
d3fe9b4
fix: Inference video quality degradation (#57)
qianlim Apr 22, 2025
e64acc6
Add Dockerfile to cosmos-transfer1 (#47)
codeJRV Apr 25, 2025
aabfe25
fix batch inference bug and update usage in readme (#64)
tiffanycai6 Apr 25, 2025
de53563
Add Depth example for 7b model (#24)
zhe-thoughts Apr 25, 2025
f72fea5
fix: minor update in base model ckpt loading in training script (#63)
qianlim Apr 25, 2025
187143f
Vis example (#53)
zhe-thoughts Apr 26, 2025
f28c045
Segmentation example (#52)
zhe-thoughts Apr 26, 2025
73af7da
Adding a News section (#51)
zhe-thoughts Apr 26, 2025
2c1ba3d
[feat] Replacing aegis with Llama Guard 3 (#41)
andrewdotwang Apr 28, 2025
82b6a7d
fix: control hint key parsing in the example dataset (#69)
qianlim Apr 30, 2025
f4c0576
[Feature] Add post training config for single view sample av models (…
caotians1 May 2, 2025
71d8d9e
Add link in main readme to sample-av post-training example (#73)
caotians1 May 2, 2025
e02dfcb
fix issue 82 (#83)
jwgu May 9, 2025
c42eb07
fix: control input key in keypointControl inference (#85)
qianlim May 9, 2025
f8c0010
Update README.md to point to single control examples (#86)
zhe-thoughts May 13, 2025
4425c23
Robot Augmentation Workflow (#48)
gyhandy May 14, 2025
af7fee0
[feat] Enable Input Video In AV Transfer with Cutoff Frames
codeJRV May 14, 2025
d424446
[feat] Enable Input Video In AV Transfer with Cutoff Frames
codeJRV May 14, 2025
70c53b0
Features to improve AV use case support
codeJRV May 16, 2025
d9ce0ea
Features to improve AV use case support
codeJRV May 16, 2025
7208104
Track MP4 files with Git-LFS
Jun 27, 2025
ba87870
Remove large files
Jul 2, 2025
ce60d37
Track IPYNB files with Git-LFS
Jul 2, 2025
24c86a9
Add rolling (offline and online) inference functionality
Jul 2, 2025
0879cee
Merge cutoff_frame and intermediates functionality
Jul 15, 2025
ff3dd5a
Merge rolling inference logic
Jul 15, 2025
927c551
Try adding mp4 files in merge
Jul 15, 2025
3a7f8fa
Merge branch 'nvidia-cosmos:main' into main
atmguille Jul 16, 2025
808b832
Provide different seed per chunk and assure control inputs idx are ha…
Jul 16, 2025
d5088a8
Merge branch 'main' of github.com:atmguille/cosmos-transfer1
Jul 16, 2025
c0a4995
Disable setting input_video to 0 in cutoff_frame
Jul 28, 2025
e83bc89
Rely on chunk conditioning instead of cutoff_frame functionality
Jul 28, 2025
c049cfd
Merge remote-tracking branch 'gitlab_private/av-improvements'
Jul 28, 2025
a26276d
Simplify if clause
Jul 28, 2025
76d62ac
Merge remote-tracking branch 'gitlab_private/av-improvements'
Jul 28, 2025
fcac400
Merge branch 'nvidia-cosmos:main' into main
atmguille Jul 29, 2025
af7b3f1
Support changes in main branch and edge functionality
Aug 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified assets/example1_depth.mp4
Binary file not shown.
Binary file modified assets/example1_edge.mp4
Binary file not shown.
Binary file modified assets/example1_input_video.mp4
Binary file not shown.
Binary file modified assets/example1_seg.mp4
Binary file not shown.
Binary file modified assets/example1_single_control_edge.mp4
Binary file not shown.
Binary file modified assets/example1_single_control_edge_prompt_upsampler_result.mp4
Binary file not shown.
Binary file modified assets/example1_spatiotemporal_weights.mp4
Binary file not shown.
Binary file modified assets/example1_spatiotemporal_weights_mask.mp4
Binary file not shown.
Binary file modified assets/example1_uniform_weights.mp4
Binary file not shown.
Binary file modified assets/example1_vis.mp4
Binary file not shown.
Binary file modified assets/inference_depth_output.mp4
Binary file not shown.
Binary file modified assets/inference_keypoint_input_video.mp4
Binary file not shown.
Binary file modified assets/inference_keypoint_output.mp4
Binary file not shown.
Binary file modified assets/inference_upscaler_input_video.mp4
Binary file not shown.
Binary file modified assets/inference_upscaler_output.mp4
Binary file not shown.
Binary file modified assets/robot_sample_input.mp4
Binary file not shown.
Binary file modified assets/robot_sample_output.mp4
Binary file not shown.
Binary file modified assets/robot_sample_seg.mp4
Binary file not shown.
Binary file modified assets/sample_av_multi_control_input_hdmap.mp4
Binary file not shown.
Binary file modified assets/sample_av_multi_control_input_lidar.mp4
Binary file not shown.
3 changes: 3 additions & 0 deletions assets/sample_av_multi_control_input_video.mp4
Git LFS file not shown
Binary file modified assets/sample_av_multi_control_output.mp4
Binary file not shown.
11 changes: 11 additions & 0 deletions assets/sample_av_multi_control_spec_with_input_video.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"input_video_path" : "assets/sample_av_multi_control_input_video.mp4",
"hdmap": {
"control_weight": 0.3,
"input_control": "assets/sample_av_multi_control_input_hdmap.mp4"
},
"lidar": {
"control_weight": 0.7,
"input_control": "assets/sample_av_multi_control_input_lidar.mp4"
}
}
Binary file modified cosmos_transfer1/auxiliary/depth_anything/assets/input_video.mp4
Binary file not shown.
Binary file modified cosmos_transfer1/auxiliary/sam2/assets/input_video.mp4
Binary file not shown.
Binary file modified cosmos_transfer1/auxiliary/tokenizer/test_data/video.mp4
Binary file not shown.
47 changes: 46 additions & 1 deletion cosmos_transfer1/diffusion/datasets/augmentors/control_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,8 +535,16 @@ def __call__(self, data_dict: dict) -> dict:
if "control_input_edge" in data_dict:
# already processed
return data_dict
key_img = self.input_keys[1]

key_img = self.input_keys[1] # typically 'video'
key_out = self.output_keys[0]

# In some situations (e.g. warm-up frames) the caller may not provide
# RGB frames. In that case we simply skip edge computation and leave
# the dict unchanged so the pipeline can proceed without this hint.
if key_img not in data_dict:
return data_dict

frames = data_dict[key_img]
# Get lower and upper threshold for canny edge detection.
if self.use_random: # always on for training, always off for inference
Expand All @@ -556,6 +564,11 @@ def __call__(self, data_dict: dict) -> dict:
t_lower, t_upper = 300, 400
else:
raise ValueError(f"Preset {self.preset_strength} not recognized.")

# If frames is a torch tensor (potentially on GPU), move to CPU and convert
# to numpy so that subsequent OpenCV operations work correctly.
if torch.is_tensor(frames):
frames = frames.detach().cpu().numpy()
frames = np.array(frames)
is_image = len(frames.shape) < 4

Expand All @@ -571,6 +584,38 @@ def __call__(self, data_dict: dict) -> dict:
edge_maps = torch.from_numpy(edge_maps).expand(3, -1, -1, -1)
if is_image:
edge_maps = edge_maps[:, 0]

# ------------------------------------------------------------------
# DEBUG: Save one side-by-side sample (RGB | edges) the first time we
# compute an edge map during a run. This helps verify that the
# edge input looks sensible when running the regular pipeline.
# ------------------------------------------------------------------
try:
if True:
import os, uuid

if is_image:
rgb_frame = frames # HWC uint8
edge_vis = edge_maps[0].numpy() # HxW uint8
else:
# Take first temporal slice
rgb_frame = frames[:, 0].transpose(1, 2, 0) # HWC
edge_vis = edge_maps[0, 0].numpy() # HxW

edge_vis_rgb = cv2.cvtColor(edge_vis, cv2.COLOR_GRAY2BGR)
rgb_frame_bgr = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
canvas = np.concatenate([rgb_frame_bgr, edge_vis_rgb], axis=1)

out_dir = "/home/lab/mapodaca/cosmos-transfer1-github/edge_debug/"
os.makedirs(out_dir, exist_ok=True)
fname = os.path.join(out_dir, f"edge_debug_{uuid.uuid4().hex[:8]}.png")
cv2.imwrite(fname, canvas)
log.info(f"Saved edge debug frame to {fname}")
self._debug_saved = True
except Exception as _e:
# Don't crash the pipeline if debug save fails
log.warning(f"Edge debug save failed: {_e}")

data_dict[key_out] = edge_maps
return data_dict

Expand Down
21 changes: 14 additions & 7 deletions cosmos_transfer1/diffusion/diffusion/modules/res_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,11 @@ def float64_x0_fn(x_B_StateShape: torch.Tensor, t_B: torch.Tensor) -> torch.Tens
timestamps_cfg = SolverTimestampConfig(nfe=num_steps, t_min=sigma_min, t_max=sigma_max, order=rho)
sampler_cfg = SamplerConfig(solver=solver_cfg, timestamps=timestamps_cfg, sample_clean=True)

return self._forward_impl(float64_x0_fn, x_sigma_max, sampler_cfg).to(in_dtype)
output, intermediates = self._forward_impl(float64_x0_fn, x_sigma_max, sampler_cfg)
intermediate_outputs = []
for intermediate in intermediates:
intermediate_outputs.append(intermediate.to(in_dtype))
return output.to(in_dtype), intermediate_outputs

@torch.no_grad()
def _forward_impl(
Expand All @@ -156,7 +160,7 @@ def _forward_impl(
noisy_input_B_StateShape: torch.Tensor,
sampler_cfg: Optional[SamplerConfig] = None,
callback_fns: Optional[List[Callable]] = None,
) -> torch.Tensor:
) -> tuple[torch.Tensor, list[torch.Tensor]]:
"""
Internal implementation of the forward pass.

Expand All @@ -177,7 +181,7 @@ def _forward_impl(
sampler_cfg.timestamps.t_min, sampler_cfg.timestamps.t_max, num_timestamps, sampler_cfg.timestamps.order
).to(noisy_input_B_StateShape.device)

denoised_output = differential_equation_solver(
denoised_output, intermediates = differential_equation_solver(
denoiser_fn, sigmas_L, sampler_cfg.solver, callback_fns=callback_fns
)(noisy_input_B_StateShape)

Expand All @@ -186,7 +190,7 @@ def _forward_impl(
ones = torch.ones(denoised_output.size(0), device=denoised_output.device, dtype=denoised_output.dtype)
denoised_output = denoiser_fn(denoised_output, sigmas_L[-1] * ones)

return denoised_output
return denoised_output, intermediates


def fori_loop(lower: int, upper: int, body_fun: Callable[[int, Any], Any], init_val: Any) -> Any:
Expand All @@ -203,9 +207,12 @@ def fori_loop(lower: int, upper: int, body_fun: Callable[[int, Any], Any], init_
The final result after all iterations.
"""
val = init_val
intermediates = []
for i in range(lower, upper):
val = body_fun(i, val)
return val
intermediates.append(val[0])

return val[0], intermediates


def differential_equation_solver(
Expand Down Expand Up @@ -277,7 +284,7 @@ def step_fn(

return output_x_B_StateShape, x0_preds

x_at_eps, _ = fori_loop(0, num_step, step_fn, [input_xT_B_StateShape, None])
return x_at_eps
x_at_eps, intermediates = fori_loop(0, num_step, step_fn, [input_xT_B_StateShape, None])
return x_at_eps, intermediates

return sample_fn
Loading