4 changes: 2 additions & 2 deletions .github/workflows/install-test.yml
@@ -101,7 +101,7 @@ jobs:
     runs-on: linux-amd64-cpu16
     name: UV - AMD64/Linux - NGC ${{ contains(matrix.image, 'cuda') && 'CUDA' || 'PyTorch' }}
     container:
-      image: ubuntu:24.04
+      image: nvcr.io/nvidia/pytorch:25.05-py3
     environment: nemo-ci
     steps:
       - name: Checkout repository
@@ -128,7 +128,7 @@
           FRAMEWORK=("--inference-framework" "inframework")
         fi
 
-        bash docker/common/install.sh --base-image ubuntu --use-uv --python-version 3.12 "${FRAMEWORK[@]}"
+        bash docker/common/install.sh --base-image pytorch --use-uv --python-version 3.12 "${FRAMEWORK[@]}"
         uv run python -m ensurepip --upgrade
         ln -sf /opt/venv/bin/pip3 /opt/venv/bin/pip
         uv pip install --no-deps -e .
4 changes: 4 additions & 0 deletions docker/common/install.sh
@@ -139,6 +139,8 @@ main() {
         --all-groups ${UV_ARGS[@]}
     # Install the package
     uv pip install --no-deps -e .
+
+    patch -p1 $(uv run python -c "import triton; print(triton.__path__[0])")/runtime/autotuner.py external/patches/triton-lang_triton_6570_lazy_init.patch
   else
     if [[ "$INFERENCE_FRAMEWORK" != "inframework" ]]; then
       EXTRA="[$INFERENCE_FRAMEWORK]"
@@ -154,6 +156,8 @@


     pip install --pre --no-cache-dir --no-build-isolation .$EXTRA
+
+    patch -p1 $(python -c "import triton; print(triton.__path__[0])")/runtime/autotuner.py external/patches/triton-lang_triton_6570_lazy_init.patch
   fi
 
 }
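
Both install paths apply the Triton fix in place: the shell substitutes Triton's installed package directory (via `triton.__path__`) into the `patch` target, so the patch lands wherever Triton resolved in that environment. A quick post-install sanity check along these lines (illustrative, not part of this PR) confirms the patched autotuner defers benchmarker selection:

```python
# Illustrative check, not part of this PR: after install.sh runs, the patched
# Autotuner should expose do_bench as a functools.cached_property descriptor
# instead of assigning it eagerly in __init__.
import inspect
from functools import cached_property

from triton.runtime.autotuner import Autotuner

# getattr_static reads the class attribute without triggering the descriptor.
attr = inspect.getattr_static(Autotuner, "do_bench")
assert isinstance(attr, cached_property), "Triton autotuner patch not applied"
print("Triton autotuner patch applied: do_bench is lazily initialized")
```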
42 changes: 19 additions & 23 deletions external/patches/triton-lang_triton_6570_lazy_init.patch
@@ -1,32 +1,31 @@
-From 7240b92457a723a3a3ec2292e40df6274382524c Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?oliver=20k=C3=B6nig?= <[email protected]>
-Date: Wed, 11 Jun 2025 19:13:30 +0000
-Subject: [PATCH] f
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
+/*
+ * Code imported via patch from https://github.com/triton-lang/triton/pull/6570, commit 2afae45951b74785b144151b31e91e6c82b0b02f.
+ * Copyright (c) 2018-2022 Philippe Tillet, OpenAI.
+ * Licensed under the MIT License.
+ */
 
-Signed-off-by: oliver könig <[email protected]>
----
- external/patches/main.py | 15 +++++++++------
- 1 file changed, 9 insertions(+), 6 deletions(-)
+From 2afae45951b74785b144151b31e91e6c82b0b02f Mon Sep 17 00:00:00 2001
+From: Han Zhu <[email protected]>
+Date: Tue, 22 Apr 2025 18:42:23 -0700
+Subject: [PATCH] [autotuner] Lazily initiailize do_bench
 
-diff --git a/usr/local/lib/python3.12/dist-packages/triton/runtime/autotuner.py b/usr/local/lib/python3.12/dist-packages/triton/runtime/autotuner.py
-index 69305dc94..4600542b8 100644
---- a/usr/local/lib/python3.12/dist-packages/triton/runtime/autotuner.py
-+++ b/usr/local/lib/python3.12/dist-packages/triton/runtime/autotuner.py
+---
+diff --git a/a/autotuner.py b/b/autotuner.py
+index 0ee6bea09..b75c5e353 100644
+--- a/a/autotuner.py
++++ b/b/autotuner.py
 @@ -4,6 +4,7 @@ import builtins
  import os
  import time
  import inspect
 +from functools import cached_property
  from typing import Dict, Tuple, List, Optional
 
  from .jit import KernelInterface
 @@ -94,6 +95,7 @@ class Autotuner(KernelInterface):
          while not inspect.isfunction(self.base_fn):
              self.base_fn = self.base_fn.fn
 
 +        self._do_bench = do_bench
          self.num_warmups = warmup
          self.num_reps = rep
@@ -42,7 +41,7 @@ index 69305dc94..4600542b8 100644
                 quantiles=quantiles,
 @@ -115,7 +117,7 @@ class Autotuner(KernelInterface):
             return
 
         import triton.testing
-        self.do_bench = lambda kernel_call, quantiles: triton.testing.do_bench(
+        self._do_bench = lambda kernel_call, quantiles: triton.testing.do_bench(
@@ -52,7 +51,7 @@ index 69305dc94..4600542b8 100644
 @@ -123,10 +125,11 @@ class Autotuner(KernelInterface):
             )
             return
 
-        if do_bench is None:
-            self.do_bench = driver.active.get_benchmarker()
-        else:
@@ -62,9 +61,6 @@ index 69305dc94..4600542b8 100644
+        if self._do_bench is None:
+            return driver.active.get_benchmarker()
+        return self._do_bench
 
     def _bench(self, *args, config, **meta):
         from ..compiler.errors import CompileTimeAssertionFailure
--
2.43.0
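
The substance of the imported commit is a standard lazy-initialization move: rather than calling `driver.active.get_benchmarker()` inside `__init__`, which initializes the GPU driver as soon as an autotuned kernel object is constructed, the constructor now only stores the caller-supplied value, and a `cached_property` resolves the benchmarker on first access. A minimal standalone sketch of the pattern, with illustrative names (`Tuner`, `probe_hardware`) that are not Triton APIs:

```python
# Minimal sketch of the lazy-initialization pattern used by the patch above.
# Tuner and probe_hardware are illustrative stand-ins, not Triton code.
from functools import cached_property


def probe_hardware():
    # Stands in for driver.active.get_benchmarker(): an expensive call that
    # touches the GPU driver and should not run at construction time.
    print("probing hardware...")
    return lambda kernel_call, quantiles: kernel_call()


class Tuner:
    def __init__(self, do_bench=None):
        # Remember only what the caller passed; do no driver work yet.
        self._do_bench = do_bench

    @cached_property
    def do_bench(self):
        # Resolved once on first access, then cached on the instance.
        if self._do_bench is None:
            return probe_hardware()
        return self._do_bench


t = Tuner()   # constructing the tuner triggers no hardware probe
t.do_bench    # "probing hardware..." runs here, on first use
t.do_bench    # cached: the probe does not run a second time
```

Because `cached_property` stores its result in the instance `__dict__`, the public `do_bench` attribute keeps its old read semantics while the driver call moves out of the constructor.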

41 changes: 20 additions & 21 deletions pyproject.toml
@@ -41,41 +41,26 @@ description = "NeMo Export and Deploy - a library to export and deploy LLMs and
requires-python = ">=3.10,<3.13"
license = { text = "Apache 2.0" }
dependencies = [
"megatron-core>=0.14.0a0,<0.15.0",
"megatron-bridge>=0.1.0a0,<0.2.0",
"nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
"nvidia-resiliency-ext>=0.4.0a0,<0.5.0; sys_platform != 'darwin'",
"transformer-engine[pytorch]>=2.6.0a0,<2.7.0; sys_platform != 'darwin'",
"accelerate",
"megatron-bridge>=0.2.0a0,<0.3.0",
"megatron-core[mlm,dev]>=0.15.0a0,<0.16.0",
"fastapi",
"pydantic-settings",
"ray",
"ray[serve]",
"uvicorn",
"tensorstore",
"zarr>=2.18.2,<3.0.0",
# Lightning deps
"cloudpickle",
"fiddle",
"hydra-core>1.3,<=1.3.2",
"lightning",
"omegaconf>=2.3.0",
"peft",
"torch==2.7.1",
"torchvision",
"torchmetrics>=0.11.0",
"wandb",
"webdataset>=0.2.86",
"nvidia-pytriton ; platform_system != 'Darwin' ",
"flashinfer-python>=0.2.5 ; platform_system != 'Darwin'",
"Pillow ; platform_system != 'Darwin' and platform_machine != 'aarch64'",
"decord ; platform_system != 'Darwin' and platform_machine != 'aarch64'",
"pyparsing>2.0.2",
"sentencepiece",
"tiktoken",
"einops",
"ijson",
"pyarrow<21.0.0",
"peft",

]

[project.optional-dependencies]
@@ -99,6 +84,12 @@ linting = ["pre-commit>=3.6.0", "ruff~=0.9.0"]
 test = ["pytest", "pytest-mock", "coverage", "click"]
 nemo-toolkit = [
     "nemo-toolkit[automodel,common-only,nlp-only,eval,multimodal-only]>=2.5.0a0,<2.6.0",
+    # Lightning deps
+    "cloudpickle",
+    "fiddle",
+    "hydra-core>1.3,<=1.3.2",
+    "lightning",
+    "omegaconf>=2.3.0",
 ]
 nemo-run = ["nemo-run"]

@@ -115,7 +106,14 @@ transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git",

 [tool.uv]
 # Currently, TE must be built with no build-isolation b/c it requires torch
-no-build-isolation-package = ["transformer-engine", "transformer-engine-torch"]
+no-build-isolation-package = [
+    "transformer-engine",
+    "transformer-engine-torch",
+    "flash-attn",
+    "mamba-ssm",
+    "causal-conv1d",
+    "nv-grouped-gemm",
+]
# Always apply the build group since dependencies like TE/mcore/nemo-run require build dependencies
# and this lets us assume they are implicitly installed with a simply `uv sync`. Ideally, we'd
# avoid including these in the default dependency set, but for now it's required.
@@ -128,8 +126,9 @@ link-mode = "copy"
 conflicts = [[{ extra = "trtllm" }, { extra = "vllm" }, { extra = "trt-onnx" }]]
 override-dependencies = [
     "urllib3>1.27.0",
-    "tiktoken>=0.9.0", # because nemo-toolkit and megatron-bridge disagree on tiktoken, we need to pin it here,
+    "tiktoken>=0.9.0", # because nemo-toolkit and megatron-bridge disagree on tiktoken, we need to pin it here,
     "fsspec[http]>=2023.1.0,<=2024.9.0",
+    "megatron-energon[av-decode]>=6.0,<7.dev0", # because nemo-toolkit and megatron-core disagree on megatron-energon, we need to pin it here,
 ]
prerelease = "allow"
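
`override-dependencies` forces a single resolution for packages that the direct dependencies disagree on, so after a sync it is worth confirming the pins actually won. A small illustrative check (not part of this PR) using `importlib.metadata`:

```python
# Illustrative, not part of this PR: confirm the uv overrides resolved to
# versions inside the pinned ranges after `uv sync`.
from importlib.metadata import version

for pkg, pin in [("tiktoken", ">=0.9.0"), ("megatron-energon", ">=6.0,<7")]:
    print(f"{pkg}: installed {version(pkg)} (pinned {pin})")
```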
