Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
62dbe07
Add CUPTI Cython API
pentschev Aug 29, 2025
ffa74e8
Minor constness improvement
pentschev Aug 29, 2025
9395746
Add tests
pentschev Aug 29, 2025
6d644cd
Merge remote-tracking branch 'upstream/branch-25.10' into cupti-python
pentschev Sep 9, 2025
afb0582
Remove types from docstring
pentschev Sep 9, 2025
0378f20
Raise error on init
pentschev Sep 9, 2025
1aac972
Fix linting
pentschev Sep 9, 2025
2b843c1
Add cupti to conda recipe
pentschev Sep 9, 2025
19582a7
Fix punctuation
pentschev Sep 9, 2025
0270c36
Remove return types from pyx file
pentschev Sep 9, 2025
0cefd9b
Rename to bool_t
pentschev Sep 9, 2025
f4054b8
Merge remote-tracking branch 'origin/cupti-python' into cupti-python
pentschev Sep 9, 2025
deb328e
Add psutil C++ test dependency
pentschev Sep 9, 2025
c0a248c
Revert "Add psutil C++ test dependency"
pentschev Sep 9, 2025
7a52d67
Add CUPTI memory monitor to bulk_mpi_shuffle example
pentschev Sep 10, 2025
2545638
Add standalone CUPTI example
pentschev Sep 10, 2025
d8c01a7
Merge remote-tracking branch 'upstream/branch-25.10' into cupti-python
pentschev Sep 10, 2025
7f84d74
Fix linting
pentschev Sep 10, 2025
189de6f
Remove unnecessary comments
pentschev Sep 10, 2025
001c63b
Fix linting (again)
pentschev Sep 10, 2025
f03eb18
Merge remote-tracking branch 'upstream/branch-25.10' into cupti-python
pentschev Sep 15, 2025
e890c99
Remove cupti module import from init
pentschev Sep 15, 2025
05b5f29
Remove return type annotations from Cython
pentschev Sep 15, 2025
1f06542
Merge remote-tracking branch 'upstream/branch-25.10' into cupti-python
pentschev Sep 15, 2025
8c9200b
Remove unused typing import
pentschev Sep 15, 2025
558c82c
Centralize tests skip when CUPTI is unavailable
pentschev Sep 15, 2025
7d513c2
Add CuPy import back
pentschev Sep 15, 2025
8daa0b3
Remove overzealous exception
pentschev Sep 15, 2025
1c8d201
Test cleanup
pentschev Sep 15, 2025
704c639
Revert "Centralize tests skip when CUPTI is unavailable"
pentschev Sep 15, 2025
801ca13
Improve formatting
pentschev Sep 15, 2025
e987763
Merge remote-tracking branch 'origin/cupti-python' into cupti-python
pentschev Sep 15, 2025
ff5a5e4
Remove unnecessary CUPTI check
pentschev Sep 15, 2025
fd77925
Fix linting
pentschev Sep 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion conda/recipes/rapidsmpf/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,12 @@ requirements:
- ${{ compiler("c") }}
- ${{ compiler("cxx") }}
- ${{ compiler("cuda") }}
- cuda-cupti-dev
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We typically have only compilers in build and all CUDA libraries (e.g. libcublas-dev, cuda-cupti-dev, etc.) in host. (Because the compiler depends on cuda-cudart-dev, it ends up in both build and host.) One way to think about it is that if you were cross-compiling, you'd need your toolchain like the compiler executables in build but all the libraries you're building against go in host. For this reason I suspect we should only need cuda-cupti-dev in host, and that if it isn't working that way, we have some kind of build system bug that needs to be identified and addressed.

For now, I don't want to block further work, but let's flag this:

Suggested change
- cuda-cupti-dev
- cuda-cupti-dev # TODO: This should only be needed in host, and may indicate a packaging issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following an offline discussion, instead of adding a comment we opened #510 to track this, since this PR only extends changes already made in #445, where cuda-cupti-dev is part of the build section of librapidsmpf's conda recipe.

- cuda-version =${{ cuda_version }}
- ${{ stdlib("c") }}
host:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-version =${{ cuda_version }}
- cython >=3.0.0
- librapidsmpf =${{ version }}
Expand All @@ -74,10 +76,11 @@ requirements:
- rmm =${{ minor_version }}
- scikit-build-core >=0.10.0
run:
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- cuda-cupti
- if: cuda_major == "12"
then: cuda-python >=12.9.2,<13.0a0
else: cuda-python >=13.0.1,<14.0a0
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- cudf =${{ minor_version }}
- cupy >=13.6.0
- librapidsmpf =${{ version }}
Expand All @@ -98,6 +101,7 @@ requirements:
- ${{ stdlib("c") }}
by_name:
- cuda-cudart
- cuda-cupti
- cuda-version
- openmpi
- mpi4py
Expand Down
2 changes: 1 addition & 1 deletion cpp/examples/example_cupti_monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ int main() {
// Allocate device memory using rmm::device_buffer
rmm::device_buffer buf(allocation_size, rmm::cuda_stream_default);
device_buffers.push_back(std::move(buf));
} catch (rmm::bad_alloc& e) {
} catch (rmm::bad_alloc const& e) {
std::cerr << "rmm::device_buffer allocation failed: " << e.what()
<< std::endl;
break;
Expand Down
5 changes: 5 additions & 0 deletions python/rapidsmpf/rapidsmpf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ set(cython_modules config.pyx cuda_stream.pyx progress_thread.pyx rmm_resource_a
shuffler.pyx statistics.pyx
)

# Add the cupti module conditionally: cupti.pyx is appended to cython_modules
# (the list handed to rapids_cython_create_modules as SOURCE_FILES) only when
# RAPIDSMPF_HAVE_CUPTI is truthy, so builds without CUPTI support do not compile it.
# NOTE(review): where RAPIDSMPF_HAVE_CUPTI gets set is not visible here — confirm.
if(RAPIDSMPF_HAVE_CUPTI)
list(APPEND cython_modules cupti.pyx)
endif()

rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_modules}"
Expand Down
6 changes: 6 additions & 0 deletions python/rapidsmpf/rapidsmpf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from __future__ import annotations

import contextlib

from rapidsmpf._version import __git_commit__, __version__ # noqa: F401

# If librapidsmpf was installed as a wheel, we must request it to load the
Expand All @@ -16,3 +18,7 @@
else:
librapidsmpf.load_library()
del librapidsmpf

# Import the CUPTI module if available (it is only built when CUPTI is found).
# ImportError is suppressed so that importing this package still succeeds when
# the optional cupti extension is absent.
# NOTE(review): this also swallows an ImportError raised by a cupti module that
# was built but fails to load — confirm that a silent fallback is intended.
with contextlib.suppress(ImportError):
from rapidsmpf import cupti # noqa: F401
56 changes: 56 additions & 0 deletions python/rapidsmpf/rapidsmpf/cupti.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

from libc.stddef cimport size_t
from libcpp cimport bool as cpp_bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector


# Minimal binding of std::chrono::milliseconds. Only the constructor taking a
# `long long` is declared; the type is used below as the
# `sampling_interval_ms` parameter of cpp_CuptiMonitor's constructor.
# `except +` lets Cython translate a C++ exception into a Python exception.
cdef extern from "<chrono>" namespace "std::chrono" nogil:
cdef cppclass milliseconds:
milliseconds(long long) except +

# CUPTI callback identifier, declared as an opaque enum: no enumerators are
# listed (`pass`). In this file it is only used as the key type of the map
# returned by cpp_CuptiMonitor.get_callback_counters().
cdef extern from "<cupti.h>" nogil:
ctypedef enum CUpti_CallbackId:
pass


# C++ declarations from <rapidsmpf/cupti.hpp>, exposed to Cython under `cpp_`
# prefixed names. The block is `nogil`, so these declarations may be used
# without holding the GIL; every member function is declared `except +`, so a
# C++ exception is translated into a Python exception by Cython.
cdef extern from "<rapidsmpf/cupti.hpp>" nogil:
# Plain struct mirroring rapidsmpf::MemoryDataPoint (one memory sample).
# NOTE(review): units are not stated here — presumably the memory fields are
# byte counts and `timestamp` is a time value; confirm against the C++ header.
cdef struct cpp_MemoryDataPoint "rapidsmpf::MemoryDataPoint":
double timestamp
size_t free_memory
size_t total_memory
size_t used_memory

# Binding of rapidsmpf::CuptiMonitor.
cdef cppclass cpp_CuptiMonitor "rapidsmpf::CuptiMonitor":
# Constructor: takes a flag enabling periodic sampling and the sampling
# interval as a std::chrono::milliseconds value.
cpp_CuptiMonitor(
cpp_bool enable_periodic_sampling,
milliseconds sampling_interval_ms
) except +
# Monitoring lifecycle and state query.
void start_monitoring() except +
void stop_monitoring() except +
cpp_bool is_monitoring() except +
# Sample collection and access. get_memory_samples() returns a const
# reference to the vector of samples rather than a copy.
void capture_memory_sample() except +
const vector[cpp_MemoryDataPoint]& get_memory_samples() except +
void clear_samples() except +
size_t get_sample_count() except +
# Output helpers.
# NOTE(review): `threshold_mb` is presumably a threshold in megabytes — confirm.
void write_csv(const string& filename) except +
void set_debug_output(cpp_bool enabled, size_t threshold_mb) except +
# Callback counters, keyed by CUPTI callback id; the map is returned by value.
unordered_map[CUpti_CallbackId, size_t] get_callback_counters() except +
void clear_callback_counters() except +
size_t get_total_callback_count() except +
string get_callback_summary() except +


# Extension type wrapping a single rapidsmpf::MemoryDataPoint.
cdef class MemoryDataPoint:
# The wrapped C++ struct, stored by value inside the Python object.
cdef cpp_MemoryDataPoint _data

# Factory taking a C++ data point (passed by value) and returning the
# corresponding Python-level MemoryDataPoint; the implementation is not
# part of this declaration file.
@staticmethod
cdef MemoryDataPoint from_cpp(cpp_MemoryDataPoint data)


# Extension type wrapping a rapidsmpf::CuptiMonitor instance.
cdef class CuptiMonitor:
# Uniquely-owning pointer to the underlying C++ monitor object.
cdef unique_ptr[cpp_CuptiMonitor] _handle
33 changes: 33 additions & 0 deletions python/rapidsmpf/rapidsmpf/cupti.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

# Type stub for the MemoryDataPoint extension type: one memory sample.
# All four fields are exposed as properties with no setters declared here,
# i.e. they are read-only from the point of view of this stub.
# NOTE(review): units are not stated in the stub — presumably byte counts for
# the memory fields; confirm against the implementation.
class MemoryDataPoint:
@property
def timestamp(self) -> float: ...
@property
def free_memory(self) -> int: ...
@property
def total_memory(self) -> int: ...
@property
def used_memory(self) -> int: ...
def __repr__(self) -> str: ...

# Type stub for the CuptiMonitor extension type.
class CuptiMonitor:
# Periodic sampling is off by default; the sampling interval is given in
# milliseconds (per the parameter name) and defaults to 100.
def __init__(
self, enable_periodic_sampling: bool = False, sampling_interval_ms: int = 100
) -> None: ...
# Monitoring lifecycle and state query.
def start_monitoring(self) -> None: ...
def stop_monitoring(self) -> None: ...
def is_monitoring(self) -> bool: ...
# Sample collection and access; samples are returned as a list of
# MemoryDataPoint objects.
def capture_memory_sample(self) -> None: ...
def get_memory_samples(self) -> list[MemoryDataPoint]: ...
def clear_samples(self) -> None: ...
def get_sample_count(self) -> int: ...
# Output helpers. `threshold_mb` defaults to 10.
# NOTE(review): presumably a threshold in megabytes — confirm.
def write_csv(self, filename: str) -> None: ...
def set_debug_output(self, enabled: bool, threshold_mb: int = 10) -> None: ...
# Callback counters, returned as an int -> int mapping.
# NOTE(review): keys are presumably CUPTI callback ids and values call
# counts — confirm against the implementation.
def get_callback_counters(self) -> dict[int, int]: ...
def clear_callback_counters(self) -> None: ...
def get_total_callback_count(self) -> int: ...
def get_callback_summary(self) -> str: ...
Loading
Loading