Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ The committers are the granted write access to the project.
* [Sergey Kolychev](https://github.com/sergeykolychev)
- Sergey is original author and current maintainer of Perl5 interface.
* [Naveen Swamy](https://github.com/nswamy)
* [Marco de Abreu](https://github.com/marcoabreu)
- Marco is the creator of the current MXNet CI.
* [Carin Meier](https://github.com/gigasquid)
- Carin created and is the current maintainer for the Clojure interface.


### Become a Committer
MXNet is an open-source project and we are actively looking for new committers
Expand Down Expand Up @@ -153,8 +158,6 @@ List of Contributors
* [Manu Seth](https://github.com/mseth10/)
* [Calum Leslie](https://github.com/calumleslie)
* [Andre Tamm](https://github.com/andretamm)
* [Marco de Abreu](https://github.com/marcoabreu)
- Marco is the creator of the current MXNet CI.
* [Julian Salazar](https://github.com/JulianSlzr)
* [Meghna Baijal](https://github.com/mbaijal)
* [Tao Hu](https://github.com/dongzhuoyao)
Expand All @@ -178,3 +181,10 @@ List of Contributors
* [Aaron Markham](https://github.com/aaronmarkham)
* [Sam Skalicky](https://github.com/samskalicky)
* [Per Goncalves da Silva](https://github.com/perdasilva)
* [Zhijingcheng Yu](https://github.com/jasonyu1996)
* [Cheng-Che Lee](https://github.com/stu1130)
* [Chaitanya Bapat](https://github.com/ChaiBapchya)
* [LuckyPigeon](https://github.com/LuckyPigeon)
* [Anton Chernov](https://github.com/lebeg)
* [Denisa Roberts](https://github.com/D-Roberts)
* [Dick Carter](https://github.com/DickJC123)
4 changes: 4 additions & 0 deletions docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
- Values: Int ```(default=5)```
- The percentage of GPU memory to reserve for things other than the GPU array, such as kernel launch or cudnn handle space.
- If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value.
* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
- Values: Int ```(default=2097152)```
- When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value.
- The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable.

## Engine Type

Expand Down
14 changes: 7 additions & 7 deletions include/mxnet/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ struct Context {
/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return No return value
*/
inline static void GetGPUMemoryInformation(int dev, int *free, int *total);
inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
/*!
* Create a pinned CPU context.
* \param dev_id the device id for corresponding GPU.
Expand Down Expand Up @@ -334,8 +334,8 @@ inline int32_t Context::GetGPUCount() {
#endif
}

inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
int *total_mem) {
inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
uint64_t *total_mem) {
#if MXNET_USE_CUDA

size_t memF, memT;
Expand All @@ -354,8 +354,8 @@ inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
e = cudaSetDevice(curDevice);
CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

*free_mem = static_cast<int>(memF);
*total_mem = static_cast<int>(memT);
*free_mem = static_cast<uint64_t>(memF);
*total_mem = static_cast<uint64_t>(memT);

#else
LOG(FATAL)
Expand Down
10 changes: 10 additions & 0 deletions include/mxnet/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -439,13 +439,23 @@ MXNET_DLL int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* Note: Deprecated, use MXGetGPUMemoryInformation64 instead.
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem);

/*!
* \brief get the MXNet library version as an integer
* \param pointer to the integer holding the version number
Expand Down
9 changes: 9 additions & 0 deletions perl-package/AI-MXNetCAPI/mxnet.i
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,15 @@ int MXEngineSetBulkSize(int bulk_size, int* out);
*/
int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out);


//-------------------------------------
// Part 1: NDArray creation and deletion
Expand Down
24 changes: 24 additions & 0 deletions python/mxnet/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,30 @@ def num_gpus():
check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
return count.value

def gpu_memory_info(device_id=0):
    """Query CUDA for the free and total bytes of GPU global memory.

    Parameters
    ----------
    device_id : int, optional
        The device id of the GPU device.

    Raises
    ------
    Will raise an exception on any CUDA error.

    Returns
    -------
    (free, total) : (int, int)
        The free and total amount of GPU global memory, in bytes.

    """
    # 64-bit outputs: the legacy MXGetGPUMemoryInformation API used plain
    # ints, which overflow for GPUs with more than ~2 GiB of memory.
    free = ctypes.c_uint64()
    total = ctypes.c_uint64()
    dev_id = ctypes.c_int(device_id)
    check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total)))
    return (free.value, total.value)

def current_context():
"""Returns the current context.

Expand Down
11 changes: 11 additions & 0 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,18 @@ int MXGetGPUCount(int* out) {
API_END();
}

// Deprecated: use MXGetGPUMemoryInformation64() instead.
// NOTE(review): the `int` outputs silently truncate for GPUs with more than
// ~2 GiB of free/total memory; this wrapper exists only for backward
// compatibility with callers of the old 32-bit API.
int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) {
API_BEGIN();
uint64_t free_mem64 = 0UL;
uint64_t total_mem64 = 0UL;
// Query through the 64-bit path, then narrow to the legacy int outputs.
Context::GetGPUMemoryInformation(dev, &free_mem64, &total_mem64);
*free_mem = static_cast<int>(free_mem64);
*total_mem = static_cast<int>(total_mem64);
API_END();
}

int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem) {
API_BEGIN();
Context::GetGPUMemoryInformation(dev, free_mem, total_mem);
API_END();
Expand Down
66 changes: 28 additions & 38 deletions src/operator/nn/cudnn/cudnn_algoreg-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <mutex>
#include <string>
#include <vector>
#include <functional>
#include <utility>
#include "../../../common/cuda_utils.h"
#include "../convolution-inl.h"
#include "../deconvolution-inl.h"
Expand Down Expand Up @@ -65,7 +67,11 @@ class CuDNNAlgo {
template<typename ParamType>
class CuDNNAlgoReg {
public:
bool Find(const ParamType &param,
using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>;

void FindOrElseRegister(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
Expand All @@ -75,7 +81,8 @@ class CuDNNAlgoReg {
bool add_to_weight,
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt,
const AlgoSetter_t &algo_setter) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
Expand All @@ -85,45 +92,28 @@ class CuDNNAlgoReg {
*fwd = i->second.fwd;
*bwd = i->second.bwd;
*flt = i->second.flt;
return true;
}
return false;
}

void Register(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
} else {
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
}
}
}
// Call provided function to determine the algos- likely uses cudnnFind() or cudnnGet()
algo_setter(fwd, bwd, flt);
// Save result so future lookups hit in this registry
reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt}));
}
reg_[key].fwd = fwd;
reg_[key].bwd = bwd;
reg_[key].flt = flt;
}

static CuDNNAlgoReg *Get();
Expand Down
Loading