Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit ee5f699

Browse files
DickJC123 and eric-haibin-lin
authored and committed
CudnnFind() usage improvements (#12804)
* Add mx.context.gpu_memory_info() to python api for flexible tests. * Add test_gluon_gpu.py:test_large_models to show cudnnFind headroom issue. * Output model sizes tried by test_gluon_gpu.py:test_large_models. * Fix perl interface to MXGetGPUMemoryInformation. * Increase difficulty of test_gluon_gpu.py:test_large_models. * Forgot a file in fix for perl. * Modify test to pass on no-cudnn CI runner. * Mutex algo reg updates, serialize cudnnFind calls. * Fix for cudnnFind memory headroom issue. * Fix cpplint. * Respond to reviewers comments. * Guard against improper MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE values. * Fix potentially unassigned var.
1 parent fef9b5c commit ee5f699

File tree

12 files changed

+707
-491
lines changed

12 files changed

+707
-491
lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,3 +187,4 @@ List of Contributors
187187
* [LuckyPigeon](https://github.com/LuckyPigeon)
188188
* [Anton Chernov](https://github.com/lebeg)
189189
* [Denisa Roberts](https://github.com/D-Roberts)
190+
* [Dick Carter](https://github.com/DickJC123)

docs/faq/env_var.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
6767
* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
6868
- Values: Int ```(default=24)```
6969
- The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
70+
* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
71+
- Values: Int ```(default=2097152)```
72+
- When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value.
73+
- The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable.
7074

7175
## Engine Type
7276

include/mxnet/base.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,11 @@ struct Context {
225225
/*!
226226
* \brief get the free and total available memory on a GPU
227227
* \param dev the GPU number to query
228-
* \param free_mem pointer to the integer holding free GPU memory
229-
* \param total_mem pointer to the integer holding total GPU memory
228+
* \param free_mem pointer to the uint64_t holding free GPU memory
229+
* \param total_mem pointer to the uint64_t holding total GPU memory
230230
* \return No return value
231231
*/
232-
inline static void GetGPUMemoryInformation(int dev, int *free, int *total);
232+
inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
233233
/*!
234234
* Create a pinned CPU context.
235235
* \param dev_id the device id for corresponding GPU.
@@ -334,8 +334,8 @@ inline int32_t Context::GetGPUCount() {
334334
#endif
335335
}
336336

337-
inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
338-
int *total_mem) {
337+
inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
338+
uint64_t *total_mem) {
339339
#if MXNET_USE_CUDA
340340

341341
size_t memF, memT;
@@ -354,8 +354,8 @@ inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
354354
e = cudaSetDevice(curDevice);
355355
CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
356356

357-
*free_mem = static_cast<int>(memF);
358-
*total_mem = static_cast<int>(memT);
357+
*free_mem = static_cast<uint64_t>(memF);
358+
*total_mem = static_cast<uint64_t>(memT);
359359

360360
#else
361361
LOG(FATAL)

include/mxnet/c_api.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,13 +441,23 @@ MXNET_DLL int MXGetGPUCount(int* out);
441441

442442
/*!
443443
* \brief get the free and total available memory on a GPU
444+
* Note: Deprecated, use MXGetGPUMemoryInformation64 instead.
444445
* \param dev the GPU number to query
445446
* \param free_mem pointer to the integer holding free GPU memory
446447
* \param total_mem pointer to the integer holding total GPU memory
447448
* \return 0 when success, -1 when failure happens
448449
*/
449450
MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem);
450451

452+
/*!
453+
* \brief get the free and total available memory on a GPU
454+
* \param dev the GPU number to query
455+
* \param free_mem pointer to the uint64_t holding free GPU memory
456+
* \param total_mem pointer to the uint64_t holding total GPU memory
457+
* \return 0 when success, -1 when failure happens
458+
*/
459+
MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem);
460+
451461
/*!
452462
* \brief get the MXNet library version as an integer
453463
* \param pointer to the integer holding the version number

perl-package/AI-MXNetCAPI/mxnet.i

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,13 +344,23 @@ int MXGetGPUCount(int* out);
344344

345345
/*!
346346
* \brief get the free and total available memory on a GPU
347+
* Note: deprecated, use MXGetGPUMemoryInformation64().
347348
* \param dev the GPU number to query
348349
* \param free_mem pointer to the integer holding free GPU memory
349350
* \param total_mem pointer to the integer holding total GPU memory
350351
* \return 0 when success, -1 when failure happens
351352
*/
352353
int MXGetGPUMemoryInformation(int dev, int *out, int *out);
353354

355+
/*!
356+
* \brief get the free and total available memory on a GPU
357+
* \param dev the GPU number to query
358+
* \param free_mem pointer to the uint64_t holding free GPU memory
359+
* \param total_mem pointer to the uint64_t holding total GPU memory
360+
* \return 0 when success, -1 when failure happens
361+
*/
362+
int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out);
363+
354364

355365
//-------------------------------------
356366
// Part 1: NDArray creation and deletion

python/mxnet/context.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,30 @@ def num_gpus():
258258
check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
259259
return count.value
260260

261+
def gpu_memory_info(device_id=0):
262+
"""Query CUDA for the free and total bytes of GPU global memory.
263+
264+
Parameters
265+
----------
266+
device_id : int, optional
267+
The device id of the GPU device.
268+
269+
Raises
270+
------
271+
Will raise an exception on any CUDA error.
272+
273+
Returns
274+
-------
275+
(free, total) : (int, int)
276+
The free and total GPU memory in bytes.
277+
278+
"""
279+
free = ctypes.c_uint64()
280+
total = ctypes.c_uint64()
281+
dev_id = ctypes.c_int(device_id)
282+
check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total)))
283+
return (free.value, total.value)
284+
261285
def current_context():
262286
"""Returns the current context.
263287

src/c_api/c_api.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,18 @@ int MXGetGPUCount(int* out) {
122122
API_END();
123123
}
124124

125+
// Deprecated: use MXGetGPUMemoryInformation64() instead.
125126
int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) {
127+
API_BEGIN();
128+
uint64_t free_mem64 = 0UL;
129+
uint64_t total_mem64 = 0UL;
130+
Context::GetGPUMemoryInformation(dev, &free_mem64, &total_mem64);
131+
*free_mem = static_cast<int>(free_mem64);
132+
*total_mem = static_cast<int>(total_mem64);
133+
API_END();
134+
}
135+
136+
int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem) {
126137
API_BEGIN();
127138
Context::GetGPUMemoryInformation(dev, free_mem, total_mem);
128139
API_END();

src/operator/nn/cudnn/cudnn_algoreg-inl.h

Lines changed: 28 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include <mutex>
3131
#include <string>
3232
#include <vector>
33+
#include <functional>
34+
#include <utility>
3335
#include "../../../common/cuda_utils.h"
3436
#include "../convolution-inl.h"
3537
#include "../deconvolution-inl.h"
@@ -65,7 +67,11 @@ class CuDNNAlgo {
6567
template<typename ParamType>
6668
class CuDNNAlgoReg {
6769
public:
68-
bool Find(const ParamType &param,
70+
using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *,
71+
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *,
72+
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>;
73+
74+
void FindOrElseRegister(const ParamType &param,
6975
const std::vector<TShape> &in_shape,
7076
const std::vector<TShape> &out_shape,
7177
cudnnDataType_t cudnn_data_type,
@@ -75,7 +81,8 @@ class CuDNNAlgoReg {
7581
bool add_to_weight,
7682
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
7783
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
78-
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
84+
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt,
85+
const AlgoSetter_t &algo_setter) {
7986
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
8087
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
8188
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
@@ -85,45 +92,28 @@ class CuDNNAlgoReg {
8592
*fwd = i->second.fwd;
8693
*bwd = i->second.bwd;
8794
*flt = i->second.flt;
88-
return true;
89-
}
90-
return false;
91-
}
92-
93-
void Register(const ParamType &param,
94-
const std::vector<TShape> &in_shape,
95-
const std::vector<TShape> &out_shape,
96-
cudnnDataType_t cudnn_data_type,
97-
cudnnDataType_t cudnn_forward_compute_type,
98-
cudnnDataType_t cudnn_backward_compute_type,
99-
int sm_arch,
100-
bool add_to_weight,
101-
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
102-
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
103-
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
104-
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
105-
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
106-
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
107-
std::lock_guard<std::mutex> guard(lock_);
108-
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
109-
LOG(INFO) << "Running performance tests to find the best convolution "
110-
"algorithm, "
111-
"this can take a while... (setting env variable "
112-
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
113-
if (reg_.size() >= 1000) {
114-
// Many people are very concerned about this warning, so change the warning once.
115-
if (!is_warning_autotune_) {
116-
LOG(INFO)
117-
<< "If you see this message in the middle of training, you are "
118-
"probably using bucketing. Consider setting env variable "
119-
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
120-
is_warning_autotune_ = true;
95+
} else {
96+
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
97+
LOG(INFO) << "Running performance tests to find the best convolution "
98+
"algorithm, "
99+
"this can take a while... (setting env variable "
100+
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
101+
if (reg_.size() >= 1000) {
102+
// Many people are very concerned about this warning, so change the warning once.
103+
if (!is_warning_autotune_) {
104+
LOG(INFO)
105+
<< "If you see this message in the middle of training, you are "
106+
"probably using bucketing. Consider setting env variable "
107+
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
108+
is_warning_autotune_ = true;
109+
}
121110
}
122111
}
112+
// Call provided function to determine the algos- likely uses cudnnFind() or cudnnGet()
113+
algo_setter(fwd, bwd, flt);
114+
// Save result so future lookups hit in this registry
115+
reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt}));
123116
}
124-
reg_[key].fwd = fwd;
125-
reg_[key].bwd = bwd;
126-
reg_[key].flt = flt;
127117
}
128118

129119
static CuDNNAlgoReg *Get();

0 commit comments

Comments
 (0)