From a419d953e5d3d7431404ce782a895770818ccd4f Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Tue, 21 May 2024 11:43:04 +0000
Subject: [PATCH 1/7] load all once

---
 CMakeLists.txt                                |   6 +
 paddle/common/flags.cc                        |   4 +
 paddle/phi/backends/dynload/dynamic_loader.cc | 111 ++++++++++++++--
 paddle/phi/common/port.cc                     |  12 +-
 python/paddle/__init__.py                     | 121 ++++++++++++++++++
 python/setup.py.in                            |  26 +++-
 setup.py                                      |  88 ++++++++-----
 7 files changed, 326 insertions(+), 42 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0aa41a26d700e2..dcff4da2aacb3f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,12 @@ if(WITH_GPU AND WITH_ROCM)
 endif()
 
 if(WITH_GPU AND NOT APPLE)
+  if(WIN32)
+    add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
+    set(WITH_PIP_CUDA_LIBRARIES
+        ON
+        CACHE BOOL "" FORCE)
+  endif()
   #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
   if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
                                             "x86_64")
diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc
index c9b3b29115d757..e36084047835c0 100644
--- a/paddle/common/flags.cc
+++ b/paddle/common/flags.cc
@@ -1713,3 +1713,7 @@ PHI_DEFINE_EXPORTED_string(cusolver_dir,  // NOLINT
 PHI_DEFINE_EXPORTED_string(cusparse_dir,  // NOLINT
                            "",
                            "Specify path for loading libcusparse.so.*.");
+PHI_DEFINE_EXPORTED_string(
+    win_cuda_bin_dir,  // NOLINT
+    "",
+    "Specify path for loading *.dll about cuda on windows");
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index fc3d6b6c9c1161..0f07bcf56e150f 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include <dirent.h>
 
+#include <codecvt>
 #include <cstdlib>
 #include <string>
 #include <vector>
@@ -45,6 +46,7 @@ COMMON_DECLARE_string(cusparselt_dir);
 COMMON_DECLARE_string(curand_dir);
 COMMON_DECLARE_string(cusolver_dir);
 COMMON_DECLARE_string(cusparse_dir);
+COMMON_DECLARE_string(win_cuda_bin_dir);
 #ifdef PADDLE_WITH_HIP
 
 PHI_DEFINE_string(miopen_dir,
@@ -132,8 +134,12 @@ static constexpr char* win_cufft_lib =
 
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
-  // directory separator
+// directory separator
+#if defined(_WIN32)
+  const char sep = '\\';
+#else
   const char sep = '/';
+#endif
   if (!part2.empty() && part2.front() == sep) {
     return part2;
   }
@@ -263,6 +269,26 @@ static inline void* GetDsoHandleFromSearchPath(
 #else
   int dynload_flags = 0;
 #endif  // !_WIN32
+#if defined(_WIN32)
+  std::vector<std::wstring> cuda_bin_search_path = {
+      L"cublas",
+      L"cuda_nvrtc",
+      L"cuda_runtime",
+      L"cudnn",
+      L"cufft",
+      L"curand",
+      L"cusolver",
+      L"cusparse",
+      L"nvjitlink",
+  };
+  for (auto search_path : cuda_bin_search_path) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    std::wstring win_path_wstring =
+        converter.from_bytes(FLAGS_win_cuda_bin_dir);
+    search_path = win_path_wstring + L"\\" + search_path + L"\\bin";
+    AddDllDirectory(search_path.c_str());
+  }
+#endif
   std::vector<std::string> dso_names = split(dso_name, ";");
   void* dso_handle = nullptr;
   for (auto const& dso : dso_names) {
@@ -324,8 +350,26 @@ void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(
-      FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
@@ -403,8 +447,13 @@ void* GetCUDNNDsoHandle() {
       "Toolkit\\CUDA\\v10.0\n"
       "You should do this according to your CUDA installation directory and "
       "CUDNN version.");
+#ifdef WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
+#else
   return GetDsoHandleFromSearchPath(
-      FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+      FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+#endif
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
 #else
@@ -461,8 +510,13 @@ void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+#ifdef WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path});
+#else
   return GetDsoHandleFromSearchPath(
       FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path});
+#endif
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
 #else
@@ -500,8 +554,13 @@ void* GetCusolverDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
+#ifdef WITH_PIP_CUDA_LIBRARIES
+  return GetDsoHandleFromSearchPath(
+      FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path});
+#else
   return GetDsoHandleFromSearchPath(
       FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
+#endif
 #else
 #ifdef WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
@@ -515,8 +574,26 @@ void* GetCusparseDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(
-      FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
@@ -709,8 +786,26 @@ void* GetCUFFTDsoHandle() {
     return nullptr;
   }
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(
-      FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
+#endif
+  } else {
+    std::string warning_msg(
+        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "temporarily no longer supports");
+    return nullptr;
+  }
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
 #endif
diff --git a/paddle/phi/common/port.cc b/paddle/phi/common/port.cc
index 8c94232260aef3..41c127ddb2e415 100644
--- a/paddle/phi/common/port.cc
+++ b/paddle/phi/common/port.cc
@@ -18,7 +18,6 @@
 #include <memory>
 #include <stdexcept>
 #include <string>
-
 #include "glog/logging.h"
 
 #if !defined(_WIN32)
@@ -42,7 +41,14 @@ void *dlsym(void *handle, const char *symbol_name) {
 
 void *dlopen(const char *filename, int flag) {
   std::string file_name(filename);
-  HMODULE hModule = LoadLibrary(file_name.c_str());
+  HMODULE hModule = nullptr;
+#ifdef WITH_PIP_CUDA_LIBRARIES
+  hModule =
+      LoadLibraryEx(file_name.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
+#endif
+  if (!hModule) {
+    hModule = LoadLibrary(file_name.c_str());
+  }
   if (!hModule) {
     if (flag) {
       throw std::runtime_error(file_name + " not found.");
@@ -72,7 +78,7 @@ int gettimeofday(struct timeval *tp, void *tzp) {
 
   return (0);
 }
-#endif              // !_WIN32
+#endif  // !_WIN32
 
 void ExecShellCommand(const std::string &cmd, std::string *message) {
   std::array<char, 128> buffer;
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index a9e3345474f4c8..3c1f10c538d32b 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -570,6 +570,7 @@
     if os.path.exists(cuh_file):
         os.environ.setdefault('runtime_include_dir', runtime_include_dir)
 
+
 if is_compiled_with_cuda():
     import os
     import platform
@@ -601,6 +602,126 @@
         cupti_dir_lib_path = package_dir + "/.." + "/nvidia/cuda_cupti/lib"
         set_flags({"FLAGS_cupti_dir": cupti_dir_lib_path})
 
+    elif (
+        platform.system() == 'Windows'
+        and platform.machine() in ('x86_64', 'AMD64')
+        and paddle.version.with_pip_cuda_libraries == 'ON'
+    ):
+        package_dir = os.path.dirname(os.path.abspath(__file__))
+        win_cuda_bin_path = package_dir + "\\.." + "\\nvidia"
+        set_flags({"FLAGS_win_cuda_bin_dir": win_cuda_bin_path})
+
+        import sys
+
+        if sys.platform == 'win32':
+            pfiles_path = os.getenv('ProgramFiles', 'C:\\Program Files')
+            py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin')
+            th_dll_path = os.path.join(os.path.dirname(__file__), 'libs')
+            site_cuda_base_path = os.path.join(
+                os.path.dirname(__file__), '..', 'nvidia'
+            )
+            site_cuda_list = [
+                "cublas",
+                "cuda_nvrtc",
+                "cuda_runtime",
+                "cudnn",
+                "cufft",
+                "curand",
+                "cusolver",
+                "cusparse",
+                "nvjitlink",
+            ]
+
+            if sys.exec_prefix != sys.base_exec_prefix:
+                base_py_dll_path = os.path.join(
+                    sys.base_exec_prefix, 'Library', 'bin'
+                )
+            else:
+                base_py_dll_path = ''
+
+            dll_paths = list(
+                filter(
+                    os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path]
+                )
+            )
+            for site_cuda_package in site_cuda_list:
+                site_cuda_path = os.path.join(
+                    site_cuda_base_path, site_cuda_package, 'bin'
+                )
+                if os.path.exists(site_cuda_path):
+                    dll_paths.append(site_cuda_path)
+
+            cuda_version = paddle.version.cuda_version
+            cuda_path_var = 'CUDA_PATH_V' + cuda_version.replace('.', '_')
+            default_path = os.path.join(
+                pfiles_path,
+                'NVIDIA GPU Computing Toolkit',
+                'CUDA',
+                'v' + cuda_version,
+            )
+            cuda_path = os.path.join(
+                os.getenv(cuda_path_var, default_path), 'bin'
+            )
+
+            dll_paths.extend(filter(os.path.exists, [cuda_path]))
+            import ctypes
+
+            kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
+            with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
+            prev_error_mode = kernel32.SetErrorMode(0x0001)
+
+            kernel32.LoadLibraryW.restype = ctypes.c_void_p
+            if with_load_library_flags:
+                kernel32.LoadLibraryExW.restype = ctypes.c_void_p
+
+            for dll_path in dll_paths:
+                os.add_dll_directory(dll_path)
+
+            try:
+                ctypes.CDLL('vcruntime140.dll')
+                ctypes.CDLL('msvcp140.dll')
+                ctypes.CDLL('vcruntime140_1.dll')
+            except OSError:
+                print(
+                    '''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
+                        It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe'''
+                )
+            import glob
+
+            dlls = glob.glob(os.path.join(th_dll_path, '*.dll'))
+            for site_cuda_package in site_cuda_list:
+                site_cuda_path = os.path.join(
+                    site_cuda_base_path, site_cuda_package, 'bin'
+                )
+                if os.path.exists(site_cuda_path):
+                    dlls.extend(
+                        glob.glob(os.path.join(site_cuda_path, '*.dll'))
+                    )
+            path_patched = False
+            for dll in dlls:
+                is_loaded = False
+                print("dll:", dll)
+                if with_load_library_flags:
+                    res = kernel32.LoadLibraryW(dll)
+                    last_error = ctypes.get_last_error()
+                    if res is None and last_error != 126:
+                        err = ctypes.WinError(last_error)
+                        err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                        raise err
+                    elif res is not None:
+                        is_loaded = True
+                if not is_loaded:
+                    if not path_patched:
+                        os.environ['PATH'] = ';'.join(
+                            dll_paths + [os.environ['PATH']]
+                        )
+                        path_patched = True
+                    res = kernel32.LoadLibraryW(dll)
+                    if res is None:
+                        err = ctypes.WinError(ctypes.get_last_error())
+                        err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                        raise err
+            kernel32.SetErrorMode(prev_error_mode)
 
 disable_static()
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 1f4cd1145ccc1b..96e2334946635b 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -425,7 +425,8 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/i
 def get_paddle_extra_install_requirements():
     #(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn.
     if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
-        PADDLE_CUDA_INSTALL_REQUIREMENTS = {
+        if platform.system() == 'Linux':
+            PADDLE_CUDA_INSTALL_REQUIREMENTS = {
             "V11": (
                 "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
                 "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@@ -453,6 +454,27 @@ def get_paddle_extra_install_requirements():
                 "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'"
             ),
         }
+        elif platform.system() == 'Windows':
+            PADDLE_CUDA_INSTALL_REQUIREMENTS = {
+                "V11": (
+                    "nvidia-cuda-runtime-cu11==11.8.89 | "
+                    "nvidia-cudnn-cu11==8.9.4.19 | "
+                    "nvidia-cublas-cu11==11.11.3.6 | "
+                    "nvidia-cufft-cu11==10.9.0.58 | "
+                    "nvidia-curand-cu11==10.3.0.86 | "
+                    "nvidia-cusolver-cu11==11.4.1.48 | "
+                    "nvidia-cusparse-cu11==11.7.5.86 "
+                ),
+                "V12": (
+                    "nvidia-cuda-runtime-cu12==12.4.127 | "
+                    "nvidia-cudnn-cu12==9.0.0.312 | "
+                    "nvidia-cublas-cu12==12.4.5.8 | "
+                    "nvidia-cufft-cu12==11.2.1.3 | "
+                    "nvidia-curand-cu12==10.3.5.147 | "
+                    "nvidia-cusolver-cu12==11.6.1.9 | "
+                    "nvidia-cusparse-cu12==12.3.1.170 "
+                ),
+            }
         try:
             output = subprocess.check_output(['nvcc', '--version']).decode('utf-8')
             version_line = [line for line in output.split('\n') if 'release' in line][0]
@@ -654,7 +676,7 @@ if sys.version_info >= (3,8):
             continue
         setup_requires_tmp+=[setup_requires_i]
     setup_requires = setup_requires_tmp
-    if platform.system() == 'Linux' and platform.machine() == 'x86_64':
+    if '@WITH_GPU@' == 'ON' and platform.system() in ('Linux', 'Windows') and platform.machine() in ('x86_64', 'AMD64'):
         paddle_cuda_requires = get_paddle_extra_install_requirements()
         setup_requires += paddle_cuda_requires
 
diff --git a/setup.py b/setup.py
index 756f1334ed213e..105f4fa2ef13fa 100644
--- a/setup.py
+++ b/setup.py
@@ -955,34 +955,56 @@ def get_setup_requires():
 def get_paddle_extra_install_requirements():
     # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn.
     if env_dict.get("WITH_PIP_CUDA_LIBRARIES") == "ON":
-        PADDLE_CUDA_INSTALL_REQUIREMENTS = {
-            "V11": (
-                "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64'"
-            ),
-            "V12": (
-                "nvidia-cuda-runtime-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cuda-cupti-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cudnn-cu12==9.0.0.312; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cublas-cu12==12.3.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-                "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'"
-            ),
-        }
+        if platform.system() == 'Linux':
+            PADDLE_CUDA_INSTALL_REQUIREMENTS = {
+                "V11": (
+                    "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64'"
+                ),
+                "V12": (
+                    "nvidia-cuda-runtime-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cuda-cupti-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cudnn-cu12==9.0.0.312; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cublas-cu12==12.3.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                    "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'"
+                ),
+            }
+        elif platform.system() == 'Windows':
+            PADDLE_CUDA_INSTALL_REQUIREMENTS = {
+                "V11": (
+                    "nvidia-cuda-runtime-cu11==11.8.89 | "
+                    "nvidia-cudnn-cu11==8.9.4.19 | "
+                    "nvidia-cublas-cu11==11.11.3.6 | "
+                    "nvidia-cufft-cu11==10.9.0.58 | "
+                    "nvidia-curand-cu11==10.3.0.86 | "
+                    "nvidia-cusolver-cu11==11.4.1.48 | "
+                    "nvidia-cusparse-cu11==11.7.5.86 "
+                ),
+                "V12": (
+                    "nvidia-cuda-runtime-cu12==12.4.127 | "
+                    "nvidia-cudnn-cu12==9.0.0.312 | "
+                    "nvidia-cublas-cu12==12.4.5.8 | "
+                    "nvidia-cufft-cu12==11.2.1.3 | "
+                    "nvidia-curand-cu12==10.3.5.147 | "
+                    "nvidia-cusolver-cu12==11.6.1.9 | "
+                    "nvidia-cusparse-cu12==12.3.1.170 "
+                ),
+            }
         try:
             output = subprocess.check_output(['nvcc', '--version']).decode(
                 'utf-8'
@@ -1465,7 +1487,15 @@ def get_headers():
 def get_setup_parameters():
     # get setup_requires
     setup_requires = get_setup_requires()
-    if platform.system() == 'Linux' and platform.machine() == 'x86_64':
+    if (
+        env_dict.get("WITH_GPU") == 'ON'
+        and platform.system() in ('Linux', 'Windows')
+        and platform.machine()
+        in (
+            'x86_64',
+            'AMD64',
+        )
+    ):
         paddle_cuda_requires = get_paddle_extra_install_requirements()
         setup_requires += paddle_cuda_requires
 

From f28a30126d853dd3e81e7c903981166aa24aea27 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Tue, 21 May 2024 12:07:25 +0000
Subject: [PATCH 2/7] fix 32 bits dll error

---
 python/paddle/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 3c1f10c538d32b..df5f93fbe8077d 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -697,6 +697,8 @@
                     dlls.extend(
                         glob.glob(os.path.join(site_cuda_path, '*.dll'))
                     )
+            # Do not load 32-bit DLLs in 64-bit Python.
+            dlls = [dll for dll in dlls if '32_' not in dll]
             path_patched = False
             for dll in dlls:
                 is_loaded = False

From 7fd89e86eb1f507f63096232b1dcd3809c09db12 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Wed, 22 May 2024 02:46:21 +0000
Subject: [PATCH 3/7] fix

---
 paddle/phi/backends/dynload/dynamic_loader.cc | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 0f07bcf56e150f..5a2b0d80cb99a3 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -447,13 +447,23 @@ void* GetCUDNNDsoHandle() {
       "Toolkit\\CUDA\\v10.0\n"
       "You should do this according to your CUDA installation directory and "
       "CUDNN version.");
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
-  return GetDsoHandleFromSearchPath(
-      FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
 #else
-  return GetDsoHandleFromSearchPath(
-      FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
+#endif
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+#ifdef WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg);
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
 #endif
+  }
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
 #else

From 7102b17b5d1917beb8f108167085cfdca2d6c2c1 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Wed, 22 May 2024 04:09:14 +0000
Subject: [PATCH 4/7] fix conflict with cuda env

---
 python/paddle/__init__.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index df5f93fbe8077d..af76944bdc5dc9 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -663,7 +663,6 @@
                 os.getenv(cuda_path_var, default_path), 'bin'
             )
 
-            dll_paths.extend(filter(os.path.exists, [cuda_path]))
             import ctypes
 
             kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
@@ -704,7 +703,7 @@
                 is_loaded = False
                 print("dll:", dll)
                 if with_load_library_flags:
-                    res = kernel32.LoadLibraryW(dll)
+                    res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
                     last_error = ctypes.get_last_error()
                     if res is None and last_error != 126:
                         err = ctypes.WinError(last_error)
@@ -714,11 +713,15 @@
                         is_loaded = True
                 if not is_loaded:
                     if not path_patched:
+                        dll_paths.extend(filter(os.path.exists, [cuda_path]))
+                        prev_path = os.environ['PATH']
                         os.environ['PATH'] = ';'.join(
                             dll_paths + [os.environ['PATH']]
                         )
                         path_patched = True
                     res = kernel32.LoadLibraryW(dll)
+                    if path_patched:
+                        os.environ['PATH'] = prev_path
                     if res is None:
                         err = ctypes.WinError(ctypes.get_last_error())
                         err.strerror += f' Error loading "{dll}" or one of its dependencies.'

From 69d98da4549b529bd48417ad392a1259a134d1d9 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Wed, 22 May 2024 18:00:12 +0000
Subject: [PATCH 5/7] fix_cudnn

---
 paddle/phi/backends/dynload/dynamic_loader.cc | 12 ++++++------
 python/paddle/__init__.py                     | 13 -------------
 2 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 5a2b0d80cb99a3..24e18143dbe155 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -357,7 +357,7 @@ void* GetCublasDsoHandle() {
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll");
 #else
@@ -447,7 +447,7 @@ void* GetCUDNNDsoHandle() {
       "Toolkit\\CUDA\\v10.0\n"
       "You should do this according to your CUDA installation directory and "
       "CUDNN version.");
-  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
+  if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
@@ -455,7 +455,7 @@ void* GetCUDNNDsoHandle() {
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+  } else if (CUDA_VERSION >= 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg);
@@ -496,7 +496,7 @@ void* GetCUPTIDsoHandle() {
         FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
 #endif
 
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path});
@@ -591,7 +591,7 @@ void* GetCusparseDsoHandle() {
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll");
 #else
@@ -803,7 +803,7 @@ void* GetCUFFTDsoHandle() {
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll");
 #else
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index af76944bdc5dc9..af2bc334ce717e 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -651,18 +651,6 @@
                 if os.path.exists(site_cuda_path):
                     dll_paths.append(site_cuda_path)
 
-            cuda_version = paddle.version.cuda_version
-            cuda_path_var = 'CUDA_PATH_V' + cuda_version.replace('.', '_')
-            default_path = os.path.join(
-                pfiles_path,
-                'NVIDIA GPU Computing Toolkit',
-                'CUDA',
-                'v' + cuda_version,
-            )
-            cuda_path = os.path.join(
-                os.getenv(cuda_path_var, default_path), 'bin'
-            )
-
             import ctypes
 
             kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
@@ -713,7 +701,6 @@
                         is_loaded = True
                 if not is_loaded:
                     if not path_patched:
-                        dll_paths.extend(filter(os.path.exists, [cuda_path]))
                         prev_path = os.environ['PATH']
                         os.environ['PATH'] = ';'.join(
                             dll_paths + [os.environ['PATH']]

From 31e928955fac9a5dc31d603cb7ed1b8964aff289 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Tue, 28 May 2024 03:29:03 +0000
Subject: [PATCH 6/7] fix version: pin cuda-runtime-cu12 and cublas-cu12 to 12.3 releases

---
 python/setup.py.in | 4 ++--
 setup.py           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 96e2334946635b..655b1c877aa699 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -466,9 +466,9 @@ def get_paddle_extra_install_requirements():
                     "nvidia-cusparse-cu11==11.7.5.86 "
                 ),
                 "V12": (
-                    "nvidia-cuda-runtime-cu12==12.4.127 | "
+                    "nvidia-cuda-runtime-cu12==12.3.101 | "
                     "nvidia-cudnn-cu12==9.0.0.312 | "
-                    "nvidia-cublas-cu12==12.4.5.8 | "
+                    "nvidia-cublas-cu12==12.3.4.1 | "
                     "nvidia-cufft-cu12==11.2.1.3 | "
                     "nvidia-curand-cu12==10.3.5.147 | "
                     "nvidia-cusolver-cu12==11.6.1.9 | "
diff --git a/setup.py b/setup.py
index 105f4fa2ef13fa..1b03257c193c17 100644
--- a/setup.py
+++ b/setup.py
@@ -996,9 +996,9 @@ def get_paddle_extra_install_requirements():
                     "nvidia-cusparse-cu11==11.7.5.86 "
                 ),
                 "V12": (
-                    "nvidia-cuda-runtime-cu12==12.4.127 | "
+                    "nvidia-cuda-runtime-cu12==12.3.101 | "
                     "nvidia-cudnn-cu12==9.0.0.312 | "
-                    "nvidia-cublas-cu12==12.4.5.8 | "
+                    "nvidia-cublas-cu12==12.3.4.1 | "
                     "nvidia-cufft-cu12==11.2.1.3 | "
                     "nvidia-curand-cu12==10.3.5.147 | "
                     "nvidia-cusolver-cu12==11.6.1.9 | "

From c079dfa5bc835fe3df7b17814b0d206563d5dff7 Mon Sep 17 00:00:00 2001
From: xuxinyi04 <xuxinyi04@baidu.com>
Date: Tue, 28 May 2024 07:48:09 +0000
Subject: [PATCH 7/7] close switch: stop forcing WITH_PIP_CUDA_LIBRARIES on WIN32

---
 CMakeLists.txt                                | 6 ------
 paddle/phi/backends/dynload/dynamic_loader.cc | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcff4da2aacb3f..0aa41a26d700e2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,12 +99,6 @@ if(WITH_GPU AND WITH_ROCM)
 endif()
 
 if(WITH_GPU AND NOT APPLE)
-  if(WIN32)
-    add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
-    set(WITH_PIP_CUDA_LIBRARIES
-        ON
-        CACHE BOOL "" FORCE)
-  endif()
   #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
   if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
                                             "x86_64")
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 24e18143dbe155..783792203cf7a9 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -496,7 +496,7 @@ void* GetCUPTIDsoHandle() {
         FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
 #endif
 
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
 #ifdef WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path});