diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 926a7b1d69c7f8..c1c93e17fd82ea 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -53,7 +53,8 @@ if(NOT WITH_GPU)
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
-    add_definitions(-DPADDLE_WITH_GPU)
+    add_definitions(-DPADDLE_WITH_CUDA)
+
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 8dcc357a16a057..ad71c6fa563616 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -1,13 +1,13 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include "paddle/platform/device_context.h"
 #include "paddle/memory/memory.h"
@@ -15,93 +15,73 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
 
-template <>
-Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
-    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
-}
+#ifdef PADDLE_WITH_CUDA
 
-CPUDeviceContext::CPUDeviceContext() {
-  eigen_device_.reset(new Eigen::DefaultDevice());
+CUDADeviceContext::EigenCudaStreamDevice::EigenCudaStreamDevice()
+    : scratch_(nullptr), semaphore_(nullptr) {
+  Eigen::initializeDeviceProp();
 }
+
+CUDADeviceContext::EigenCudaStreamDevice::~EigenCudaStreamDevice() {}
 
-CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
-  eigen_device_.reset(new Eigen::DefaultDevice());
+void CUDADeviceContext::EigenCudaStreamDevice::SetValues(
+    const cudaStream_t* cuda_stream, GPUPlace place) {
+  stream_ = cuda_stream;
+  place_ = place;
+  device_prop_ = &Eigen::m_deviceProperties[place.device];
 }
 
-Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
-  return eigen_device_.get();
+const cudaStream_t& CUDADeviceContext::EigenCudaStreamDevice::stream() const {
+  return *stream_;
 }
 
-Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
-
-#ifdef PADDLE_WITH_GPU
-
-template <>
-Eigen::GpuDevice*
-DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+const cudaDeviceProp&
+CUDADeviceContext::EigenCudaStreamDevice::deviceProperties() const {
+  return *device_prop_;
 }
 
-class EigenCudaStreamDevice : public Eigen::StreamInterface {
- public:
-  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
-    Eigen::initializeDeviceProp();
-  }
-  ~EigenCudaStreamDevice() override {}
-
-  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
-    stream_ = cuda_stream;
-    place_ = place;
-    device_prop_ = &Eigen::m_deviceProperties[place.device];
-  }
-
-  const cudaStream_t& stream() const override { return *stream_; }
-
-  const cudaDeviceProp& deviceProperties() const override {
-    return *device_prop_;
-  }
-
-  void* allocate(size_t num_bytes) const override {
-    return paddle::memory::Alloc(place_, num_bytes);
-  }
+void* CUDADeviceContext::EigenCudaStreamDevice::allocate(
+    size_t num_bytes) const {
+  return paddle::memory::Alloc(place_, num_bytes);
+}
 
-  void deallocate(void* buffer) const override {
-    paddle::memory::Free(place_, buffer);
-  }
+void CUDADeviceContext::EigenCudaStreamDevice::deallocate(void* buffer) const {
+  paddle::memory::Free(place_, buffer);
+}
 
-  void* scratchpad() const override {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
+void* CUDADeviceContext::EigenCudaStreamDevice::scratchpad() const {
+  if (scratch_ == NULL) {
+    scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+  }
+  return scratch_;
+}
 
-  unsigned int* semaphore() const override {
-    if (semaphore_ == NULL) {
-      char* scratch =
-          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      PADDLE_ENFORCE(
-          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
-    }
-    return semaphore_;
-  }
-
- private:
-  GPUPlace place_;
-  const cudaStream_t* stream_;         // not owned;
-  const cudaDeviceProp* device_prop_;  // not owned;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-};
+unsigned int* CUDADeviceContext::EigenCudaStreamDevice::semaphore() const {
+  if (semaphore_ == NULL) {
+    char* scratch = static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+    semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+    PADDLE_ENFORCE(
+        cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+  }
+  return semaphore_;
+}
 
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  // Create the CUDA stream on the given device.
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+
+  // Set the CUDA stream into the EigenCudaStreamDevice instance.
   eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream_, place);
+  eigen_stream_->SetValues(&stream_, place);
+
+  // Initialize Eigen::GpuDevice using EigenCudaStreamDevice.
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+
+  // Create other handles in addition to the CUDA stream.
   PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
   PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
@@ -109,8 +89,12 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
+  // Wait for the completion of all operations before destructing.
   SetDeviceId(place_.device);
   Wait();
+
+  // Note: the handles must be destroyed in the reverse of the
+  // construction order, before the stream they are bound to.
   PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
   PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
   eigen_stream_.reset();
@@ -118,25 +102,7 @@ CUDADeviceContext::~CUDADeviceContext() {
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }
 
-Place CUDADeviceContext::GetPlace() const { return place_; }
-
-void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-}
-
-Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
-  return eigen_device_.get();
-}
-
-cublasHandle_t CUDADeviceContext::cublas_handle() const {
-  return cublas_handle_;
-}
-
-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
-
-cudaStream_t CUDADeviceContext::stream() const { return stream_; }
-
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index c1c4c7f7600b7e..3bf0482e092d09 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -1,13 +1,13 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
@@ -27,74 +27,82 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
 
-template <typename PlaceType>
-struct EigenDeviceConverter;
-
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-
-class DeviceContext {
- public:
-  virtual ~DeviceContext() {}
-  virtual Place GetPlace() const = 0;
-
-  template <typename PlaceType,
-            typename DeviceType = typename EigenDeviceConverter<
-                PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
-
-  virtual void Wait() const {}
-};
-
-class CPUDeviceContext : public DeviceContext {
+class CPUDeviceContext {
  public:
-  CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace place);
-
-  Eigen::DefaultDevice* eigen_device() const;
+  CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); }
+  explicit CPUDeviceContext(CPUPlace place) {
+    eigen_device_.reset(new Eigen::DefaultDevice());
+  }
 
-  Place GetPlace() const override;
+  Eigen::DefaultDevice* GetEigenDevice() const { return eigen_device_.get(); }
+  Place GetPlace() const { return CPUPlace(); }
 
  private:
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifdef PADDLE_WITH_GPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-
-class EigenCudaStreamDevice;
-
-class CUDADeviceContext : public DeviceContext {
+#ifdef PADDLE_WITH_CUDA
+
+// The CUDADeviceContext is a parameter to framework::OperatorBase::Run:
+/*
+  virtual void Run(const Scope& scope,
+                   const platform::DeviceContext& dev_ctx) const = 0;
+*/
+// To call Eigen functions in Run, we'd need to provide a parameter of
+// type Eigen::GpuDevice, from CUDADeviceContext::GetEigenDevice().
+//
+//   SomeEigenFunction(dev_ctx.GetEigenDevice(), ...);
+//
+// If we are going to call CUDA, cuDNN, or cuBLAS functions, we need to
+// pass them the handles returned by stream(), cudnn_handle(), and
+// cublas_handle().  For example:
+//
+//   SomeCUDNNFunction(dev_ctx.cudnn_handle(), ...);
+//
+class CUDADeviceContext {
  public:
   explicit CUDADeviceContext(GPUPlace place);
   virtual ~CUDADeviceContext();
 
-  /*! \brief Wait for all operations completion in the stream. */
-  void Wait() const override;
-
-  /*! \brief Return place in the device context. */
-  Place GetPlace() const override;
-
-  /*! \brief Return eigen device in the device context. */
-  Eigen::GpuDevice* eigen_device() const;
-
-  /*! \brief Return cublas handle in the device context. */
-  cublasHandle_t cublas_handle() const;
+  Eigen::GpuDevice* GetEigenDevice() const { return eigen_device_.get(); }
+  Place GetPlace() const { return place_; }
 
-  /*! \brief Return cudnn handle in the device context. */
-  cudnnHandle_t cudnn_handle() const;
+  /*! \brief Wait for all operations completion in the stream. */
+  void Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); }
 
-  /*! \brief Return cuda stream in the device context. */
-  cudaStream_t stream() const;
+  cublasHandle_t cublas_handle() const { return cublas_handle_; }
+  cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
+  cudaStream_t stream() const { return stream_; }
 
  private:
-  GPUPlace place_;
+  // Eigen requires that an Eigen::GpuDevice instance be initialized
+  // with a class derived from Eigen::StreamInterface.
+  class EigenCudaStreamDevice : public Eigen::StreamInterface {
+   public:
+    EigenCudaStreamDevice();
+    ~EigenCudaStreamDevice() override;
+
+    // https://github.com/PaddlePaddle/Paddle/pull/3497#issue-250238535
+    // explained that initializing the CUDA stream in the constructor
+    // would cause a SEGFAULT, so we add this method.
+    void SetValues(const cudaStream_t* cuda_stream, GPUPlace place);
+
+    const cudaStream_t& stream() const override;
+    const cudaDeviceProp& deviceProperties() const override;
+    void* allocate(size_t num_bytes) const override;
+    void deallocate(void* buffer) const override;
+    void* scratchpad() const override;
+    unsigned int* semaphore() const override;
+
+   private:
+    GPUPlace place_;
+    const cudaStream_t* stream_;         // not owned;
+    const cudaDeviceProp* device_prop_;  // not owned;
+    mutable void* scratch_;
+    mutable unsigned int* semaphore_;
+  };
+
+  GPUPlace place_;
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
@@ -103,7 +111,13 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
-#endif
+#endif  // PADDLE_WITH_CUDA
+
+#ifdef PADDLE_WITH_CUDA
+typedef boost::variant<CPUDeviceContext, CUDADeviceContext> DeviceContext;
+#else
+typedef boost::variant<CPUDeviceContext> DeviceContext;
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index f4b00c57dee519..e751ede3f0b7e8 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -15,35 +15,26 @@ limitations under the License. */
 
 #include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
 
-TEST(Device, Init) {
+#ifdef PADDLE_WITH_CUDA
+TEST(DeviceContext, CUDA) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
-  for (int i = 0; i < count; i++) {
-    DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
-    Eigen::GpuDevice* gpu_device =
-        device_context->template GetEigenDevice<GPUPlace>();
-    ASSERT_NE(nullptr, gpu_device);
-    delete device_context;
+  for (int i = 0; i < paddle::platform::GetDeviceCount(); i++) {
+    DeviceContext dev_ctx(GPUPlace(i));
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).GetEigenDevice());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).cudnn_handle());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).cublas_handle());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).stream());
   }
 }
+#endif  // PADDLE_WITH_CUDA
 
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+TEST(DeviceContext, CPU) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CPUDeviceContext;
 
-  int count = paddle::platform::GetDeviceCount();
-  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
-    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
-    ASSERT_NE(nullptr, gpu_device);
-    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
-    ASSERT_NE(nullptr, cudnn_handle);
-    cublasHandle_t cublas_handle = device_context->cublas_handle();
-    ASSERT_NE(nullptr, cublas_handle);
-    ASSERT_NE(nullptr, device_context->stream());
-    delete device_context;
-  }
+  DeviceContext dev_ctx;  // the default alternative is CPUDeviceContext
+  ASSERT_NE(nullptr, boost::get<CPUDeviceContext>(dev_ctx).GetEigenDevice());
 }
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index ac884386dde1f9..19c896d31e7b9a 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -28,6 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse =
     "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 
 //! Get the total number of GPU devices in system.
+// TODO(yi): Rename into GetCUDADeviceCount().
 int GetDeviceCount();
 
 //! Get the current GPU device id in system.
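Reviewer note (not part of the patch): the sketch below shows one way operator code could consume the variant-based platform::DeviceContext introduced in this diff. The names ScaleVisitor and RunScaleKernel are hypothetical, and the use of boost::apply_visitor is only an assumption; the diff itself demonstrates boost::get<...> in device_context_test.cc.

```cpp
// Illustrative sketch only -- not part of this patch.  It assumes the
// boost::variant typedef of platform::DeviceContext introduced above;
// ScaleVisitor and RunScaleKernel are hypothetical names.
#include <boost/variant.hpp>

#include "paddle/platform/device_context.h"

namespace paddle {

struct ScaleVisitor : public boost::static_visitor<void> {
  // CPU alternative: use the Eigen::DefaultDevice held by the context.
  void operator()(const platform::CPUDeviceContext& ctx) const {
    Eigen::DefaultDevice* dev = ctx.GetEigenDevice();
    (void)dev;  // ... evaluate an Eigen expression on *dev ...
  }

#ifdef PADDLE_WITH_CUDA
  // GPU alternative: the same visitor receives the CUDA handles.
  void operator()(const platform::CUDADeviceContext& ctx) const {
    Eigen::GpuDevice* dev = ctx.GetEigenDevice();
    cudaStream_t stream = ctx.stream();
    (void)dev;
    (void)stream;  // ... launch work on the context's stream ...
  }
#endif
};

// One entry point serves both back ends: boost::apply_visitor dispatches
// to whichever alternative the variant currently holds.
void RunScaleKernel(const platform::DeviceContext& dev_ctx) {
  boost::apply_visitor(ScaleVisitor(), dev_ctx);
}

}  // namespace paddle
```

Compared with boost::get, a visitor keeps the CPU and CUDA branches next to each other and fails at compile time if an alternative is left unhandled.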