diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 926a7b1d69c7f8..c1c93e17fd82ea 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -53,7 +53,8 @@ if(NOT WITH_GPU)
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
-    add_definitions(-DPADDLE_WITH_GPU)
+    add_definitions(-DPADDLE_WITH_CUDA)
+
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 8dcc357a16a057..ad71c6fa563616 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -1,13 +1,13 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include "paddle/platform/device_context.h"
 #include "paddle/memory/memory.h"
@@ -15,93 +15,73 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
 
-template <>
-Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
-    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
-}
+#ifdef PADDLE_WITH_CUDA
 
-CPUDeviceContext::CPUDeviceContext() {
-  eigen_device_.reset(new Eigen::DefaultDevice());
+CUDADeviceContext::EigenCudaStreamDevice::EigenCudaStreamDevice()
+    : scratch_(nullptr), semaphore_(nullptr) {
+  Eigen::initializeDeviceProp();
 }
+
+CUDADeviceContext::EigenCudaStreamDevice::~EigenCudaStreamDevice() {}
 
-CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
-  eigen_device_.reset(new Eigen::DefaultDevice());
+void CUDADeviceContext::EigenCudaStreamDevice::SetValues(
+    const cudaStream_t* cuda_stream, GPUPlace place) {
+  stream_ = cuda_stream;
+  place_ = place;
+  device_prop_ = &Eigen::m_deviceProperties[place.device];
 }
 
-Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
-  return eigen_device_.get();
+const cudaStream_t& CUDADeviceContext::EigenCudaStreamDevice::stream() const {
+  return *stream_;
 }
 
-Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
-
-#ifdef PADDLE_WITH_GPU
-
-template <>
-Eigen::GpuDevice*
-DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+const cudaDeviceProp&
+CUDADeviceContext::EigenCudaStreamDevice::deviceProperties() const {
+  return *device_prop_;
 }
 
-class EigenCudaStreamDevice : public Eigen::StreamInterface {
- public:
-  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
-    Eigen::initializeDeviceProp();
-  }
-  ~EigenCudaStreamDevice() override {}
-
-  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
-    stream_ = cuda_stream;
-    place_ = place;
-    device_prop_ = &Eigen::m_deviceProperties[place.device];
-  }
-
-  const cudaStream_t& stream() const override { return *stream_; }
-
-  const cudaDeviceProp& deviceProperties() const override {
-    return *device_prop_;
-  }
-
-  void* allocate(size_t num_bytes) const override {
-    return paddle::memory::Alloc(place_, num_bytes);
-  }
+void* CUDADeviceContext::EigenCudaStreamDevice::allocate(
+    size_t num_bytes) const {
+  return paddle::memory::Alloc(place_, num_bytes);
+}
 
-  void deallocate(void* buffer) const override {
-    paddle::memory::Free(place_, buffer);
-  }
+void CUDADeviceContext::EigenCudaStreamDevice::deallocate(void* buffer) const {
+  paddle::memory::Free(place_, buffer);
+}
 
-  void* scratchpad() const override {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
+void* CUDADeviceContext::EigenCudaStreamDevice::scratchpad() const {
+  if (scratch_ == NULL) {
+    scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+  }
+  return scratch_;
+}
 
-  unsigned int* semaphore() const override {
-    if (semaphore_ == NULL) {
-      char* scratch =
-          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      PADDLE_ENFORCE(
-          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
-    }
-    return semaphore_;
-  }
-
- private:
-  GPUPlace place_;
-  const cudaStream_t* stream_;         // not owned;
-  const cudaDeviceProp* device_prop_;  // not owned;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-};
+unsigned int* CUDADeviceContext::EigenCudaStreamDevice::semaphore() const {
+  if (semaphore_ == NULL) {
+    char* scratch = static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+    semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+    PADDLE_ENFORCE(
+        cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+  }
+  return semaphore_;
+}
 
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  // Create the CUDA stream on the given device.
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+
+  // Set the CUDA stream into the EigenCudaStreamDevice instance.
   eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream_, place);
+  eigen_stream_->SetValues(&stream_, place);
+
+  // Initialize Eigen::GpuDevice using EigenCudaStreamDevice.
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+
+  // Create other handles in addition to the CUDA stream.
   PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
   PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
@@ -109,8 +89,12 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
+  // Wait for the completion of all operations before destructing.
   SetDeviceId(place_.device);
   Wait();
+
+  // Note: the handles must be destroyed in the reverse of the
+  // construction order, before the stream they are bound to.
   PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
   PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
   eigen_stream_.reset();
@@ -118,25 +102,7 @@ CUDADeviceContext::~CUDADeviceContext() {
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }
 
-Place CUDADeviceContext::GetPlace() const { return place_; }
-
-void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-}
-
-Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
-  return eigen_device_.get();
-}
-
-cublasHandle_t CUDADeviceContext::cublas_handle() const {
-  return cublas_handle_;
-}
-
-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
-
-cudaStream_t CUDADeviceContext::stream() const { return stream_; }
-
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index c1c4c7f7600b7e..3bf0482e092d09 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -1,13 +1,13 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
@@ -27,74 +27,82 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
 
-template <typename PlaceType>
-struct EigenDeviceConverter;
-
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-
-class DeviceContext {
- public:
-  virtual ~DeviceContext() {}
-  virtual Place GetPlace() const = 0;
-
-  template <typename PlaceType,
-            typename DeviceType = typename EigenDeviceConverter<
-                PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
-
-  virtual void Wait() const {}
-};
-
-class CPUDeviceContext : public DeviceContext {
+class CPUDeviceContext {
  public:
-  CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace place);
-
-  Eigen::DefaultDevice* eigen_device() const;
+  CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); }
+  explicit CPUDeviceContext(CPUPlace place) {
+    eigen_device_.reset(new Eigen::DefaultDevice());
+  }
 
-  Place GetPlace() const override;
+  Eigen::DefaultDevice* GetEigenDevice() const { return eigen_device_.get(); }
+  Place GetPlace() const { return CPUPlace(); }
 
  private:
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifdef PADDLE_WITH_GPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-
-class EigenCudaStreamDevice;
-
-class CUDADeviceContext : public DeviceContext {
+#ifdef PADDLE_WITH_CUDA
+
+// The CUDADeviceContext is a parameter to framework::OperatorBase::Run:
+/*
+  virtual void Run(const Scope& scope,
+                   const platform::DeviceContext& dev_ctx) const = 0;
+*/
+// To call Eigen functions in Run, we'd need to provide a parameter of
+// type Eigen::GpuDevice, from CUDADeviceContext::GetEigenDevice().
+//
+//   SomeEigenFunction(dev_ctx.GetEigenDevice(), ...);
+//
+// If we are going to call CUDA, cuDNN, or cuBLAS functions, we need to
+// pass them the handles returned by stream(), cudnn_handle(), and
+// cublas_handle().  For example:
+//
+//   SomeCUDNNFunction(dev_ctx.cudnn_handle(), ...);
+//
+class CUDADeviceContext {
  public:
   explicit CUDADeviceContext(GPUPlace place);
   virtual ~CUDADeviceContext();
 
-  /*! \brief Wait for all operations completion in the stream. */
-  void Wait() const override;
-
-  /*! \brief Return place in the device context. */
-  Place GetPlace() const override;
-
-  /*! \brief Return eigen device in the device context. */
-  Eigen::GpuDevice* eigen_device() const;
-
-  /*! \brief Return cublas handle in the device context. */
-  cublasHandle_t cublas_handle() const;
+  Eigen::GpuDevice* GetEigenDevice() const { return eigen_device_.get(); }
+  Place GetPlace() const { return place_; }
 
-  /*! \brief Return cudnn handle in the device context. */
-  cudnnHandle_t cudnn_handle() const;
+  /*! \brief Wait for all operations completion in the stream. */
+  void Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); }
 
-  /*! \brief Return cuda stream in the device context. */
-  cudaStream_t stream() const;
+  cublasHandle_t cublas_handle() const { return cublas_handle_; }
+  cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
+  cudaStream_t stream() const { return stream_; }
 
  private:
-  GPUPlace place_;
+  // Eigen requires that an Eigen::GpuDevice instance be initialized
+  // with a class derived from Eigen::StreamInterface.
+  class EigenCudaStreamDevice : public Eigen::StreamInterface {
+   public:
+    EigenCudaStreamDevice();
+    ~EigenCudaStreamDevice() override;
+
+    // https://github.com/PaddlePaddle/Paddle/pull/3497#issue-250238535
+    // explained that initializing the CUDA stream in the constructor
+    // would cause a SEGFAULT, so we add this method.
+    void SetValues(const cudaStream_t* cuda_stream, GPUPlace place);
+
+    const cudaStream_t& stream() const override;
+    const cudaDeviceProp& deviceProperties() const override;
+    void* allocate(size_t num_bytes) const override;
+    void deallocate(void* buffer) const override;
+    void* scratchpad() const override;
+    unsigned int* semaphore() const override;
+
+   private:
+    GPUPlace place_;
+    const cudaStream_t* stream_;         // not owned;
+    const cudaDeviceProp* device_prop_;  // not owned;
+    mutable void* scratch_;
+    mutable unsigned int* semaphore_;
+  };
+
+  GPUPlace place_;
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
@@ -103,7 +111,13 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
-#endif
+#endif  // PADDLE_WITH_CUDA
+
+#ifdef PADDLE_WITH_CUDA
+typedef boost::variant<CPUDeviceContext, CUDADeviceContext> DeviceContext;
+#else
+typedef boost::variant<CPUDeviceContext> DeviceContext;
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index f4b00c57dee519..e751ede3f0b7e8 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -15,35 +15,26 @@ limitations under the License. */
 
 #include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
 
-TEST(Device, Init) {
+#ifdef PADDLE_WITH_CUDA
+TEST(DeviceContext, CUDA) {
   using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
-  for (int i = 0; i < count; i++) {
-    DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
-    Eigen::GpuDevice* gpu_device =
-        device_context->template GetEigenDevice<GPUPlace>();
-    ASSERT_NE(nullptr, gpu_device);
-    delete device_context;
+  for (int i = 0; i < paddle::platform::GetDeviceCount(); i++) {
+    DeviceContext dev_ctx(GPUPlace(i));
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).GetEigenDevice());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).cudnn_handle());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).cublas_handle());
+    ASSERT_NE(nullptr, boost::get<CUDADeviceContext>(dev_ctx).stream());
   }
 }
+#endif  // PADDLE_WITH_CUDA
 
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+TEST(DeviceContext, CPU) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CPUDeviceContext;
 
-  int count = paddle::platform::GetDeviceCount();
-  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
-    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
-    ASSERT_NE(nullptr, gpu_device);
-    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
-    ASSERT_NE(nullptr, cudnn_handle);
-    cublasHandle_t cublas_handle = device_context->cublas_handle();
-    ASSERT_NE(nullptr, cublas_handle);
-    ASSERT_NE(nullptr, device_context->stream());
-    delete device_context;
-  }
+  DeviceContext dev_ctx;  // the default alternative is CPUDeviceContext
+  ASSERT_NE(nullptr, boost::get<CPUDeviceContext>(dev_ctx).GetEigenDevice());
 }
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index ac884386dde1f9..19c896d31e7b9a 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -28,6 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse =
     "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 
 //! Get the total number of GPU devices in system.
+// TODO(yi): Rename into GetCUDADeviceCount().
 int GetDeviceCount();
 
 //! Get the current GPU device id in system.
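Reviewer note (not part of the patch): the sketch below shows one way operator code could consume the variant-based platform::DeviceContext introduced in this diff. The names ScaleVisitor and RunScaleKernel are hypothetical, and the use of boost::apply_visitor is only an assumption; the diff itself demonstrates boost::get<...> in device_context_test.cc.

```cpp
// Illustrative sketch only -- not part of this patch.  It assumes the
// boost::variant typedef of platform::DeviceContext introduced above;
// ScaleVisitor and RunScaleKernel are hypothetical names.
#include <boost/variant.hpp>

#include "paddle/platform/device_context.h"

namespace paddle {

struct ScaleVisitor : public boost::static_visitor<void> {
  // CPU alternative: use the Eigen::DefaultDevice held by the context.
  void operator()(const platform::CPUDeviceContext& ctx) const {
    Eigen::DefaultDevice* dev = ctx.GetEigenDevice();
    (void)dev;  // ... evaluate an Eigen expression on *dev ...
  }

#ifdef PADDLE_WITH_CUDA
  // GPU alternative: the same visitor receives the CUDA handles.
  void operator()(const platform::CUDADeviceContext& ctx) const {
    Eigen::GpuDevice* dev = ctx.GetEigenDevice();
    cudaStream_t stream = ctx.stream();
    (void)dev;
    (void)stream;  // ... launch work on the context's stream ...
  }
#endif
};

// One entry point serves both back ends: boost::apply_visitor dispatches
// to whichever alternative the variant currently holds.
void RunScaleKernel(const platform::DeviceContext& dev_ctx) {
  boost::apply_visitor(ScaleVisitor(), dev_ctx);
}

}  // namespace paddle
```

Compared with boost::get, a visitor keeps the CPU and CUDA branches next to each other and fails at compile time if an alternative is left unhandled.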