Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/framework/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
#ifdef PADDLE_WITH_CUDA
auto pos = string::RFind(p, ':', string::Piece::npos);
auto number = device.substr(pos + 1);
places.emplace_back(platform::GPUPlace(std::stoi(number)));
places.emplace_back(platform::CUDAPlace(std::stoi(number)));
#else
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";
Expand Down
2 changes: 1 addition & 1 deletion paddle/framework/lod_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
Expand Down
2 changes: 1 addition & 1 deletion paddle/framework/lod_tensor_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {

TEST(LoDTensor, LoDInGPU) {
paddle::framework::LoDTensor lod_tensor;
paddle::platform::GPUPlace place(0);
paddle::platform::CUDAPlace place(0);

paddle::framework::LoD src_lod;
src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
Expand Down
4 changes: 2 additions & 2 deletions paddle/framework/op_kernel_type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ TEST(OpKernelType, Hash) {
using OpKernelType = paddle::framework::OpKernelType;
using DataType = paddle::framework::proto::DataType;
using CPUPlace = paddle::platform::CPUPlace;
using GPUPlace = paddle::platform::GPUPlace;
using CUDAPlace = paddle::platform::CUDAPlace;
using DataLayout = paddle::framework::DataLayout;
using LibraryType = paddle::framework::LibraryType;

OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW,
OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
LibraryType::kCUDNN);

OpKernelType::Hash hasher;
Expand Down
2 changes: 1 addition & 1 deletion paddle/framework/op_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ class OpKernelRegistrar : public Registrar {
}

#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)

#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
Expand Down
2 changes: 1 addition & 1 deletion paddle/framework/tensor.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ private:
```

```c++
typedef boost::variant<GpuPlace, CpuPlace> Place;
typedef boost::variant<CUDAPlace, CpuPlace> Place;
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
typedef boost::variant<
Expand Down
6 changes: 3 additions & 3 deletions paddle/framework/tensor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size, type));
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
}
#endif
offset_ = 0;
Expand Down
16 changes: 8 additions & 8 deletions paddle/framework/tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
EXPECT_EQ(p1, p2);
}
#endif
Expand Down Expand Up @@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
{
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
Expand Down Expand Up @@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
#ifdef PADDLE_WITH_CUDA
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
Expand All @@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
Expand Down
18 changes: 9 additions & 9 deletions paddle/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,33 +47,33 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
Expand Down Expand Up @@ -108,7 +108,7 @@ inline void CopyFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
Expand Down Expand Up @@ -141,7 +141,7 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
Expand Down
6 changes: 3 additions & 3 deletions paddle/framework/tensor_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ TEST(CopyFrom, Tensor) {
memcpy(src_ptr, arr, 9 * sizeof(int));

// CPU Tensor to GPU Tensor
auto gpu_place = new platform::GPUPlace(0);
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);

Expand Down Expand Up @@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {

// Copy to GPUTensor
gpu_tensor.Resize(make_ddim({3, 3}));
auto gpu_place = new paddle::platform::GPUPlace();
auto gpu_place = new paddle::platform::CUDAPlace();
CUDADeviceContext gpu_ctx(*gpu_place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
Expand Down Expand Up @@ -210,7 +210,7 @@ TEST(CopyToVector, Tensor) {
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
Tensor gpu_tensor;
GPUPlace place;
CUDAPlace place;
CUDADeviceContext gpu_ctx(place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

Expand Down
10 changes: 5 additions & 5 deletions paddle/memory/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
To allocate 4KB memory on the 3rd GPU:

```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
```

To free memory and check the so-far used amount of memory on a place:

```cpp
auto pl = platform::GPUPlace(0);
auto pl = platform::CUDAPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
Expand All @@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
} // namespace memory
```

These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:

```cpp
template<>
Expand All @@ -49,7 +49,7 @@ and

```cpp
template<>
void Alloc<GPUPlace>(GPUPlace p, size_t size) {
void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
Expand Down Expand Up @@ -122,7 +122,7 @@ There are two implementations of `Context`:

1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.

1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.

### Majel

Expand Down
24 changes: 9 additions & 15 deletions paddle/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,

#ifdef PADDLE_WITH_CUDA
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}

template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}

template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
Expand Down
8 changes: 4 additions & 4 deletions paddle/memory/memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
}

template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}

template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
Expand All @@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
return ptr;
}

template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}

Expand Down
6 changes: 3 additions & 3 deletions paddle/memory/memory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {

#ifdef PADDLE_WITH_CUDA

size_t align(size_t size, paddle::platform::GPUPlace place) {
size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment;
Expand All @@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {

EXPECT_EQ(p, nullptr);

paddle::platform::GPUPlace gpu(0);
paddle::platform::CUDAPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096);

EXPECT_NE(p, nullptr);
Expand All @@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
}

TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::GPUPlace gpu;
paddle::platform::CUDAPlace gpu;

std::unordered_map<void *, size_t> ps;

Expand Down
2 changes: 1 addition & 1 deletion paddle/operators/accuracy_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto* inference = ctx.Input<Tensor>("Out");
auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label");
Expand Down
4 changes: 2 additions & 2 deletions paddle/operators/batch_norm_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
Expand Down Expand Up @@ -179,7 +179,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
Expand Down
Loading