30 changes: 16 additions & 14 deletions paddle/phi/kernels/autotune/auto_tune_base.h
@@ -63,7 +63,7 @@ class AutoTuneBase {
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx,
+  void Run(const Context& dev_ctx,
            const AlgorithmType& algo,
            const size_t key,
            Args&&... args) {
@@ -78,7 +78,7 @@
     if (use_autotune) {
       // All available kernels have ran while picking the best kernel,
       // so there may be no need for another kernel run.
-      auto best_idx = PickBestKernel(ctx, args...);
+      auto best_idx = PickBestKernel(dev_ctx, args...);
       cache.Set(key, best_idx);
     } else {
       kernels_[0].Run(args...);
@@ -100,14 +100,14 @@
   }
 
   template <typename Context, typename... Args>
-  size_t PickBestKernel(const Context& ctx, Args&&... args) {
+  size_t PickBestKernel(const Context& dev_ctx, Args&&... args) {
     std::lock_guard<std::mutex> lock(mutex_);
     size_t best_idx = 0;
     float min_time = std::numeric_limits<float>::max();
 
     // Time cost test established in default stream.
     for (size_t i = 0; i < kernels_.size(); ++i) {
-      auto time = RunAndMeasureKernel<Context>(ctx, i, args...);
+      auto time = RunAndMeasureKernel<Context>(dev_ctx, i, args...);
       if (time < min_time) {
         min_time = time;
         best_idx = i;
@@ -118,15 +118,17 @@
   }
 
   template <typename Context, typename... Args>
-  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
+  float RunAndMeasureKernel(const Context& dev_ctx,
+                            const int idx,
+                            Args&&... args) {
     // Regard 1st run as warmup, judge the compare result by the time cost
     // of rest cycles.
     constexpr int repeats = 11;
     phi::GpuTimer timer;
     float time_cost = 0;
-    const auto& stream = ctx.stream();
+    const auto& stream = dev_ctx.stream();
 
-    ctx.Wait();
+    dev_ctx.Wait();
     for (int i = 0; i < repeats; ++i) {
       timer.Start(stream);
       kernels_[idx].Run(args...);
@@ -158,7 +160,7 @@ class MatmulAutoTuner
   }
 
   template <typename Context>
-  void Run(const Context& ctx, const size_t key, Args... args) {
+  void Run(const Context& dev_ctx, const size_t key, Args... args) {
     this->is_init_ = true;
     this->CheckKernelSize();
     auto& cache = AutoTuneCache::Instance().GetMatmul();
@@ -168,7 +170,7 @@
     } else {
       bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
       if (use_autotune) {
-        auto best_idx = this->PickBestKernel(ctx, args...);
+        auto best_idx = this->PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         this->kernels_[0].Run(args...);
@@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner
     return instance.get();
   }
 
-  void Run(const phi::GPUContext& ctx,
+  void Run(const phi::GPUContext& dev_ctx,
           const size_t key,
           T const alpha,
           T const beta,
@@ -227,15 +229,15 @@
     } else {
       // Set alpha to 0 and beta to 1 to avoid changing the value of d when
       // picking the best kernel
-      auto best_idx =
-          PickBestKernel(ctx, static_cast<T>(0), static_cast<T>(1), args...);
+      auto best_idx = PickBestKernel(
+          dev_ctx, static_cast<T>(0), static_cast<T>(1), args...);
       cache.Set(key, best_idx);
       this->kernels_[best_idx].Run(alpha, beta, args...);
     }
   }
 
 protected:
-  size_t PickBestKernel(const phi::GPUContext& ctx,
+  size_t PickBestKernel(const phi::GPUContext& dev_ctx,
                         const T& alpha,
                         const T& beta,
                         Args&... args) {
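For context on the "alpha to 0 and beta to 1" comment in the hunk above: assuming these gather-gemm-scatter kernels apply the usual GEMM-style epilogue (a sketch inferred from the comment, not taken from the kernel source),

    % Assumed epilogue form of the gather-gemm-scatter kernels:
    d \leftarrow \alpha \cdot \mathrm{op}(a)\,\mathrm{op}(b) + \beta \cdot d
    % With the benchmarking constants \alpha = 0, \beta = 1:
    d \leftarrow 0 \cdot \mathrm{op}(a)\,\mathrm{op}(b) + 1 \cdot d = d

so the output operand d is left unchanged while every candidate kernel is timed, and the real alpha/beta are only used for the final cached-kernel run.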
@@ -250,7 +252,7 @@
       // Some kernels may require more shared memory than available, skip these
       // kernels.
       try {
-        time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...);
+        time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...);
         if (time < min_time) {
           min_time = time;
           best_idx = i;
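Note on the selection loop in this file: each candidate kernel is launched repeatedly on the default stream, the first run is discarded as warmup, and the index with the lowest accumulated time is cached under the shape key. A minimal self-contained sketch of that logic, with illustrative stand-ins (PickBestBySampling and the std::function kernels are hypothetical names, and a wall clock replaces phi::GpuTimer so the snippet runs anywhere):

    #include <chrono>
    #include <cstddef>
    #include <functional>
    #include <limits>
    #include <vector>

    // Stand-in for the PickBestKernel + RunAndMeasureKernel pair: time every
    // candidate, treat the first run as warmup, keep the fastest index.
    std::size_t PickBestBySampling(
        const std::vector<std::function<void()>>& kernels) {
      constexpr int kRepeats = 11;  // 1 warmup run + 10 measured runs
      std::size_t best_idx = 0;
      float min_time = std::numeric_limits<float>::max();
      for (std::size_t i = 0; i < kernels.size(); ++i) {
        float time_cost = 0.f;
        for (int r = 0; r < kRepeats; ++r) {
          const auto start = std::chrono::steady_clock::now();
          kernels[i]();  // run candidate kernel i
          const auto stop = std::chrono::steady_clock::now();
          if (r > 0) {  // regard the 1st run as warmup, count only the rest
            time_cost +=
                std::chrono::duration<float, std::milli>(stop - start).count();
          }
        }
        if (time_cost < min_time) {
          min_time = time_cost;
          best_idx = i;
        }
      }
      return best_idx;  // the caller caches this index under the shape key
    }

The real tuner measures with GPU events on dev_ctx.stream() rather than a wall clock, which is why it calls dev_ctx.Wait() before starting the timing loop.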
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/cross_entropy.cc
@@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl {
 
 template <typename DeviceContext, typename T>
 void CrossEntropyFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& ctx,
+    const DeviceContext& dev_ctx,
     phi::DenseTensor* out,
     const phi::DenseTensor* prob,
     const phi::DenseTensor* labels,
@@ -110,7 +110,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
     auto lbl = EigenMatrix<T>::From(*labels);
     auto loss = EigenMatrix<T>::From(*out);
 
-    loss.device(*ctx.eigen_device()) =
+    loss.device(*dev_ctx.eigen_device()) =
         -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue<T>()))
              .reshape(batch_axis_remain)
              .sum(Eigen::DSizes<int, 1>(1)));
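For reference, the Eigen expression in the hunk above computes the standard soft-label cross entropy, with TolerableValue<T> clamping non-finite logarithms; per batch row i over classes j:

    % Soft-label cross entropy realized by the reshape/sum pair,
    % where p is prob and y is labels:
    \mathrm{loss}_i = -\sum_{j} y_{ij} \, \log p_{ij}

The sum over Eigen::DSizes<int, 1>(1) is the reduction over the class axis after reshaping to batch_axis_remain.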
121 changes: 77 additions & 44 deletions paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor {
                   const std::string& method_name,
                   const func_t& reduce_op,
                   bool include_self,
-                  const phi::DeviceContext& ctx UNUSED) {
+                  const phi::DeviceContext& dev_ctx UNUSED) {
     if (index.numel() == 0) {
       return;
     }
@@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
                        bool include_self,
-                       const phi::DeviceContext& ctx) {
+                       const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(result,
@@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                                                           "gather_out_cpu",
                                                           tensor_assign,
                                                           include_self,
-                                                          ctx);
+                                                          dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
                                bool include_self,
-                               const phi::DeviceContext& ctx) {
+                               const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(self,
@@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                                          "scatter_assign_cpu",
                                                          tensor_assign,
                                                          include_self,
-                                                         ctx);
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_add_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mul_cpu",
+                                                         reduce_mul,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mean_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_max_cpu",
+                                                         reduce_max,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_min_cpu",
+                                                         reduce_min,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -376,16 +406,17 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self UNUSED,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_input_grad_kernel(
+    phi::DenseTensor self UNUSED,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self UNUSED,
+    const phi::DeviceContext& dev_ctx) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
   auto* out_data = out.data<tensor_t>();
@@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED,
                                         const phi::DenseTensor& index,
                                         phi::DenseTensor grad,
                                         bool include_self UNUSED,
-                                        const phi::DeviceContext& ctx UNUSED) {
+                                        const phi::DeviceContext& dev_ctx
+                                            UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel(
     phi::DenseTensor grad,
     const std::string& reduce,
     bool include_self,
-    const phi::DeviceContext& ctx UNUSED) {
+    const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel(
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_value_grad_kernel(
+    phi::DenseTensor self,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self,
+    const phi::DeviceContext& dev_ctx) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
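As a reference for what the scatter kernels in this file compute, here is a minimal sketch of scatter-add along dim 0 for row-major 2-D data (a hypothetical free function over plain arrays, not the Paddle API; the real cpu_gather_scatter_functor additionally handles arbitrary dim, the assign/mul/mean/max/min reductions, and the include_self flag):

    #include <cstdint>
    #include <vector>

    // Scatter-add along dim 0: for every element of src,
    //   self[index[i][j]][j] += src[i][j]
    // self has shape [self_rows, cols]; index and src have shape [rows, cols].
    void ScatterAddDim0(std::vector<float>& self,
                        const std::vector<int64_t>& index,
                        const std::vector<float>& src,
                        int64_t rows,
                        int64_t cols) {
      for (int64_t i = 0; i < rows; ++i) {
        for (int64_t j = 0; j < cols; ++j) {
          const int64_t target_row = index[i * cols + j];  // destination row
          self[target_row * cols + j] += src[i * cols + j];
        }
      }
    }

Swapping += for =, *=, max, or min gives the sibling kernels above; the mean variant accumulates like add and divides by the per-slot hit count afterwards.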
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/math/cos_sim_functor.cc
@@ -18,7 +18,7 @@ namespace phi {
 namespace math {
 template <typename T>
 struct CosSimDyFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& ctx,
+  void operator()(const phi::CPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/math/cos_sim_functor.cu
@@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 
 template <typename T>
 struct CosSimDyFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& ctx,
+  void operator()(const phi::GPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
@@ -63,7 +63,7 @@ struct CosSimDyFunctor<phi::GPUContext, T> {
     const int block_size = 512;
     dim3 threads(block_size, 1);
     dim3 grid((rows + block_size - 1) / block_size, 1);
-    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+    CosSimDyKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
 };
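The grid dimension in the last hunk is the usual ceiling division: enough block_size-wide blocks to cover rows items, with the overshoot typically guarded by a bounds check inside the kernel. A sketch of the arithmetic (kBlockSize and NumBlocks are illustrative names, not from this file):

    // Ceil-division launch arithmetic: rows = 1000, kBlockSize = 512
    // -> NumBlocks = 2, i.e. 1024 threads; threads past `rows` must be
    // masked off by a `row < rows` check in the kernel body.
    constexpr int kBlockSize = 512;
    inline int NumBlocks(int rows) {
      return (rows + kBlockSize - 1) / kBlockSize;
    }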