Merged
73 changes: 73 additions & 0 deletions paddle/fluid/operators/math/cpu_vec.h
@@ -176,6 +176,79 @@ inline void vec_sum<float, platform::avx>(const size_t n, const float* x,
#endif
}

// Element-wise multiply: z[i] = x[i] * y[i].
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
for (size_t i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}

template <>
inline void vec_mul<float, platform::avx>(const size_t n, const float* x,
const float* y, float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul<float, platform::isa_any>(n, x, y, z);
return;
}

unsigned int i = 0, end = 0;
end = n & ~(block - 1);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(
z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}

for (; i < n; i++) {
z[i] = x[i] * y[i];
}
#else
vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}
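
The AVX path handles eight floats per iteration and falls back to the generic loop for short inputs and for the tail. A usage sketch (ours, not part of the patch; it assumes it sits alongside vec_mul inside paddle::operators::math, and the helper name mul_dispatch is hypothetical) showing the runtime guard that the softmax gradient below also relies on:

inline void mul_dispatch(const size_t n, const float* x, const float* y,
                         float* z) {
  // Use the AVX kernel only when the host CPU actually supports AVX.
  if (platform::MayIUse(platform::avx)) {
    vec_mul<float, platform::avx>(n, x, y, z);
  } else {
    vec_mul<float, platform::isa_any>(n, x, y, z);
  }
}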

// Dot product, reduced into z[0]: z[0] = sum over i of x[i] * y[i].
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
z[0] = x[0] * y[0];
for (size_t i = 1; i < n; ++i) {
z[0] += x[i] * y[i];
}
}

template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n, const float* x,
const float* y, float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
return;
}

unsigned int i = 0, end = 0;
z[0] = 0.f;

end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(
tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}

  // Horizontal sum of the eight partial sums in tmp: pairwise-add within
  // each 128-bit lane, fold the high lane onto the low one, then one final
  // pairwise add leaves the total in element 0, which is stored to z[0].
  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
  _mm_store_ss(z, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
                              _mm256_castps256_ps128(hsum)));

for (; i < n; i++) {
z[0] += x[i] * y[i];
}
#else
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}
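
A standalone sketch (ours, not part of the patch; assumes an AVX-capable toolchain, e.g. g++ -mavx) that verifies the hadd/permute horizontal-sum pattern above on a known input:

#include <immintrin.h>

#include <cstdio>

int main() {
  const float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  __m256 tmp = _mm256_loadu_ps(v);
  // Pairwise add within each 128-bit lane: {3, 7, 3, 7 | 11, 15, 11, 15}.
  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
  // Swap the 128-bit lanes and add: the low lane becomes {14, 22, 14, 22}.
  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
  float out;
  // Final pairwise add collapses the low lane: 14 + 22 = 36.
  _mm_store_ss(&out, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
                                 _mm256_castps256_ps128(hsum)));
  std::printf("%.1f\n", out);  // Prints 36.0 == 1 + 2 + ... + 8.
  return 0;
}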

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
64 changes: 64 additions & 0 deletions paddle/fluid/operators/math/cpu_vec_test.cc
@@ -199,6 +199,70 @@ TEST(CpuVecTest, vec_clip) {
vec_clip<double, platform::isa_any>);
}

// Runs target and reference kernels on the same random inputs and checks
// that the element-wise products agree.
template <typename T>
void compare_mul(
size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
std::function<void(const size_t, const T*, const T*, T*)> ref) {
std::vector<T> x(n), y(n);
std::vector<T> ztgt(n), zref(n);

RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));

const T* x_data = x.data();
const T* y_data = y.data();
T* ztgt_data = ztgt.data();
T* zref_data = zref.data();

tgt(n, x_data, y_data, ztgt_data);
ref(n, x_data, y_data, zref_data);
for (size_t i = 0; i < n; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}

TEST(CpuVecTest, vec_mul) {
namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT
for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
compare_mul<float>(sz, vec_mul<float>, vec_mul<float, platform::isa_any>);
compare_mul<float>(sz, vec_mul<float, platform::avx>,
vec_mul<float, platform::isa_any>);
}
compare_mul<double>(30U, vec_mul<double>, vec_mul<double, platform::isa_any>);
}

// Same as compare_mul, but the result is a single reduced scalar.
template <typename T>
void compare_mul_reduce(
size_t n, std::function<void(const size_t, const T*, const T*, T*)> tgt,
std::function<void(const size_t, const T*, const T*, T*)> ref) {
std::vector<T> x(n), y(n);
T ztgt_data, zref_data;

RandomVec<T>(n, x.data(), static_cast<T>(-2), static_cast<T>(2));
RandomVec<T>(n, y.data(), static_cast<T>(-2), static_cast<T>(2));

const T* x_data = x.data();
const T* y_data = y.data();

tgt(n, x_data, y_data, &ztgt_data);
ref(n, x_data, y_data, &zref_data);
EXPECT_NEAR(ztgt_data, zref_data, 1e-3);
}

TEST(CpuVecTest, vec_mul_reduce) {
namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT
for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
compare_mul_reduce<float>(sz, vec_mul_reduce<float>,
vec_mul_reduce<float, platform::isa_any>);
compare_mul_reduce<float>(sz, vec_mul_reduce<float, platform::avx>,
vec_mul_reduce<float, platform::isa_any>);
}
compare_mul_reduce<double>(30U, vec_mul_reduce<double>,
vec_mul_reduce<double, platform::isa_any>);
}

template <typename T>
void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
std::function<void(const int, const T*, T*)> ref) {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/math/softmax.h
@@ -27,7 +27,7 @@ class SoftmaxFunctor {
const framework::Tensor* X, framework::Tensor* Y);
};

-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxGradFunctor {
public:
void operator()(const DeviceContext& context, const int axis_dim,
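
The extra Enable parameter lets softmax_impl.h (below) partially specialize this functor for CPU device contexts via SFINAE. A minimal sketch of the pattern (ours; Functor, CPUDeviceContext, and the local enable_if_CPU are simplified stand-ins for paddle's real definitions):

#include <type_traits>

template <typename Ctx, typename T, typename Enable = void>
struct Functor {  // generic version, e.g. the Eigen/GPU path
  void operator()() {}
};

struct CPUDeviceContext {};  // stand-in for paddle's CPU context type
template <typename Ctx>
using enable_if_CPU = typename std::enable_if<
    std::is_same<Ctx, CPUDeviceContext>::value>::type;

// Selected whenever Ctx is the CPU context: enable_if_CPU<Ctx> is then
// `void`, which matches the `Enable = void` default above.
template <typename Ctx, typename T>
struct Functor<Ctx, T, enable_if_CPU<Ctx>> {
  void operator()() {}  // vectorized CPU path
};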
54 changes: 48 additions & 6 deletions paddle/fluid/operators/math/softmax_impl.h
@@ -140,16 +140,16 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
};

template <typename DeviceContext, typename T>
-void SoftmaxGradFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& context, const int axis_dim,
-    const framework::Tensor* y, const framework::Tensor* y_grad,
-    framework::Tensor* x_grad) {
+void SoftmaxGradEigen(const DeviceContext& context, const int axis_dim,
+                      const framework::Tensor* y,
+                      const framework::Tensor* y_grad,
+                      framework::Tensor* x_grad) {
auto softmax = EigenMatrix<T>::From(*y);
auto softmax_grad = EigenMatrix<T>::From(*y_grad);
auto logits_grad = EigenMatrix<T>::From(*x_grad);

-  const int kBatchDim = 0;
-  const int kClassDim = 1;
+  constexpr int kBatchDim = 0;
+  constexpr int kClassDim = 1;

const int batch_size = softmax.dimension(kBatchDim);
const int num_classes = softmax.dimension(kClassDim);
@@ -169,6 +169,48 @@ void SoftmaxGradFunctor<DeviceContext, T>::operator()(
logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
}

template <typename DeviceContext, typename T, typename Enable>
void SoftmaxGradFunctor<DeviceContext, T, Enable>::operator()(
const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
}

template <typename DeviceContext, typename T>
class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
public:
void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
auto out_dims = y->dims();
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
const int num_classes = out_dims[kClassDim];
const int batch_size = out_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;

if (num_remain == 1 && platform::MayIUse(platform::avx)) {
const T* out_data = y->data<T>();
const T* out_grad = y_grad->data<T>();
T* in_grad = x_grad->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T scalar;
vec_mul_reduce<T, platform::avx>(num_classes, out_grad, out_data,
&scalar);
scalar *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
out_data += num_classes;
out_grad += num_classes;
in_grad += num_classes;
}
} else {
SoftmaxGradEigen<DeviceContext, T>(context, axis_dim, y, y_grad, x_grad);
}
}
};
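
Per row, this fast path computes the usual softmax gradient dx = (dy - dot(dy, y)) * y, the same quantity the Eigen fallback produces. A scalar reference sketch (ours, not part of the patch):

#include <cstddef>

void softmax_grad_row_ref(size_t n, const float* y, const float* dy,
                          float* dx) {
  float dot = 0.f;  // what vec_mul_reduce computes
  for (size_t j = 0; j < n; ++j) dot += dy[j] * y[j];
  // The AVX path fuses this loop as vec_add_bias(n, -dot, dy, dx)
  // followed by vec_mul(n, y, dx, dx).
  for (size_t j = 0; j < n; ++j) dx[j] = (dy[j] - dot) * y[j];
}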

} // namespace math
} // namespace operators
} // namespace paddle