Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pass_library(anakin_fillconstant_elementwisemul_fuse inference)
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 3 6)
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/graph_viz_pass.h"
Expand Down Expand Up @@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
}

template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<2>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
Expand All @@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);

REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);

REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);

Expand Down
21 changes: 4 additions & 17 deletions paddle/fluid/inference/anakin/convert/op_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,27 +117,14 @@ class AnakinOpConverter {
}
temp_max_input_shape[input] = input_shape;
engine->SetInputShape(input, input_shape);
// engine->Graph()->RegistVar(input); // For share from data.
engine->Graph()->RegistVar(input); // For share from data.
}
engine->SetMaxInputShape(temp_max_input_shape);

engine->Optimize();

// For anakin share with fluid tensor.
engine->AllocTmpMem();
engine->InitGraph();
/*
for(auto& input : inputs) {
platform::CUDAPlace gpu_place(engine->GetDevice());
auto input_var = scope->Var();
auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
auto input_max_shape = temp_max_input_shape[input];
input_tensor->Resize(framework::make_ddim(input_max_shape));
auto input_data = input_tensor->mutable_data<float>(gpu_place);
auto* anakin_input = engine->Net()->get_in(input);

::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data,
::anakin::saber::NV(), 0, input_max_shape);
anakin_input->share_from(tmp_anakin_tensor);
}
*/
}

void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/inference/anakin/engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream) {
cudaDeviceSynchronize();
for (const auto &input : inputs) {
auto *tensor = input.second;
auto *data = tensor->data<float>();
Expand All @@ -97,15 +98,14 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
anakin_input = net_->get_in(input.first);
}
*/

anakin_input->reshape(fluid_input_shape);

::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
fluid_input_shape);
anakin_input->copy_from(tmp_anakin_tensor);
}
cudaDeviceSynchronize();
net_->prediction();
cudaDeviceSynchronize();
for (const auto &output : outputs) {
platform::CUDAPlace gpu_place(device_);
auto *tensor = output.second;
Expand Down
11 changes: 7 additions & 4 deletions paddle/fluid/inference/anakin/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,20 @@ class AnakinEngine {
int GetMaxBatchSize() { return max_batch_size_; }
void Freeze();
void Optimize();
// Pre-allocate Anakin's temporary device buffers for this graph before the
// first net_->prediction() call. Fails through PADDLE_ENFORCE (project error
// macro) with the given message if the allocation does not succeed.
// NOTE(review): presumably must run after Freeze()/Optimize() and before the
// first Execute() — confirm against the Anakin Net::alloc_memory_first docs.
void AllocTmpMem() {
PADDLE_ENFORCE(net_->alloc_memory_first(*graph_),
"anakin alloc temp memory first failed");
}
void Save(std::string path) { graph_->save(path); }

bool IsInit() { return initialized_; }
int GetDevice() { return device_; }
// void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
// }
// void LoadSerializedData(const std::string& data) {
// graph_->load_from_string(data); }
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream);

private:
bool initialized_{false};
int max_batch_size_;
std::map<std::string, std::vector<int>> max_input_shape_;
int device_;
Expand Down
14 changes: 14 additions & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,19 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res;
}

// Report the element type of the underlying tensor as a public PaddleDType.
// Only FP32 and INT64 are mapped explicitly; any other dtype logs an error
// and falls back to FLOAT32.
PaddleDType ZeroCopyTensor::type() {
EAGER_GET_TENSOR;  // macro defined elsewhere — presumably resolves/caches `tensor` from scope_; confirm
auto type = tensor->type();
if (type == framework::proto::VarType::FP32) {
return PaddleDType::FLOAT32;
} else if (type == framework::proto::VarType::INT64) {
return PaddleDType::INT64;
} else {
// NOTE(review): unknown dtypes fall through to the FLOAT32 return below,
// so callers cannot distinguish this fallback from a genuine float32
// tensor — consider surfacing a hard error instead.
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
return PaddleDType::FLOAT32;
}

template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
Expand Down Expand Up @@ -119,6 +132,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream());
cudaDeviceSynchronize();
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/inference/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ class ZeroCopyTensor {
device_ = device;
}

PaddleDType type();

protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; }
Expand All @@ -190,6 +192,7 @@ class ZeroCopyTensor {
// performance.
mutable void* tensor_{nullptr};
PaddlePlace place_;
PaddleDType dtype_;
int device_;
};

Expand Down
4 changes: 1 addition & 3 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
#endif
});

/*
for (int i = 6; i >= 3; i--) {
for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
*/
use_gpu_ = true;
}

Expand Down
53 changes: 36 additions & 17 deletions paddle/fluid/operators/anakin/anakin_engine_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
/*
auto t_shape = framework::vectorize(t.dims());
auto *anakin_input = engine->Net()->get_in(x);
auto net_shape = anakin_input->shape();
size_t anakin_net_input_size = net_shape.count() * sizeof(float);
size_t fluid_input_size = t.memory_size();

if (fluid_input_size < anakin_net_input_size) {
framework::LoDTensor temp_t;
auto t_dims = t.dims();
temp_t.Resize(t_dims);
TensorCopySync(t, dev_place, &temp_t);
t.Resize(framework::make_ddim(net_shape));
t.mutable_data<float>(dev_place);
TensorCopySync(temp_t, dev_place, &t);
}
*/

inputs.insert({x, &t});
}

Expand All @@ -136,6 +120,41 @@ class AnakinEngineOp : public framework::OperatorBase {
inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
.Get(engine_key_);
}
// BUG here, detect that the tensor data pointer here will change sometime.
// Will fix it later.
/*
// For share with the tensor from fluid, We do the net init in the first net
precit.
if (!anakin_engine_->IsInit()) {
auto temp_max_input_shape = anakin_engine_->GetMaxInputShape();
anakin_engine_->AllocTmpMem();
for(auto& input : Inputs("Xs")) {
if (param_names_.count(input)) continue;
platform::CUDAPlace
gpu_place(boost::get<platform::CUDAPlace>(dev_place).device);
auto *input_var = scope.FindVar(input);
auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
auto input_max_shape = temp_max_input_shape[input];

framework::LoDTensor temp_t;
auto t_dims = input_tensor->dims();
temp_t.Resize(t_dims);
TensorCopySync(*input_tensor, dev_place, &temp_t);
input_tensor->Resize(framework::make_ddim(input_max_shape));
input_tensor->mutable_data<float>(dev_place);
TensorCopySync(temp_t, dev_place, input_tensor);

auto* input_data = input_tensor->mutable_data<float>(gpu_place);
auto* anakin_input = anakin_engine_->Net()->get_in(input);

::anakin::saber::Tensor<::anakin::saber::NV>
tmp_anakin_tensor(input_data,
::anakin::saber::NV(), 0, input_max_shape);
anakin_input->share_from(tmp_anakin_tensor);
}
anakin_engine_->InitGraph();
}
*/
return anakin_engine_;
}

Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/operators/conv_fusion_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
max_user_size = 64;
workspace_size_limit = max_user_size * 1024 * 1024;
}

Expand Down