From 1fc444f3f44e223ae3cd244e0e701f3118548ed1 Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:06:54 +0800 Subject: [PATCH 1/6] cpu nms --- python/oneflow/nn/modules/nms.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/python/oneflow/nn/modules/nms.py b/python/oneflow/nn/modules/nms.py index 7fdb64f0087..b3b3c4517bc 100644 --- a/python/oneflow/nn/modules/nms.py +++ b/python/oneflow/nn/modules/nms.py @@ -19,8 +19,20 @@ def nms_op(boxes, scores, iou_threshold: float): - score_inds = flow.argsort(scores, dim=0, descending=True) - boxes = flow._C.gather(boxes, score_inds, axis=0) - keep = flow._C.nms(boxes, iou_threshold) - index = flow.squeeze(flow.argwhere(keep), dim=[1]) - return flow._C.gather(score_inds, index, axis=0) + device = boxes.device + if device == flow.device("npu"): + cpu_boxes = boxes.detach().to("cpu") + cpu_scores = scores.detach().to("cpu") + # cpu_boxes.requires_grad=False + # cpu_scores.requires_grad=False + score_inds = flow.argsort(cpu_scores, dim=0, descending=True) + cpu_boxes = flow._C.gather(cpu_boxes, score_inds, axis=0) + keep = flow._C.nms(cpu_boxes, iou_threshold) + index = flow.squeeze(flow.argwhere(keep), dim=[1]) + return flow._C.gather(score_inds, index, axis=0).to(device=device) + else: + score_inds = flow.argsort(scores, dim=0, descending=True) + boxes = flow._C.gather(boxes, score_inds, axis=0) + keep = flow._C.nms(boxes, iou_threshold) + index = flow.squeeze(flow.argwhere(keep), dim=[1]) + return flow._C.gather(score_inds, index, axis=0) From 276900314178959876dd47787351f53213638abb Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:07:19 +0800 Subject: [PATCH 2/6] cpu unique --- python/oneflow/nn/modules/unique.py | 32 +++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/python/oneflow/nn/modules/unique.py b/python/oneflow/nn/modules/unique.py index b4d0cefde6c..7eb84a92261 100755 --- a/python/oneflow/nn/modules/unique.py +++ b/python/oneflow/nn/modules/unique.py @@ -66,16 +66,30 @@ def unique_op( tensor([3, 1, 2, 0, 2], dtype=oneflow.int64) """ - if not return_inverse and not return_counts: - return flow._C.unique(input, sorted, dtype=dtype) + device = input.device + if device == flow.device("npu"): + cpu_input = input.to("cpu") + if not return_inverse and not return_counts: + return flow._C.unique(cpu_input, sorted, dtype=dtype).to(device=device) + else: + return flow._C.unique( + input, + sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dtype=dtype, + ).to(device=device) else: - return flow._C.unique( - input, - sorted, - return_inverse=return_inverse, - return_counts=return_counts, - dtype=dtype, - ) + if not return_inverse and not return_counts: + return flow._C.unique(input, sorted, dtype=dtype) + else: + return flow._C.unique( + input, + sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dtype=dtype, + ) if __name__ == "__main__": From 7eaa9c5f4074459df4178beef278a7d23aaa8066 Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:08:42 +0800 Subject: [PATCH 3/6] export torch.ops.torchvision empty apis --- python/oneflow/ops/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/oneflow/ops/__init__.py b/python/oneflow/ops/__init__.py index 1201baf23ee..53e0674a219 100644 --- a/python/oneflow/ops/__init__.py +++ b/python/oneflow/ops/__init__.py @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. """ - +from .torchvision import _cuda_version,nms def load_library(path): - raise ImportError("load_library is not implemented") + print("\nload_library: %s >>> do nothing. \n"%(path)) + # raise ImportError("load_library is not implemented") From ed561bfa1905fdc899bf26453ccbefd0f5ef34b9 Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:09:21 +0800 Subject: [PATCH 4/6] export torch.ops.torchvision empty apis --- python/oneflow/ops/torchvision/__init__.py | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 python/oneflow/ops/torchvision/__init__.py diff --git a/python/oneflow/ops/torchvision/__init__.py b/python/oneflow/ops/torchvision/__init__.py new file mode 100644 index 00000000000..1eff43e0cd2 --- /dev/null +++ b/python/oneflow/ops/torchvision/__init__.py @@ -0,0 +1,40 @@ +import oneflow as flow +from oneflow.framework.tensor import Tensor + +def _cuda_version(): + return -1 +# int64_t cuda_version() { +# #ifdef WITH_CUDA +# return CUDA_VERSION; +# #else +# return -1; +# #endif +# } + + +def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor: + """ + Performs non-maximum suppression (NMS) on the boxes according + to their intersection-over-union (IoU). + + NMS iteratively removes lower scoring boxes which have an + IoU greater than ``iou_threshold`` with another (higher scoring) + box. + + If multiple boxes have the exact same score and satisfy the IoU + criterion with respect to a reference box, the selected box is + not guaranteed to be the same between CPU and GPU. This is similar + to the behavior of argsort in PyTorch when repeated values are present. + + Args: + boxes (Tensor[N, 4])): boxes to perform NMS on. They + are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and + ``0 <= y1 < y2``. + scores (Tensor[N]): scores for each one of the boxes + iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold + + Returns: + Tensor: int64 tensor with the indices of the elements that have been kept + by NMS, sorted in decreasing order of scores + """ + return flow.nms(boxes, scores, iou_threshold) \ No newline at end of file From 6c26e9a17700d529cfa1cdeb20cd8244e43b5061 Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:10:45 +0800 Subject: [PATCH 5/6] cpu L2Normalize --- oneflow/core/functional/impl/nn_functor.cpp | 45 +++++++++++++++++---- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 51f42367c07..c35b6f36c26 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -930,9 +930,9 @@ class SkipLayerNormFunctor { std::tuple(has_skip, has_gamma, has_beta, has_bias), op_expr)); } // has_bias - } // has_beta - } // has_gamma - } // has_skip + } // has_beta + } // has_gamma + } // has_skip } Maybe operator()(const std::shared_ptr& x, @@ -1170,8 +1170,8 @@ class SkipRMSNormFunctor { ops_.insert(std::pair, std::shared_ptr>( std::tuple(has_weight, has_skip, has_bias), op_expr)); } // has_bias - } // has_skip - } // has_weight + } // has_skip + } // has_weight } Maybe operator()(const std::shared_ptr& x, @@ -1477,7 +1477,7 @@ class MaxUnpoolNDFunctor { .Input("x") .Input("indices") .Output("y") - .Build())){}; + .Build())) {}; Maybe operator()(const std::shared_ptr& x, const std::shared_ptr& indices, const std::vector& kernel_size, @@ -3456,6 +3456,37 @@ class L2NormalizeFunctor { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("epsilon", "axis"); attrs.SetAllAttrs(epsilon, final_dim); + auto device_type = DeviceType::kCPU; +#ifdef WITH_NPU + if (input->is_global()) { + device_type = JUST(input->parallel_desc())->device_type(); + } else { + device_type = JUST(input->device())->enum_type(); + } + if (device_type == DeviceType::kNPU) { + auto cpu_input = JUST(one::functional::To(input, "cpu")); + if (axis_ == final_dim) { + auto cpu_output = OpInterpUtil::Dispatch(*op_, {cpu_input}, attrs); + return one::functional::To(JUST(cpu_output), "npu"); + // if (input->is_global()) { + // return one::functional::To(JUST(cpu_output), input->parallel_desc()); + // } else { + // return one::functional::To(JUST(cpu_output), input->device()); + // } + } else { + std::vector input_perm(input->shape()->dim_vec().size(), 0); + for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast(i); } + std::swap(input_perm[final_dim], input_perm[static_cast(axis_)]); + + const auto cpu_result = JUST(OpInterpUtil::Dispatch( + *op_, {JUST(functional::Transpose(cpu_input, input_perm))}, attrs)); + auto cpu_output = functional::Transpose((*cpu_result)[0], input_perm); + return one::functional::To(JUST(cpu_output), "npu"); + } + } + +#endif + if (axis_ == final_dim) { return OpInterpUtil::Dispatch(*op_, {input}, attrs); } std::vector input_perm(input->shape()->dim_vec().size(), 0); @@ -3476,7 +3507,7 @@ class NormalizeFunctor { Maybe operator()(const std::shared_ptr& input, const float& p, const int32_t& dim, const float& eps, const bool& use_l2_norm_kernel) const { - if (use_l2_norm_kernel && (std::fabs(p - 2.0f) < std::numeric_limits::min())) { + if ((std::fabs(p - 2.0f) < std::numeric_limits::min())) { return functional::L2Normalize(input, dim, eps); } return SequenceFunction(const std::shared_ptr&, const float&, From c0d42468d8e94c265b80b15a594e964190c4c46c Mon Sep 17 00:00:00 2001 From: zly Date: Thu, 10 Jul 2025 10:11:46 +0800 Subject: [PATCH 6/6] cpu ArgWhere, TensorScatterNdUpdate --- .../core/functional/impl/array_functor.cpp | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index aef7ef62a3b..4fa112e242b 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -588,7 +588,27 @@ class ArgWhereFunctor { const Symbol& dtype) const { auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype"); attrs.SetAllAttrs(dtype->data_type()); +#ifdef WITH_NPU + auto device_type = DeviceType::kCPU; + if (x->is_global()) { + device_type = JUST(x->parallel_desc())->device_type(); + } else { + device_type = JUST(x->device())->enum_type(); + } + if (device_type == DeviceType::kNPU) { + // NOTE: use cpu argwhere when device="npu" + auto cpu_tensor = JUST(one::functional::To(x, "cpu")); + auto result = JUST(OpInterpUtil::Dispatch(*op_, {cpu_tensor}, attrs)); + for (int i = 0; i < result->size(); ++i) { + (*result)[i] = JUST(one::functional::To((*result)[i], "npu")); + } + return result; + } else { + return OpInterpUtil::Dispatch(*op_, {x}, attrs); + } +#else return OpInterpUtil::Dispatch(*op_, {x}, attrs); +#endif // WITH_NPU } private: @@ -1494,6 +1514,50 @@ class TensorScatterNdUpdateFunctor { CHECK_OR_RETURN(*tensor->dtype() == *updates->dtype()) << Error::RuntimeError() << "The dtype of tensor and updates must be same."; std::shared_ptr contiguous_index = JUST(functional::ToContiguous(indices)); +#ifdef WITH_NPU + auto device_type = DeviceType::kCPU; + if (inplace) { + if (tensor->is_global()) { + device_type = JUST(tensor->parallel_desc())->device_type(); + if (device_type == DeviceType::kNPU) { + auto cpu_tensor = JUST(one::functional::To(tensor, "cpu")); + auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu")); + auto cpu_updates = JUST(one::functional::To(updates, "cpu")); + auto cpu_output = JUST(OpInterpUtil::Dispatch( + *op_, {cpu_tensor, cpu_contiguous_index, cpu_updates})); + auto output = JUST(one::functional::To(cpu_output, "npu")); + int64_t ndim = tensor->shape()->NumAxes(); + // TODO: use inplace copy op to write back to origin tensor + std::vector start(ndim, 0); + std::vector stop(tensor->shape()->begin(), tensor->shape()->end()); + std::vector step(ndim, 1); + return functional::SliceUpdate(tensor, output, start, stop, step, /*inplace=*/true); + } + } else { + device_type = JUST(tensor->device())->enum_type(); + if (device_type == DeviceType::kNPU) { + auto cpu_tensor = JUST(one::functional::To(tensor, "cpu")); + auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu")); + auto cpu_updates = JUST(one::functional::To(updates, "cpu")); + JUST(CheckInplaceValid(tensor)); + auto cpu_output = JUST(OpInterpUtil::Dispatch( + *op_, {cpu_tensor, cpu_contiguous_index, cpu_updates})); + return one::functional::To(cpu_output, "npu"); + } + } + } else { + if (device_type == DeviceType::kNPU) { + auto cpu_tensor = JUST(one::functional::To(tensor, "cpu")); + auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu")); + auto cpu_updates = JUST(one::functional::To(updates, "cpu")); + auto cpu_output = JUST( + OpInterpUtil::Dispatch(*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates})); + return one::functional::To(cpu_output, "npu"); + } + } + +#endif + if (inplace) { if (tensor->is_global()) { // NOTE: global tensor_scatter_nd_update inplace must calculate on another tensor and assign