Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions oneflow/core/functional/impl/array_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,27 @@ class ArgWhereFunctor {
const Symbol<DType>& dtype) const {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype");
attrs.SetAllAttrs(dtype->data_type());
#ifdef WITH_NPU
auto device_type = DeviceType::kCPU;
if (x->is_global()) {
device_type = JUST(x->parallel_desc())->device_type();
} else {
device_type = JUST(x->device())->enum_type();
}
if (device_type == DeviceType::kNPU) {
// NOTE: use cpu argwhere when device="npu"
auto cpu_tensor = JUST(one::functional::To(x, "cpu"));
auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {cpu_tensor}, attrs));
for (int i = 0; i < result->size(); ++i) {
(*result)[i] = JUST(one::functional::To((*result)[i], "npu"));
}
return result;
} else {
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
}
#else
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
#endif // WITH_NPU
}

private:
Expand Down Expand Up @@ -1494,6 +1514,50 @@ class TensorScatterNdUpdateFunctor {
CHECK_OR_RETURN(*tensor->dtype() == *updates->dtype())
<< Error::RuntimeError() << "The dtype of tensor and updates must be same.";
std::shared_ptr<Tensor> contiguous_index = JUST(functional::ToContiguous(indices));
#ifdef WITH_NPU
auto device_type = DeviceType::kCPU;
if (inplace) {
if (tensor->is_global()) {
device_type = JUST(tensor->parallel_desc())->device_type();
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
auto cpu_output = JUST(OpInterpUtil::Dispatch<Tensor>(
*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
auto output = JUST(one::functional::To(cpu_output, "npu"));
int64_t ndim = tensor->shape()->NumAxes();
// TODO: use inplace copy op to write back to origin tensor
std::vector<int64_t> start(ndim, 0);
std::vector<int64_t> stop(tensor->shape()->begin(), tensor->shape()->end());
std::vector<int64_t> step(ndim, 1);
return functional::SliceUpdate(tensor, output, start, stop, step, /*inplace=*/true);
}
} else {
device_type = JUST(tensor->device())->enum_type();
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
JUST(CheckInplaceValid(tensor));
auto cpu_output = JUST(OpInterpUtil::Dispatch<Tensor>(
*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
return one::functional::To(cpu_output, "npu");
}
}
} else {
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
auto cpu_output = JUST(
OpInterpUtil::Dispatch<Tensor>(*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
return one::functional::To(cpu_output, "npu");
}
}

#endif

if (inplace) {
if (tensor->is_global()) {
// NOTE: global tensor_scatter_nd_update inplace must calculate on another tensor and assign
Expand Down
45 changes: 38 additions & 7 deletions oneflow/core/functional/impl/nn_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,9 @@ class SkipLayerNormFunctor {
std::tuple<bool, bool, bool, bool>(has_skip, has_gamma, has_beta, has_bias),
op_expr));
} // has_bias
} // has_beta
} // has_gamma
} // has_skip
} // has_beta
} // has_gamma
} // has_skip
}

Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
Expand Down Expand Up @@ -1170,8 +1170,8 @@ class SkipRMSNormFunctor {
ops_.insert(std::pair<std::tuple<bool, bool, bool>, std::shared_ptr<OpExpr>>(
std::tuple<bool, bool, bool>(has_weight, has_skip, has_bias), op_expr));
} // has_bias
} // has_skip
} // has_weight
} // has_skip
} // has_weight
}

Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
Expand Down Expand Up @@ -1477,7 +1477,7 @@ class MaxUnpoolNDFunctor {
.Input("x")
.Input("indices")
.Output("y")
.Build())){};
.Build())) {};
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
const std::shared_ptr<one::Tensor>& indices,
const std::vector<int32_t>& kernel_size,
Expand Down Expand Up @@ -3456,6 +3456,37 @@ class L2NormalizeFunctor {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("epsilon", "axis");
attrs.SetAllAttrs(epsilon, final_dim);

auto device_type = DeviceType::kCPU;
#ifdef WITH_NPU
if (input->is_global()) {
device_type = JUST(input->parallel_desc())->device_type();
} else {
device_type = JUST(input->device())->enum_type();
}
if (device_type == DeviceType::kNPU) {
auto cpu_input = JUST(one::functional::To(input, "cpu"));
if (axis_ == final_dim) {
auto cpu_output = OpInterpUtil::Dispatch<Tensor>(*op_, {cpu_input}, attrs);
return one::functional::To(JUST(cpu_output), "npu");
// if (input->is_global()) {
// return one::functional::To(JUST(cpu_output), input->parallel_desc());
// } else {
// return one::functional::To(JUST(cpu_output), input->device());
// }
} else {
std::vector<int> input_perm(input->shape()->dim_vec().size(), 0);
for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast<int>(i); }
std::swap(input_perm[final_dim], input_perm[static_cast<size_t>(axis_)]);

const auto cpu_result = JUST(OpInterpUtil::Dispatch<TensorTuple>(
*op_, {JUST(functional::Transpose(cpu_input, input_perm))}, attrs));
auto cpu_output = functional::Transpose((*cpu_result)[0], input_perm);
return one::functional::To(JUST(cpu_output), "npu");
}
}

#endif

if (axis_ == final_dim) { return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs); }

std::vector<int> input_perm(input->shape()->dim_vec().size(), 0);
Expand All @@ -3476,7 +3507,7 @@ class NormalizeFunctor {
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const float& p,
const int32_t& dim, const float& eps,
const bool& use_l2_norm_kernel) const {
if (use_l2_norm_kernel && (std::fabs(p - 2.0f) < std::numeric_limits<float>::min())) {
if ((std::fabs(p - 2.0f) < std::numeric_limits<float>::min())) {
return functional::L2Normalize(input, dim, eps);
}
return SequenceFunction<Maybe<Tensor>(const std::shared_ptr<Tensor>&, const float&,
Expand Down
22 changes: 17 additions & 5 deletions python/oneflow/nn/modules/nms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,20 @@


def nms_op(boxes, scores, iou_threshold: float):
    """Perform non-maximum suppression, with a CPU fallback for NPU tensors.

    Args:
        boxes: Tensor[N, 4] of boxes in ``(x1, y1, x2, y2)`` format.
        scores: Tensor[N] of per-box scores.
        iou_threshold: boxes overlapping a kept box with IoU > this are discarded.

    Returns:
        int64 tensor of kept indices into ``boxes``, sorted by decreasing score.
    """
    # NOTE: the pasted diff contained both the old and new bodies back to back;
    # the old early `return` made the NPU branch unreachable. Only the intended
    # new implementation is kept here.
    device = boxes.device
    if device == flow.device("npu"):
        # The NPU backend has no nms kernel: run the whole pipeline on CPU and
        # move only the resulting indices back to the NPU.
        # detach() drops autograd history — presumably intentional since NMS
        # index selection is not differentiable; TODO confirm with callers.
        cpu_boxes = boxes.detach().to("cpu")
        cpu_scores = scores.detach().to("cpu")
        score_inds = flow.argsort(cpu_scores, dim=0, descending=True)
        cpu_boxes = flow._C.gather(cpu_boxes, score_inds, axis=0)
        keep = flow._C.nms(cpu_boxes, iou_threshold)
        index = flow.squeeze(flow.argwhere(keep), dim=[1])
        return flow._C.gather(score_inds, index, axis=0).to(device=device)
    else:
        score_inds = flow.argsort(scores, dim=0, descending=True)
        boxes = flow._C.gather(boxes, score_inds, axis=0)
        keep = flow._C.nms(boxes, iou_threshold)
        index = flow.squeeze(flow.argwhere(keep), dim=[1])
        return flow._C.gather(score_inds, index, axis=0)
32 changes: 23 additions & 9 deletions python/oneflow/nn/modules/unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,30 @@ def unique_op(
tensor([3, 1, 2, 0, 2], dtype=oneflow.int64)

"""
if not return_inverse and not return_counts:
return flow._C.unique(input, sorted, dtype=dtype)
device = input.device
if device == flow.device("npu"):
cpu_input = input.to("cpu")
if not return_inverse and not return_counts:
return flow._C.unique(cpu_input, sorted, dtype=dtype).to(device=device)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
).to(device=device)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
)
if not return_inverse and not return_counts:
return flow._C.unique(input, sorted, dtype=dtype)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
)


if __name__ == "__main__":
Expand Down
5 changes: 3 additions & 2 deletions python/oneflow/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

from .torchvision import _cuda_version,nms

def load_library(path):
    """No-op stub for loading a custom-op library.

    The pasted diff interleaved the old body (``raise ImportError``) with the
    new one, making the new ``print`` unreachable; only the intended new
    behavior is kept: log the request and do nothing.

    Args:
        path: filesystem path of the library a caller asked to load.

    Returns:
        None.
    """
    print("\nload_library: %s >>> do nothing. \n"%(path))
    # raise ImportError("load_library is not implemented")
40 changes: 40 additions & 0 deletions python/oneflow/ops/torchvision/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import oneflow as flow
from oneflow.framework.tensor import Tensor

def _cuda_version():
return -1
# int64_t cuda_version() {
# #ifdef WITH_CUDA
# return CUDA_VERSION;
# #else
# return -1;
# #endif
# }


def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """Run non-maximum suppression (NMS) over ``boxes`` using their IoU.

    Lower-scoring boxes whose IoU with an already-kept (higher-scoring)
    box exceeds ``iou_threshold`` are iteratively discarded.

    When several boxes share an identical score and all satisfy the IoU
    criterion against a reference box, which one is kept may differ
    between CPU and GPU — the same caveat as PyTorch's argsort on
    repeated values.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        scores (Tensor[N]): one score per box.
        iou_threshold (float): overlap threshold; boxes with IoU above it
            relative to a kept box are dropped.

    Returns:
        Tensor: int64 indices of the kept boxes, in decreasing score order.
    """
    kept = flow.nms(boxes, scores, iou_threshold)
    return kept