Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions oneflow/core/functional/impl/array_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,27 @@ class ArgWhereFunctor {
const Symbol<DType>& dtype) const {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype");
attrs.SetAllAttrs(dtype->data_type());
#ifdef WITH_NPU
auto device_type = DeviceType::kCPU;
if (x->is_global()) {
device_type = JUST(x->parallel_desc())->device_type();
} else {
device_type = JUST(x->device())->enum_type();
}
if (device_type == DeviceType::kNPU) {
// NOTE: use cpu argwhere when device="npu"
auto cpu_tensor = JUST(one::functional::To(x, "cpu"));
auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {cpu_tensor}, attrs));
for (int i = 0; i < result->size(); ++i) {
(*result)[i] = JUST(one::functional::To((*result)[i], "npu"));
}
return result;
} else {
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
}
#else
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
#endif // WITH_NPU
}

private:
Expand Down Expand Up @@ -1494,6 +1514,50 @@ class TensorScatterNdUpdateFunctor {
CHECK_OR_RETURN(*tensor->dtype() == *updates->dtype())
<< Error::RuntimeError() << "The dtype of tensor and updates must be same.";
std::shared_ptr<Tensor> contiguous_index = JUST(functional::ToContiguous(indices));
#ifdef WITH_NPU
auto device_type = DeviceType::kCPU;
if (inplace) {
if (tensor->is_global()) {
device_type = JUST(tensor->parallel_desc())->device_type();
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
auto cpu_output = JUST(OpInterpUtil::Dispatch<Tensor>(
*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
auto output = JUST(one::functional::To(cpu_output, "npu"));
int64_t ndim = tensor->shape()->NumAxes();
// TODO: use inplace copy op to write back to origin tensor
std::vector<int64_t> start(ndim, 0);
std::vector<int64_t> stop(tensor->shape()->begin(), tensor->shape()->end());
std::vector<int64_t> step(ndim, 1);
return functional::SliceUpdate(tensor, output, start, stop, step, /*inplace=*/true);
}
} else {
device_type = JUST(tensor->device())->enum_type();
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
JUST(CheckInplaceValid(tensor));
auto cpu_output = JUST(OpInterpUtil::Dispatch<Tensor>(
*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
return one::functional::To(cpu_output, "npu");
}
}
} else {
if (device_type == DeviceType::kNPU) {
auto cpu_tensor = JUST(one::functional::To(tensor, "cpu"));
auto cpu_contiguous_index = JUST(one::functional::To(contiguous_index, "cpu"));
auto cpu_updates = JUST(one::functional::To(updates, "cpu"));
auto cpu_output = JUST(
OpInterpUtil::Dispatch<Tensor>(*op_, {cpu_tensor, cpu_contiguous_index, cpu_updates}));
return one::functional::To(cpu_output, "npu");
}
}

#endif

if (inplace) {
if (tensor->is_global()) {
// NOTE: global tensor_scatter_nd_update inplace must calculate on another tensor and assign
Expand Down
45 changes: 38 additions & 7 deletions oneflow/core/functional/impl/nn_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,9 @@ class SkipLayerNormFunctor {
std::tuple<bool, bool, bool, bool>(has_skip, has_gamma, has_beta, has_bias),
op_expr));
} // has_bias
} // has_beta
} // has_gamma
} // has_skip
} // has_beta
} // has_gamma
} // has_skip
}

Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
Expand Down Expand Up @@ -1170,8 +1170,8 @@ class SkipRMSNormFunctor {
ops_.insert(std::pair<std::tuple<bool, bool, bool>, std::shared_ptr<OpExpr>>(
std::tuple<bool, bool, bool>(has_weight, has_skip, has_bias), op_expr));
} // has_bias
} // has_skip
} // has_weight
} // has_skip
} // has_weight
}

Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
Expand Down Expand Up @@ -1477,7 +1477,7 @@ class MaxUnpoolNDFunctor {
.Input("x")
.Input("indices")
.Output("y")
.Build())){};
.Build())) {};
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
const std::shared_ptr<one::Tensor>& indices,
const std::vector<int32_t>& kernel_size,
Expand Down Expand Up @@ -3456,6 +3456,37 @@ class L2NormalizeFunctor {
auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("epsilon", "axis");
attrs.SetAllAttrs(epsilon, final_dim);

auto device_type = DeviceType::kCPU;
#ifdef WITH_NPU
if (input->is_global()) {
device_type = JUST(input->parallel_desc())->device_type();
} else {
device_type = JUST(input->device())->enum_type();
}
if (device_type == DeviceType::kNPU) {
auto cpu_input = JUST(one::functional::To(input, "cpu"));
if (axis_ == final_dim) {
auto cpu_output = OpInterpUtil::Dispatch<Tensor>(*op_, {cpu_input}, attrs);
return one::functional::To(JUST(cpu_output), "npu");
// if (input->is_global()) {
// return one::functional::To(JUST(cpu_output), input->parallel_desc());
// } else {
// return one::functional::To(JUST(cpu_output), input->device());
// }
} else {
std::vector<int> input_perm(input->shape()->dim_vec().size(), 0);
for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast<int>(i); }
std::swap(input_perm[final_dim], input_perm[static_cast<size_t>(axis_)]);

const auto cpu_result = JUST(OpInterpUtil::Dispatch<TensorTuple>(
*op_, {JUST(functional::Transpose(cpu_input, input_perm))}, attrs));
auto cpu_output = functional::Transpose((*cpu_result)[0], input_perm);
return one::functional::To(JUST(cpu_output), "npu");
}
}

#endif

if (axis_ == final_dim) { return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs); }

std::vector<int> input_perm(input->shape()->dim_vec().size(), 0);
Expand All @@ -3476,7 +3507,7 @@ class NormalizeFunctor {
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const float& p,
const int32_t& dim, const float& eps,
const bool& use_l2_norm_kernel) const {
if (use_l2_norm_kernel && (std::fabs(p - 2.0f) < std::numeric_limits<float>::min())) {
if ((std::fabs(p - 2.0f) < std::numeric_limits<float>::min())) {
return functional::L2Normalize(input, dim, eps);
}
return SequenceFunction<Maybe<Tensor>(const std::shared_ptr<Tensor>&, const float&,
Expand Down
22 changes: 17 additions & 5 deletions python/oneflow/nn/modules/nms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,20 @@


def nms_op(boxes, scores, iou_threshold: float):
    """Perform non-maximum suppression, with a CPU fallback for NPU tensors.

    Args:
        boxes: Tensor[N, 4] of boxes in ``(x1, y1, x2, y2)`` format.
        scores: Tensor[N] of per-box scores.
        iou_threshold: boxes overlapping a kept box with IoU > this are discarded.

    Returns:
        int64 tensor of kept indices into ``boxes``, sorted by decreasing score.
    """
    # NOTE: the pasted diff contained both the old and new bodies back to back;
    # the old early `return` made the NPU branch unreachable. Only the intended
    # new implementation is kept here.
    device = boxes.device
    if device == flow.device("npu"):
        # The NPU backend has no nms kernel: run the whole pipeline on CPU and
        # move only the resulting indices back to the NPU.
        # detach() drops autograd history — presumably intentional since NMS
        # index selection is not differentiable; TODO confirm with callers.
        cpu_boxes = boxes.detach().to("cpu")
        cpu_scores = scores.detach().to("cpu")
        score_inds = flow.argsort(cpu_scores, dim=0, descending=True)
        cpu_boxes = flow._C.gather(cpu_boxes, score_inds, axis=0)
        keep = flow._C.nms(cpu_boxes, iou_threshold)
        index = flow.squeeze(flow.argwhere(keep), dim=[1])
        return flow._C.gather(score_inds, index, axis=0).to(device=device)
    else:
        score_inds = flow.argsort(scores, dim=0, descending=True)
        boxes = flow._C.gather(boxes, score_inds, axis=0)
        keep = flow._C.nms(boxes, iou_threshold)
        index = flow.squeeze(flow.argwhere(keep), dim=[1])
        return flow._C.gather(score_inds, index, axis=0)
32 changes: 23 additions & 9 deletions python/oneflow/nn/modules/unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,30 @@ def unique_op(
tensor([3, 1, 2, 0, 2], dtype=oneflow.int64)

"""
if not return_inverse and not return_counts:
return flow._C.unique(input, sorted, dtype=dtype)
device = input.device
if device == flow.device("npu"):
cpu_input = input.to("cpu")
if not return_inverse and not return_counts:
return flow._C.unique(cpu_input, sorted, dtype=dtype).to(device=device)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
).to(device=device)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
)
if not return_inverse and not return_counts:
return flow._C.unique(input, sorted, dtype=dtype)
else:
return flow._C.unique(
input,
sorted,
return_inverse=return_inverse,
return_counts=return_counts,
dtype=dtype,
)


if __name__ == "__main__":
Expand Down
5 changes: 3 additions & 2 deletions python/oneflow/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""

from .torchvision import _cuda_version,nms

def load_library(path):
    """No-op stub for loading a custom-op library.

    The pasted diff interleaved the old body (``raise ImportError``) with the
    new one, making the new ``print`` unreachable; only the intended new
    behavior is kept: log the request and do nothing.

    Args:
        path: filesystem path of the library a caller asked to load.

    Returns:
        None.
    """
    print("\nload_library: %s >>> do nothing. \n"%(path))
    # raise ImportError("load_library is not implemented")
40 changes: 40 additions & 0 deletions python/oneflow/ops/torchvision/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import oneflow as flow
from oneflow.framework.tensor import Tensor

def _cuda_version():
return -1
# int64_t cuda_version() {
# #ifdef WITH_CUDA
# return CUDA_VERSION;
# #else
# return -1;
# #endif
# }


def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """Run non-maximum suppression (NMS) over ``boxes`` using their IoU.

    Lower-scoring boxes whose IoU with an already-kept (higher-scoring)
    box exceeds ``iou_threshold`` are iteratively discarded.

    When several boxes share an identical score and all satisfy the IoU
    criterion against a reference box, which one is kept may differ
    between CPU and GPU — the same caveat as PyTorch's argsort on
    repeated values.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        scores (Tensor[N]): one score per box.
        iou_threshold (float): overlap threshold; boxes with IoU above it
            relative to a kept box are dropped.

    Returns:
        Tensor: int64 indices of the kept boxes, in decreasing score order.
    """
    kept = flow.nms(boxes, scores, iou_threshold)
    return kept