diff --git a/examples/lib-samples/backend/mindspore/ResNet50/README.md b/examples/lib-samples/backend/mindspore/ResNet50/README.md new file mode 100644 index 000000000..c2085d75c --- /dev/null +++ b/examples/lib-samples/backend/mindspore/ResNet50/README.md @@ -0,0 +1,100 @@ +# Resnet Example with Mindspore Backend +This document describes how to use the mindspore backend to train the ResNet-50 network with the cifar-10 dataset. + +## Script Description + +### Script and Sample Code +```shell +└──ResNet50 + ├── README.md + ├── scripts + ├── run_eval.sh # launch ascend evaluation + ├── run_eval_cpu.sh # launch cpu evaluation + ├── run_infer.sh # launch cpu inference + ├── run_standalone_train.sh # launch ascend standalone training + ├── run_standalone_train_cpu.sh # launch cpu training + ├── src + ├── config.py # parameter configuration + ├── dataset.py # data preprocessing + ├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset + ├── lr_generator.py # generate learning rate for each step + ├── resnet.py # resnet backbone, including resnet50 and resnet101 and se-resnet50 + ├── inference.py # Entrance to inference + ├── interface.py # Implements class "Estimator" + ├── eval.py # Entrance to evaluation + ├── train.py # Entrance to training +``` + +## Script Parameters + +Parameters for both training and evaluation can be set in `src/config.py`. + + +```bash +"class_num": 10, # dataset class num +"batch_size": 32, # batch size of input tensor +"loss_scale": 1024, # loss scale +"momentum": 0.9, # momentum +"weight_decay": 1e-4, # weight decay +"epoch_size": 90, # only valid for training, which is always 1 for inference +"pretrain_epoch_size": 0, # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus pretrain_epoch_size +"save_checkpoint": True, # whether to save checkpoints or not +"save_checkpoint_epochs": 5, # the epoch interval between two checkpoints. 
By default, the last checkpoint will be saved after the last step +"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoints +"warmup_epochs": 5, # number of warmup epochs +"lr_decay_mode": "poly", # decay mode can be selected in steps, poly and default +"lr_init": 0.01, # initial learning rate +"lr_end": 0.00001, # final learning rate +"lr_max": 0.1, # maximum learning rate +``` + +## Preparatory Stage +### Prepare Dataset +In this example, we need to prepare the cifar10 dataset in advance, and put it into `/home/sedna/examples/lib-samples/backend/mindspore/ResNet50/`. +```bash +cd /home/sedna/examples/lib-samples/backend/mindspore/ResNet50 +wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz +tar -zxvf cifar-10-binary.tar.gz +``` +### Parameters +You can change the parameters of the model in `src/config.py`. + +## Modeling Stage +This example supports CPU and NPU. You can follow these steps for training, testing and inference. + +### Training +* #### Running on CPU +```bash + bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH] + # [MODEL_SAVE_PATH] must be ABSOLUTE PATH + # The log message will be shown in the terminal + # The ckpt file would be saved in [MODEL_SAVE_PATH] +``` +* #### Running on NPU +```bash + bash scripts/run_standalone_train.sh [DATASET_PATH] [MODEL_SAVE_PATH] + # [MODEL_SAVE_PATH] must be ABSOLUTE PATH + # The log message would be saved to scripts/train/log + # The ckpt file would be saved in [MODEL_SAVE_PATH] +``` + +### Evaluation +* #### Running on CPU +```bash + bash scripts/run_eval_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH] + # [CHECKPOINT_PATH] must be ABSOLUTE PATH + # The log message would be saved to scripts/test/log +``` +* #### Running on NPU +```bash + bash scripts/run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] + # [CHECKPOINT_PATH] must be ABSOLUTE PATH + # The log message would be saved to scripts/test/log +``` + +### Inference +```bash + bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH] + # 
def main():
    """Evaluate the estimator on the dataset given on the command line.

    Parses the module-level ``parser``, wraps ``Estimator`` with the Sedna
    backend through ``set_backend`` and forwards the parsed options to the
    backend's ``evaluate`` as ``args_opt``.

    Returns:
        Whatever the backend instance's ``evaluate`` call returns.
    """
    options = parser.parse_args()
    backend = set_backend(estimator=Estimator)
    return backend.evaluate(options.dataset_path, args_opt=options)
def preprocess():
    """Build the ordered list of per-image transforms applied before inference.

    The steps are the same non-augmenting transforms that
    ``src/dataset.py::create_dataset1`` applies: resize to 224x224, rescale
    pixel values by 1/255, normalize with the CIFAR-10 mean/std constants and
    convert HWC to CHW.

    Returns:
        list: callables to be applied to the image in order.
    """
    resize = C.Resize((224, 224))
    rescale = C.Rescale(1.0 / 255.0, 0.0)
    normalize = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    transpose = C.HWC2CHW()
    return [resize, rescale, normalize, transpose]


def main():
    """Classify the image passed with ``--image_path`` and return its label.

    Returns:
        The value returned by the backend's ``predict`` call.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``--image_path``.
    """
    args = parser.parse_args()

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an obscure error inside the first transform.
    img = cv2.imread(args.image_path)
    if img is None:
        raise FileNotFoundError(
            "cannot read image: {!r}".format(args.image_path))

    # OpenCV decodes to BGR, while the network is trained on images from
    # Cifar10Dataset and normalized per channel; reorder to RGB so that each
    # channel meets the mean/std it was trained with.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    for method in preprocess():
        img = method(img)

    # add the batch dimension expected by the network
    img = np.expand_dims(img, 0)
    data = Tensor(img, ms.float32)

    model = set_backend(estimator=Estimator)
    return model.predict(data)
class Estimator:
    """ResNet-50 estimator exposing ``train``, ``evaluate``, ``predict`` and ``load``.

    The entry scripts hand this class to ``sedna.backend.set_backend``.
    ``evaluate`` and ``predict`` operate on ``self.network``, which is only
    populated by ``load``; ``train`` builds its own network instance.
    """

    def __init__(self) -> None:
        # set to True by load() once a checkpoint has been read
        self.has_load = False
        # inference/evaluation network; None until load() is called
        self.network = None

    @staticmethod
    def _get_device_id():
        """Return the device index from ``DEVICE_ID``, or 0 when unset/empty."""
        return int(os.getenv('DEVICE_ID') or 0)

    @staticmethod
    def _init_weights(net):
        """Initialize Conv2d weights (Xavier uniform) and Dense weights (truncated normal)."""
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    weight_init.initializer(
                        weight_init.XavierUniform(),
                        cell.weight.shape,
                        cell.weight.dtype))
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    weight_init.initializer(
                        weight_init.TruncatedNormal(),
                        cell.weight.shape,
                        cell.weight.dtype))

    @staticmethod
    def _build_model(net, lr, target):
        """Create the loss, optimizer and ``Model`` wrapper for ``target``.

        Ascend uses a grouped Momentum optimizer (no weight decay on
        beta/gamma/bias parameters) with O2 mixed precision; GPU uses a flat
        Momentum optimizer with O2 mixed precision; CPU trains in fp32
        without loss scaling.
        """
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

        if target == "CPU":
            # fp32 training
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()),
                lr,
                config.momentum,
                config.weight_decay)
            return Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        if target == "Ascend":
            decayed_params = []
            no_decayed_params = []
            for param in net.trainable_params():
                if 'beta' not in param.name and 'gamma' not in param.name \
                        and 'bias' not in param.name:
                    decayed_params.append(param)
                else:
                    no_decayed_params.append(param)
            group_params = [
                {'params': decayed_params, 'weight_decay': config.weight_decay},
                {'params': no_decayed_params},
                {'order_params': net.trainable_params()}]
            opt = Momentum(
                group_params,
                lr,
                config.momentum,
                loss_scale=config.loss_scale)
        else:
            # GPU target
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()),
                lr,
                config.momentum,
                config.weight_decay,
                config.loss_scale)

        # Mixed precision
        loss_scale = FixedLossScaleManager(
            config.loss_scale, drop_overflow_update=False)
        return Model(
            net,
            loss_fn=loss,
            optimizer=opt,
            loss_scale_manager=loss_scale,
            metrics={'acc'},
            amp_level="O2",
            keep_batchnorm_fp32=False)

    def train(self, train_data, **kwargs):
        """The whole process of model training

        The training process of the resnet model. At present, it supports single NPU and CPU.
        Multi-GPU and multi-NPU will be supported in the future.

        Args:
            train_data: training dataset path
            kwargs: Including args_opt and other parameters. args_opt is passed by train.py,
                    includes some key parameters

        """
        args_opt = kwargs.get("args_opt")
        target = args_opt.device_target
        if target == "CPU":
            args_opt.run_distribute = False

        ckpt_save_dir = args_opt.model_save_path

        # Multi-GPU/Multi-NPU
        if args_opt.run_distribute:
            if target == "Ascend":
                context.set_context(
                    device_id=self._get_device_id(),
                    enable_auto_mixed_precision=True)
                context.set_auto_parallel_context(
                    device_num=args_opt.device_num,
                    parallel_mode=ParallelMode.DATA_PARALLEL,
                    gradients_mean=True)
                set_algo_parameters(elementwise_op_strategy_follow=True)
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[85, 160])
                init()
            # GPU target
            else:
                init()
                context.set_auto_parallel_context(
                    device_num=get_group_size(),
                    parallel_mode=ParallelMode.DATA_PARALLEL,
                    gradients_mean=True)
                if args_opt.net == "resnet50":
                    context.set_auto_parallel_context(
                        all_reduce_fusion_config=[85, 160])
            # NOTE(review): this reads args_opt.save_checkpoint_path while the
            # single-device path uses args_opt.model_save_path -- confirm that
            # train.py defines both before enabling distributed training.
            ckpt_save_dir = args_opt.save_checkpoint_path + \
                "ckpt_" + str(get_rank()) + "/"

        # create dataset
        dataset = create_dataset(
            dataset_path=train_data,
            do_train=True,
            repeat_num=1,
            batch_size=config.batch_size,
            target=target,
            distribute=args_opt.run_distribute)
        step_size = dataset.get_dataset_size()

        # define net
        net = resnet(class_num=config.class_num)

        # init weight
        if args_opt.pre_trained:
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        else:
            self._init_weights(net)

        # init lr
        lr = get_lr(
            lr_init=config.lr_init,
            lr_end=config.lr_end,
            lr_max=config.lr_max,
            warmup_epochs=config.warmup_epochs,
            total_epochs=config.epoch_size,
            steps_per_epoch=step_size,
            lr_decay_mode=config.lr_decay_mode)
        lr = Tensor(lr)

        # define opt, loss, model
        model = self._build_model(net, lr, target)

        # define callbacks
        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(
                prefix="resnet",
                directory=ckpt_save_dir,
                config=config_ck)
            cb += [ckpt_cb]

        # train model; dataset sinking is disabled on CPU
        dataset_sink_mode = target != "CPU"
        model.train(
            config.epoch_size - config.pretrain_epoch_size,
            dataset,
            callbacks=cb,
            sink_size=dataset.get_dataset_size(),
            dataset_sink_mode=dataset_sink_mode)

    def evaluate(self, valid_data, **kwargs):
        """The whole process of model evaluation.

        The evaluation process of the resnet model. At present, it supports single NPU and CPU.
        GPU will be supported in the future.

        Args:
            valid_data: evaluation dataset path.
            kwargs: Including args_opt and other parameters. args_opt is passed by eval.py,
                    includes some key parameters.

        Returns:
            The metrics returned by ``Model.eval`` for ``top_1_accuracy`` and
            ``top_5_accuracy``.

        Raises:
            RuntimeError: if ``load`` has not populated ``self.network``.
        """
        if self.network is None:
            raise RuntimeError(
                "model is not loaded; call load() before evaluate()")

        args_opt = kwargs.get("args_opt")
        target = args_opt.device_target
        if target == "Ascend":
            context.set_context(device_id=self._get_device_id())

        # create dataset
        dataset = create_dataset(
            dataset_path=valid_data,
            do_train=False,
            batch_size=config.batch_size,
            target=target)

        # define loss, model
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        model = Model(
            self.network,
            loss_fn=loss,
            metrics={
                'top_1_accuracy',
                'top_5_accuracy'})

        # eval model
        res = model.eval(dataset)
        print("result:", res, "ckpt=", args_opt.checkpoint_path)
        # hand the metrics back so callers (eval.py returns this value) can use them
        return res

    def predict(self, data):
        """Inference for the image data

        Infer the image data and output its category

        Args:
            data: image to be inferred

        Returns:
            str: the predicted CIFAR-10 class name for the first item of ``data``.

        Raises:
            RuntimeError: if ``load`` has not populated ``self.network``.
        """
        if self.network is None:
            raise RuntimeError(
                "model is not loaded; call load() before predict()")

        class_name = [
            'airplane',
            "automobile",
            "bird",
            "cat",
            "deer",
            "dog",
            "frog",
            "horse",
            "ship",
            "truck"]

        # define model
        model = Model(self.network)

        # infer data
        res = model.predict(data)

        # The output of the model is the score of each category, which needs to be softmax.
        softmax = nn.Softmax()
        # convert to numpy explicitly before handing the scores to np.argmax
        scores = softmax(res[0]).asnumpy()
        # get label result
        pred_class = class_name[int(np.argmax(scores))]

        print("This image belongs to: ", pred_class)
        return pred_class

    def load(self, model_url):
        """load checkpoint into model

        Initialize resnet model, and load the specified model file for evaluation and inference

        Args:
            model_url: Url of model file
        """

        print("load model url: ", model_url)
        self.network = resnet(class_num=config.class_num)
        param_dict = load_checkpoint(model_url)
        load_param_into_net(self.network, param_dict)
        # switch to inference behaviour
        self.network.set_train(False)
        self.has_load = True
# Evaluate a checkpoint on an Ascend NPU through the Sedna MindSpore backend.
# Usage: bash scripts/run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
# Run from the example root: the script copies ./*.py, ./scripts and ./src
# into a fresh ./eval working directory.

if [ $# -ne 2 ]
then
    # the script relies on bash-only syntax (${1:0:1}), so advertise bash
    echo "Usage: bash scripts/run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

# Print $1 unchanged when it is absolute, otherwise resolved against $PWD.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        realpath -m "$PWD/$1"
    fi
}

# quoted throughout so that paths containing spaces survive
PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")

if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ ! -f "$PATH2" ]
then
    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
    exit 1
fi

ulimit -u unlimited
dirPath=$(dirname "$PATH2")
ckpt=$(basename "$PATH2")
export MODEL_URL=$dirPath
export MODEL_NAME=$ckpt
export BACKEND_TYPE="MINDSPORE"
export DEVICE_CATEGORY="NPU"
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0

if [ -d "eval" ];
then
    rm -rf ./eval
fi
mkdir ./eval
cp ./*.py ./eval
cp scripts/*.sh ./eval
cp -r src ./eval
cd ./eval || exit 1
env > env.log
echo "start evaluation for device $DEVICE_ID"
python3 eval.py --dataset_path="$PATH1" --checkpoint_path="$PATH2"
# keep the evaluation result as the script's exit status instead of `cd ..`'s
status=$?
cd ..
exit $status
# Run single-image inference on CPU through the Sedna MindSpore backend.
# Usage: bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH]
# Run from the example root: the script copies ./*.py, ./scripts and ./src
# into a fresh ./infer working directory.

if [ $# -ne 2 ]
then
    # the script relies on bash-only syntax (${1:0:1}), so advertise bash
    echo "Usage: bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

# Print $1 unchanged when it is absolute, otherwise resolved against $PWD.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        realpath -m "$PWD/$1"
    fi
}

# quoted throughout so that paths containing spaces survive
PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")

if [ ! -f "$PATH1" ]
then
    # the check is for a regular file, so say so
    echo "error: IMAGE_PATH=$PATH1 is not a file"
    exit 1
fi

if [ ! -f "$PATH2" ]
then
    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
    exit 1
fi

ulimit -u unlimited
dirPath=$(dirname "$PATH2")
ckpt=$(basename "$PATH2")

export MODEL_URL=$dirPath
export MODEL_NAME=$ckpt
export BACKEND_TYPE="MINDSPORE"
export DEVICE_CATEGORY="CPU"

if [ -d "infer" ];
then
    rm -rf ./infer
fi
mkdir ./infer
cp ./*.py ./infer
cp scripts/*.sh ./infer
cp -r src ./infer
cd ./infer || exit 1
env > env.log
# DEVICE_ID is never set by this script; report the device category instead
echo "start inference for $DEVICE_CATEGORY"
python3 inference.py --image_path="$PATH1" --checkpoint_path="$PATH2"
# keep the inference result as the script's exit status instead of `cd ..`'s
status=$?
cd ..
exit $status
# Train on CPU through the Sedna MindSpore backend.
# Usage: bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH]
# Run from the example root: the script copies ./*.py, ./scripts and ./src
# into a fresh ./train working directory.

if [ $# -ne 2 ]
then
    # the script relies on bash-only syntax (${1:0:1}), so advertise bash
    echo "Usage: bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH]"
    exit 1
fi

# Print $1 unchanged when it is absolute, otherwise resolved against $PWD.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        realpath -m "$PWD/$1"
    fi
}

# quoted throughout so that paths containing spaces survive
PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")
export BACKEND_TYPE="MINDSPORE"
export DEVICE_CATEGORY="CPU"

if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ ! -d "$PATH2" ]
then
    echo "error: MODEL_SAVE_PATH=$PATH2 is not a directory"
    exit 1
fi

ulimit -u unlimited

if [ -d "train" ];
then
    rm -rf ./train
fi
mkdir ./train
cp ./*.py ./train
cp scripts/*.sh ./train
cp -r src ./train
cd ./train || exit 1
echo "start training for CPU"
env > env.log
python3 train.py --device_target="CPU" --dataset_path="$PATH1" --model_save_path="$PATH2"
# keep the training result as the script's exit status instead of `cd ..`'s
status=$?
cd ..
exit $status
class CrossEntropySmooth(_Loss):
    """Softmax cross-entropy loss with optional label smoothing.

    Args:
        sparse (bool): when True, ``construct`` one-hot encodes ``label``
            with the smoothed on/off values before computing the loss.
            Default: True
        reduction (str): forwarded to ``nn.SoftmaxCrossEntropyWithLogits``.
            Default: 'mean'
        smooth_factor (float): the on value is ``1 - smooth_factor`` and the
            off value is ``smooth_factor / (num_classes - 1)``. Default: 0.
        num_classes (int): number of classes used to spread the smoothing
            mass. Default: 1000
    """
    def __init__(self, sparse=True, reduction='mean', smooth_factor=0., num_classes=1000):
        super().__init__()
        target_weight = 1.0 - smooth_factor
        other_weight = 1.0 * smooth_factor / (num_classes - 1)
        self.onehot = P.OneHot()
        self.sparse = sparse
        self.on_value = Tensor(target_weight, mstype.float32)
        self.off_value = Tensor(other_weight, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction)

    def construct(self, logit, label):
        """Return the cross-entropy of ``logit`` against ``label``."""
        if self.sparse:
            # one-hot depth is taken from the second dimension of logit
            depth = F.shape(logit)[1]
            label = self.onehot(label, depth, self.on_value, self.off_value)
        return self.ce(logit, label)
# config for resnet50, cifar10
config1 = ed({
    "class_num": 10,  # number of dataset classes
    "batch_size": 32,  # batch size of the input tensor
    "loss_scale": 1024,  # loss scale
    "momentum": 0.9,  # optimizer momentum
    "weight_decay": 1e-4,  # weight decay
    "epoch_size": 90,  # total number of training epochs
    "pretrain_epoch_size": 0,  # epochs already trained; subtracted from epoch_size when training
    "save_checkpoint": True,  # whether to save checkpoints
    "save_checkpoint_epochs": 5,  # epoch interval between two checkpoints
    "keep_checkpoint_max": 10,  # maximum number of checkpoints kept
    "warmup_epochs": 5,  # number of warmup epochs
    "lr_decay_mode": "poly",  # learning-rate decay mode passed to get_lr
    "lr_init": 0.01,  # initial learning rate
    "lr_end": 0.00001,  # final learning rate
    "lr_max": 0.1  # maximum learning rate
})
def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
    """
    Build the CIFAR-10 input pipeline (train or eval) for resnet50.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False

    Returns:
        dataset
    """
    # resolve how many shards exist and which one this process reads
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    elif distribute:
        init()
        rank_id = get_rank()
        device_num = get_group_size()
    else:
        device_num = 1

    # sharding arguments are only supplied when more than one device is used
    shard_args = {}
    if device_num != 1:
        shard_args = {"num_shards": device_num, "shard_id": rank_id}
    cifar = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, **shard_args)

    # augmentation runs only while training
    augment_ops = []
    if do_train:
        augment_ops = [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]
    image_ops = augment_ops + [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]
    label_op = C2.TypeCast(mstype.int32)

    # labels are cast first, then the image transforms are applied
    pipeline = cifar.map(operations=label_op, input_columns="label", num_parallel_workers=8)
    pipeline = pipeline.map(operations=image_ops, input_columns="image", num_parallel_workers=8)

    # batch (dropping the incomplete tail), then repeat
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.repeat(repeat_num)
create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): + """ + create a train or eval imagenet2012 dataset for resnet50 + + Args: + dataset_path(string): the path of dataset. + do_train(bool): whether dataset is used for train or eval. + repeat_num(int): the repeat times of dataset. Default: 1 + batch_size(int): the batch size of dataset. Default: 32 + target(str): the device target. Default: Ascend + distribute(bool): data for distribute or not. Default: False + + Returns: + dataset + """ + if target == "Ascend": + device_num, rank_id = _get_rank_info() + else: + if distribute: + init() + rank_id = get_rank() + device_num = get_group_size() + else: + device_num = 1 + + if device_num == 1: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + else: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) + + image_size = 224 + mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] + std = [0.229 * 255, 0.224 * 255, 0.225 * 255] + + # define map operations + if do_train: + trans = [ + C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), + C.RandomHorizontalFlip(prob=0.5), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + else: + trans = [ + C.Decode(), + C.Resize(256), + C.CenterCrop(image_size), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + + type_cast_op = C2.TypeCast(mstype.int32) + + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + + # apply batch operations + data_set = data_set.batch(batch_size, drop_remainder=True) + + # apply dataset repeat operation + data_set = data_set.repeat(repeat_num) + + return data_set + + +def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): + """ + 
create a train or eval imagenet2012 dataset for resnet101 + Args: + dataset_path(string): the path of dataset. + do_train(bool): whether dataset is used for train or eval. + repeat_num(int): the repeat times of dataset. Default: 1 + batch_size(int): the batch size of dataset. Default: 32 + target(str): the device target. Default: Ascend + distribute(bool): data for distribute or not. Default: False + + Returns: + dataset + """ + if target == "Ascend": + device_num, rank_id = _get_rank_info() + else: + if distribute: + init() + rank_id = get_rank() + device_num = get_group_size() + else: + device_num = 1 + rank_id = 1 + if device_num == 1: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + else: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) + image_size = 224 + mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] + std = [0.275 * 255, 0.267 * 255, 0.278 * 255] + + # define map operations + if do_train: + trans = [ + C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), + C.RandomHorizontalFlip(rank_id / (rank_id + 1)), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + else: + trans = [ + C.Decode(), + C.Resize(256), + C.CenterCrop(image_size), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + + type_cast_op = C2.TypeCast(mstype.int32) + + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + + # apply batch operations + data_set = data_set.batch(batch_size, drop_remainder=True) + # apply dataset repeat operation + data_set = data_set.repeat(repeat_num) + + return data_set + + +def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): + """ + create a train or eval imagenet2012 dataset for se-resnet50 + + Args: + dataset_path(string): 
the path of dataset. + do_train(bool): whether dataset is used for train or eval. + repeat_num(int): the repeat times of dataset. Default: 1 + batch_size(int): the batch size of dataset. Default: 32 + target(str): the device target. Default: Ascend + distribute(bool): data for distribute or not. Default: False + + Returns: + dataset + """ + if target == "Ascend": + device_num, rank_id = _get_rank_info() + else: + if distribute: + init() + rank_id = get_rank() + device_num = get_group_size() + else: + device_num = 1 + if device_num == 1: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True) + else: + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True, + num_shards=device_num, shard_id=rank_id) + image_size = 224 + mean = [123.68, 116.78, 103.94] + std = [1.0, 1.0, 1.0] + + # define map operations + if do_train: + trans = [ + C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), + C.RandomHorizontalFlip(prob=0.5), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + else: + trans = [ + C.Decode(), + C.Resize(292), + C.CenterCrop(256), + C.Normalize(mean=mean, std=std), + C.HWC2CHW() + ] + + type_cast_op = C2.TypeCast(mstype.int32) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) + + # apply batch operations + data_set = data_set.batch(batch_size, drop_remainder=True) + + # apply dataset repeat operation + data_set = data_set.repeat(repeat_num) + + return data_set + + +def _get_rank_info(): + """ + get rank size and rank id + """ + rank_size = int(os.environ.get("RANK_SIZE", 1)) + + if rank_size > 1: + rank_size = get_group_size() + rank_id = get_rank() + else: + rank_size = 1 + rank_id = 0 + + return rank_size, rank_id diff --git a/examples/lib-samples/backend/mindspore/ResNet50/src/lr_generator.py 
# ---- new file: examples/lib-samples/backend/mindspore/ResNet50/src/lr_generator.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import math
import numpy as np


def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps):
    """
    Applies three steps decay to generate learning rate array.

    Args:
       lr_init(float): init learning rate.
       lr_max(float): max learning rate.
       total_steps(int): all steps in training.
       warmup_steps(int): all steps in warmup epochs.

    Returns:
       list, learning rate of each step.
    """
    # Boundaries at 30%, 60% and 80% of training; each one divides the rate by 10.
    decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        elif i < decay_epoch_index[0]:
            lr = lr_max
        elif i < decay_epoch_index[1]:
            lr = lr_max * 0.1
        elif i < decay_epoch_index[2]:
            lr = lr_max * 0.01
        else:
            lr = lr_max * 0.001
        lr_each_step.append(lr)
    return lr_each_step


def _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
    """
    Applies polynomial decay to generate learning rate array.

    Args:
       lr_init(float): init learning rate.
       lr_end(float): end learning rate. Accepted for signature parity with the
           other generators; this schedule does not read it.
       lr_max(float): max learning rate.
       total_steps(int): all steps in training.
       warmup_steps(int): all steps in warmup epochs.

    Returns:
       list, learning rate of each step.
    """
    if warmup_steps != 0:
        inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
    else:
        inc_each_step = 0
    # Only read after warm-up, i.e. when total_steps > warmup_steps, so it is never a zero divisor.
    decay_steps = float(total_steps) - float(warmup_steps)

    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = float(lr_init) + inc_each_step * float(i)
        else:
            base = 1.0 - (float(i) - float(warmup_steps)) / decay_steps
            lr = float(lr_max) * base * base
            if lr < 0.0:
                lr = 0.0
        lr_each_step.append(lr)
    return lr_each_step


def _cosine_decay(step, total_steps, decay_steps):
    """Return the multiplier applied to the peak rate at `step` by the cosine schedule."""
    linear_decay = (total_steps - step) / decay_steps
    cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * step / decay_steps))
    # The small constant keeps the multiplier strictly positive.
    return linear_decay * cosine_decay + 0.00001


def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
    """
    Applies cosine decay to generate learning rate array.

    Args:
       lr_init(float): init learning rate.
       lr_end(float): end learning rate. Accepted for signature parity with the
           other generators; this schedule does not read it.
       lr_max(float): max learning rate.
       total_steps(int): all steps in training.
       warmup_steps(int): all steps in warmup epochs.

    Returns:
       list, learning rate of each step.
    """
    decay_steps = total_steps - warmup_steps
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            # Warm-up is 1-based here: the last warm-up step already reaches lr_max.
            lr = linear_warmup_lr(i + 1, warmup_steps, lr_max, lr_init)
        else:
            lr = lr_max * _cosine_decay(i, total_steps, decay_steps)
        lr_each_step.append(lr)
    return lr_each_step


def _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
    """
    Applies liner decay to generate learning rate array.

    Args:
       lr_init(float): init learning rate.
       lr_end(float): end learning rate
       lr_max(float): max learning rate.
       total_steps(int): all steps in training.
       warmup_steps(int): all steps in warmup epochs.

    Returns:
       list, learning rate of each step.
    """
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
        lr_each_step.append(lr)
    return lr_each_step


def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    generate learning rate array

    Args:
       lr_init(float): init learning rate
       lr_end(float): end learning rate
       lr_max(float): max learning rate
       warmup_epochs(int): number of warmup epochs
       total_epochs(int): total epoch of training
       steps_per_epoch(int): steps of one epoch
       lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or liner(default)

    Returns:
       np.array, float32 learning rate array with one entry per training step
    """
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs

    if lr_decay_mode == 'steps':
        lr_each_step = _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps)
    elif lr_decay_mode == 'poly':
        lr_each_step = _generate_poly_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
    elif lr_decay_mode == 'cosine':
        lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
    else:
        # Any unrecognised mode falls back to linear decay.
        lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)

    return np.array(lr_each_step).astype(np.float32)


def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """
    linearly interpolate the learning rate during warmup

    Args:
       current_step(int): index of the step inside the warmup phase
       warmup_steps(int): number of warmup steps
       base_lr(float): learning rate reached when current_step equals warmup_steps
       init_lr(float): learning rate at step 0

    Returns:
       float, learning rate for current_step; base_lr when warmup_steps is 0
    """
    if warmup_steps == 0:
        # No warm-up phase to interpolate over; avoid dividing by zero.
        return float(base_lr)
    lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    return float(init_lr) + lr_inc * current_step


def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
    """
    generate learning rate array with cosine

    Args:
       lr(float): base learning rate
       steps_per_epoch(int): steps size of one epoch
       warmup_epochs(int): number of warmup epochs
       max_epoch(int): total epochs of training
       global_step(int): the current start index of lr array
    Returns:
       np.array, float32 learning rate array starting at global_step
    """
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    # Same schedule as the 'cosine' mode of get_lr, warming up from 0; lr_end is not read by it.
    lr_each_step = _generate_cosine_lr(0, None, lr, total_steps, warmup_steps)

    lr_each_step = np.array(lr_each_step).astype(np.float32)
    return lr_each_step[global_step:]

# ---- new file: examples/lib-samples/backend/mindspore/ResNet50/src/resnet.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ResNet."""
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common.tensor import Tensor
from scipy.stats import truncnorm


def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size):
    """Draw conv weights from a normal truncated at two standard deviations, scaled by 1/fan_in."""
    fan_in = in_channel * kernel_size * kernel_size
    stddev = ((1.0 / max(1., fan_in)) ** 0.5) / .87962566103423978
    shape = (out_channel, in_channel, kernel_size, kernel_size)
    samples = truncnorm(-2, 2, loc=0, scale=stddev).rvs(out_channel * in_channel * kernel_size * kernel_size)
    return Tensor(np.reshape(samples, shape), dtype=mstype.float32)


def _weight_variable(shape, factor=0.01):
    """Return a float32 Tensor of standard-normal samples multiplied by `factor`."""
    return Tensor(np.random.randn(*shape).astype(np.float32) * factor)


def _same_conv(in_channel, out_channel, kernel_size, stride, use_se):
    """Build a square-kernel, 'same'-padded Conv2d with the initialiser selected by `use_se`."""
    if use_se:
        weight = _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=kernel_size)
    else:
        weight = _weight_variable((out_channel, in_channel, kernel_size, kernel_size))
    return nn.Conv2d(in_channel, out_channel,
                     kernel_size=kernel_size, stride=stride, padding=0, pad_mode='same', weight_init=weight)


def _conv3x3(in_channel, out_channel, stride=1, use_se=False):
    """3x3 convolution with 'same' padding."""
    return _same_conv(in_channel, out_channel, 3, stride, use_se)


def _conv1x1(in_channel, out_channel, stride=1, use_se=False):
    """1x1 convolution with 'same' padding."""
    return _same_conv(in_channel, out_channel, 1, stride, use_se)


def _conv7x7(in_channel, out_channel, stride=1, use_se=False):
    """7x7 convolution with 'same' padding."""
    return _same_conv(in_channel, out_channel, 7, stride, use_se)


def _bn(channel):
    """BatchNorm2d whose gamma starts at 1."""
    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
                          gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)


def _bn_last(channel):
    """BatchNorm2d whose gamma starts at 0; used as the last normalisation of a residual branch."""
    return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
                          gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)


def _fc(in_channel, out_channel, use_se=False):
    """Dense layer with zero bias; weights are N(0, 0.01) samples from either initialiser path."""
    if use_se:
        samples = np.random.normal(loc=0, scale=0.01, size=out_channel*in_channel)
        weight = Tensor(np.reshape(samples, (out_channel, in_channel)), dtype=mstype.float32)
    else:
        weight = _weight_variable((out_channel, in_channel))
    return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=weight, bias_init=0)


class ResidualBlock(nn.Cell):
    """
    ResNet V1 residual block definition.

    Args:
        in_channel (int): Input channel.
        out_channel (int): Output channel.
        stride (int): Stride size for the first convolutional layer. Default: 1.
        use_se (bool): enable SE-ResNet50 net. Default: False.
        se_block(bool): use se block in SE-ResNet50 net. Default: False.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> ResidualBlock(3, 256, stride=2)
    """
    expansion = 4

    def __init__(self,
                 in_channel,
                 out_channel,
                 stride=1,
                 use_se=False, se_block=False):
        super(ResidualBlock, self).__init__()
        self.stride = stride
        self.use_se = use_se
        self.se_block = se_block

        # Bottleneck width: the inner convolutions run on out_channel / expansion channels.
        channel = out_channel // self.expansion
        self.conv1 = _conv1x1(in_channel, channel, stride=1, use_se=self.use_se)
        self.bn1 = _bn(channel)

        if self.use_se and self.stride != 1:
            # SE variant: unstrided 3x3 conv followed by max-pooling instead of a strided conv.
            self.e2 = nn.SequentialCell([_conv3x3(channel, channel, stride=1, use_se=True), _bn(channel),
                                         nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same')])
        else:
            self.conv2 = _conv3x3(channel, channel, stride=stride, use_se=self.use_se)
            self.bn2 = _bn(channel)

        self.conv3 = _conv1x1(channel, out_channel, stride=1, use_se=self.use_se)
        self.bn3 = _bn_last(out_channel)

        if self.se_block:
            self.se_global_pool = P.ReduceMean(keep_dims=False)
            self.se_dense_0 = _fc(out_channel, int(out_channel/4), use_se=self.use_se)
            self.se_dense_1 = _fc(int(out_channel/4), out_channel, use_se=self.use_se)
            self.se_sigmoid = nn.Sigmoid()
            self.se_mul = P.Mul()
        self.relu = nn.ReLU()

        # The shortcut needs a projection whenever the main branch changes resolution or width.
        self.down_sample = stride != 1 or in_channel != out_channel
        self.down_sample_layer = None

        if self.down_sample:
            if self.use_se and stride != 1:
                self.down_sample_layer = nn.SequentialCell([nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same'),
                                                            _conv1x1(in_channel, out_channel, 1,
                                                                     use_se=self.use_se), _bn(out_channel)])
            else:
                self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride,
                                                                     use_se=self.use_se), _bn(out_channel)])
        self.add = P.Add()

    def construct(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        if self.use_se and self.stride != 1:
            out = self.e2(out)
        else:
            out = self.conv2(out)
            out = self.bn2(out)
            out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.se_block:
            # Squeeze-and-excitation: pool over axes (2, 3), gate each channel, rescale the features.
            out_se = out
            out = self.se_global_pool(out, (2, 3))
            out = self.se_dense_0(out)
            out = self.relu(out)
            out = self.se_dense_1(out)
            out = self.se_sigmoid(out)
            out = F.reshape(out, F.shape(out) + (1, 1))
            out = self.se_mul(out, out_se)

        if self.down_sample:
            identity = self.down_sample_layer(identity)

        out = self.add(out, identity)
        out = self.relu(out)

        return out


class ResNet(nn.Cell):
    """
    ResNet architecture.

    Args:
        block (Cell): Block for network.
        layer_nums (list): Numbers of block in different layers.
        in_channels (list): Input channel in each layer.
        out_channels (list): Output channel in each layer.
        strides (list): Stride size in each layer.
        num_classes (int): The number of classes that the training images are belonging to.
        use_se (bool): enable SE-ResNet50 net; also turns on the se block at the
            end of layer 3 and layer 4. Default: False.
    Returns:
        Tensor, output tensor.

    Examples:
        >>> ResNet(ResidualBlock,
        >>>        [3, 4, 6, 3],
        >>>        [64, 256, 512, 1024],
        >>>        [256, 512, 1024, 2048],
        >>>        [1, 2, 2, 2],
        >>>        10)
    """

    def __init__(self,
                 block,
                 layer_nums,
                 in_channels,
                 out_channels,
                 strides,
                 num_classes,
                 use_se=False):
        super(ResNet, self).__init__()

        if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
            raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
        self.use_se = use_se
        self.se_block = bool(self.use_se)

        if self.use_se:
            # SE stem: three stacked 3x3 convolutions instead of a single 7x7.
            self.conv1_0 = _conv3x3(3, 32, stride=2, use_se=self.use_se)
            self.bn1_0 = _bn(32)
            self.conv1_1 = _conv3x3(32, 32, stride=1, use_se=self.use_se)
            self.bn1_1 = _bn(32)
            self.conv1_2 = _conv3x3(32, 64, stride=1, use_se=self.use_se)
        else:
            self.conv1 = _conv7x7(3, 64, stride=2)
        self.bn1 = _bn(64)
        self.relu = P.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

        self.layer1 = self._make_layer(block,
                                       layer_nums[0],
                                       in_channel=in_channels[0],
                                       out_channel=out_channels[0],
                                       stride=strides[0],
                                       use_se=self.use_se)
        self.layer2 = self._make_layer(block,
                                       layer_nums[1],
                                       in_channel=in_channels[1],
                                       out_channel=out_channels[1],
                                       stride=strides[1],
                                       use_se=self.use_se)
        # Only the last two stages receive the se_block flag.
        self.layer3 = self._make_layer(block,
                                       layer_nums[2],
                                       in_channel=in_channels[2],
                                       out_channel=out_channels[2],
                                       stride=strides[2],
                                       use_se=self.use_se,
                                       se_block=self.se_block)
        self.layer4 = self._make_layer(block,
                                       layer_nums[3],
                                       in_channel=in_channels[3],
                                       out_channel=out_channels[3],
                                       stride=strides[3],
                                       use_se=self.use_se,
                                       se_block=self.se_block)

        self.mean = P.ReduceMean(keep_dims=True)
        self.flatten = nn.Flatten()
        self.end_point = _fc(out_channels[3], num_classes, use_se=self.use_se)

    def _make_layer(self, block, layer_num, in_channel, out_channel, stride, use_se=False, se_block=False):
        """
        Make stage network of ResNet.

        Args:
            block (Cell): Resnet block.
            layer_num (int): Layer number.
            in_channel (int): Input channel.
            out_channel (int): Output channel.
            stride (int): Stride size for the first convolutional layer.
            use_se (bool): enable SE-ResNet50 net. Default: False.
            se_block(bool): use se block in SE-ResNet50 net. Default: False.
        Returns:
            SequentialCell, the output layer.

        Examples:
            >>> _make_layer(ResidualBlock, 3, 128, 256, 2)
        """
        # With se_block, the final block of the stage is the one carrying the SE module,
        # so one fewer plain block is built in between.
        plain_blocks = layer_num - 2 if se_block else layer_num - 1

        layers = [block(in_channel, out_channel, stride=stride, use_se=use_se)]
        for _ in range(plain_blocks):
            layers.append(block(out_channel, out_channel, stride=1, use_se=use_se))
        if se_block:
            layers.append(block(out_channel, out_channel, stride=1, use_se=use_se, se_block=se_block))
        return nn.SequentialCell(layers)

    def construct(self, x):
        if self.use_se:
            x = self.conv1_0(x)
            x = self.bn1_0(x)
            x = self.relu(x)
            x = self.conv1_1(x)
            x = self.bn1_1(x)
            x = self.relu(x)
            x = self.conv1_2(x)
        else:
            x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        c1 = self.maxpool(x)

        c2 = self.layer1(c1)
        c3 = self.layer2(c2)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)

        out = self.mean(c5, (2, 3))
        out = self.flatten(out)
        out = self.end_point(out)

        return out


def resnet50(class_num=10):
    """
    Get ResNet50 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of ResNet50 neural network.

    Examples:
        >>> net = resnet50(10)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 6, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num)


def se_resnet50(class_num=1001):
    """
    Get SE-ResNet50 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of SE-ResNet50 neural network.

    Examples:
        >>> net = se_resnet50(1001)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 6, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num,
                  use_se=True)


def resnet101(class_num=1001):
    """
    Get ResNet101 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of ResNet101 neural network.

    Examples:
        >>> net = resnet101(1001)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 23, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num)

# ---- new file: examples/lib-samples/backend/mindspore/ResNet50/train.py ----
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# ============================================================================ +"""train resnet.""" +import argparse +import ast +from mindspore.common import set_seed +from lib.sedna.backend import set_backend +from interface import Estimator + + +parser = argparse.ArgumentParser(description='Image classification') +parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') +parser.add_argument('--device_num', type=int, default=1, help='Device num.') +parser.add_argument('--model_save_path', type=str, default="") + +parser.add_argument('--dataset_path', type=str, default="", help='Dataset path') +parser.add_argument('--device_target', type=str, default='Ascend', choices=("Ascend", "GPU", "CPU"), + help="Device target, support Ascend, GPU and CPU.") +parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') + +set_seed(1) + + +def main(): + args_opt = parser.parse_args() + train_data = args_opt.dataset_path + instance = set_backend(estimator=Estimator) + return instance.train(train_data, args_opt=args_opt) + + +if __name__ == '__main__': + main() diff --git a/lib/sedna/backend/__init__.py b/lib/sedna/backend/__init__.py index 5d3217fd3..ca79bde78 100644 --- a/lib/sedna/backend/__init__.py +++ b/lib/sedna/backend/__init__.py @@ -23,10 +23,11 @@ def set_backend(estimator=None, config=None): """Create Trainer class""" if estimator is None: - return + return None if config is None: config = BaseConfig() use_cuda = False + use_npu = False backend_type = os.getenv( 'BACKEND_TYPE', config.get("backend_type", "UNKNOWN") ) @@ -34,7 +35,12 @@ def set_backend(estimator=None, config=None): device_category = os.getenv( 'DEVICE_CATEGORY', config.get("device_category", "CPU") ) - if 'CUDA_VISIBLE_DEVICES' in os.environ: + + # NPU>GPU>CPU + if device_category == "NPU": + use_npu = True + os.environ['DEVICE_CATEGORY'] = "NPU" + elif 'CUDA_VISIBLE_DEVICES' in os.environ: os.environ['DEVICE_CATEGORY'] = 
'GPU' use_cuda = True else: @@ -44,6 +50,8 @@ def set_backend(estimator=None, config=None): from sedna.backend.tensorflow import TFBackend as REGISTER elif backend_type == "KERAS": from sedna.backend.tensorflow import KerasBackend as REGISTER + elif backend_type == "MINDSPORE": + from sedna.backend.mindspore import MSBackend as REGISTER elif backend_type == "TORCH": from sedna.backend.torch import TorchBackend as REGISTER else: @@ -56,6 +64,7 @@ def set_backend(estimator=None, config=None): return REGISTER( estimator=estimator, use_cuda=use_cuda, + use_npu=use_npu, model_save_path=base_model_save, model_name=model_save_name, model_save_url=model_save_url diff --git a/lib/sedna/backend/base.py b/lib/sedna/backend/base.py index 72023b7eb..2f7a69d90 100644 --- a/lib/sedna/backend/base.py +++ b/lib/sedna/backend/base.py @@ -24,6 +24,7 @@ class BackendBase: def __init__(self, estimator, fine_tune=True, **kwargs): self.framework = "" self.estimator = estimator + self.use_npu = True if kwargs.get("use_npu") else False self.use_cuda = True if kwargs.get("use_cuda") else False self.fine_tune = fine_tune self.model_save_path = kwargs.get("model_save_path") or "/tmp" @@ -34,8 +35,13 @@ def __init__(self, estimator, fine_tune=True, **kwargs): def model_name(self): if self.default_name: return self.default_name - model_postfix = {"pytorch": [".pth", ".pt"], - "keras": ".pb", "tensorflow": ".pb"} + + model_postfix = { + "pytorch": [".pth", ".pt"], + "keras": ".pb", + "tensorflow": ".pb", + "mindspore": ".ckpt"} + continue_flag = "_finetune_" if self.fine_tune else "" post_fix = model_postfix.get(self.framework, ".pkl") return f"model{continue_flag}{self.framework}{post_fix}" diff --git a/lib/sedna/backend/mindspore/__init__.py b/lib/sedna/backend/mindspore/__init__.py new file mode 100644 index 000000000..b58d336ba --- /dev/null +++ b/lib/sedna/backend/mindspore/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2021 The KubeEdge Authors. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import mindspore.context as context
from sedna.backend.base import BackendBase
from sedna.common.file_ops import FileOps


class MSBackend(BackendBase):
    """Sedna backend that configures MindSpore and forwards train/predict/evaluate to the estimator."""

    def __init__(self, estimator, fine_tune=True, **kwargs):
        """
        Set up the MindSpore context and instantiate the estimator.

        Args:
            estimator: estimator instance, or a callable that returns one.
            fine_tune (bool): forwarded to BackendBase; gates the finetune hook in train().
            **kwargs: forwarded to BackendBase.
        """
        super(MSBackend, self).__init__(estimator=estimator,
                                        fine_tune=fine_tune,
                                        **kwargs)
        self.framework = "mindspore"

        # Device priority is NPU, then GPU, then CPU; the context always runs in graph mode.
        if self.use_npu:
            device_target = "Ascend"
        elif self.use_cuda:
            device_target = "GPU"
        else:
            device_target = "CPU"
        context.set_context(mode=context.GRAPH_MODE, device_target=device_target)

        # A class or factory is turned into an instance here.
        if callable(self.estimator):
            self.estimator = self.estimator()

    def train(self, train_data, valid_data=None, **kwargs):
        """
        Train the estimator and return whatever its train() returns.

        Args:
            train_data: training data handed to the estimator.
            valid_data: optional validation data handed to the estimator.
            **kwargs: extra options, passed through self.parse_kwargs before being
                forwarded (presumably filtered to what estimator.train accepts;
                the helper is not defined in this file).
        """
        if callable(self.estimator):
            self.estimator = self.estimator()
        if self.fine_tune and FileOps.exists(self.model_save_path):
            self.finetune()
        # Mark the model as loaded so predict()/evaluate() skip self.load().
        self.has_load = True
        train_kwargs = self.parse_kwargs(self.estimator.train, **kwargs)
        return self.estimator.train(train_data=train_data,
                                    valid_data=valid_data,
                                    **train_kwargs)

    def predict(self, data, **kwargs):
        """Run estimator.predict on `data`, calling self.load() first if has_load is not set."""
        if not self.has_load:
            self.load()
        predict_kwargs = self.parse_kwargs(self.estimator.predict, **kwargs)
        return self.estimator.predict(data=data, **predict_kwargs)

    def evaluate(self, data, **kwargs):
        """Run estimator.evaluate on `data`, calling self.load() first if has_load is not set."""
        if not self.has_load:
            self.load()
        evaluate_kwargs = self.parse_kwargs(self.estimator.evaluate, **kwargs)
        return self.estimator.evaluate(data, **evaluate_kwargs)

    def finetune(self):
        """todo: no support yet"""

    def load_weights(self):
        """Load weights from model_save_path/model_name into the estimator, if that file exists."""
        weights_file = FileOps.join_path(self.model_save_path, self.model_name)
        if not os.path.exists(weights_file):
            return
        self.estimator.load_weights(weights_file)

    def get_weights(self):
        """todo: no support yet"""

    def set_weights(self, weights):
        """todo: no support yet"""