Skip to content

Commit acf00c4

Browse files
committed
Added gcp ai platform scripts to run trainings
1 parent 316480b commit acf00c4

5 files changed

Lines changed: 184 additions & 0 deletions

File tree

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
# Training image for distributed CIFAR10 runs on GCP AI Platform.
# Based on the upstream PyTorch image; bakes in the dataset, the training
# code, and the Cloud SDK (gsutil) used to export results to GCS.
FROM pytorch/pytorch:latest

WORKDIR /workspace

# Setup dependencies (ignite from the "distrib" branch for distributed helpers)
RUN pip install tqdm tensorboardX git+https://github.com/pytorch/ignite.git@distrib

# Setup dataset: pre-download CIFAR10 so training nodes do not hit the network
RUN mkdir /workspace/data && python -c 'from torchvision.datasets.cifar import CIFAR10; CIFAR10("/workspace/data", download=True)'

# Copy training code and cluster helper scripts
RUN mkdir /workspace/code
COPY fastresnet.py /workspace/code/fastresnet.py
COPY main.py /workspace/code/main.py
COPY utils.py /workspace/code/utils.py
COPY gcp_ai_platform/entrypoint.sh /workspace/code/entrypoint.sh
COPY gcp_ai_platform/parse_cluster_spec.py /workspace/code/parse_cluster_spec.py

# Installs google cloud sdk, this is mostly for using gsutil to export model.
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz -o google-cloud-sdk.tar.gz && \
    mkdir /root/tools && \
    tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
    rm google-cloud-sdk.tar.gz && \
    /root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
        --path-update=false --bash-completion=false \
        --disable-installation-options && \
    rm -rf /root/.config/* && \
    ln -s /root/.config /config && \
    # Remove the backup directory that gcloud creates
    rm -rf /root/tools/google-cloud-sdk/.install/.backup

# Path configuration (ENV key=value is the preferred, unambiguous form)
ENV PATH=$PATH:/root/tools/google-cloud-sdk/bin

# Make sure gsutil will use the default service account.
# printf is used instead of echo so the \n escape is interpreted regardless
# of which shell backs RUN (bash's echo would print a literal "\n").
RUN printf '[GoogleCompute]\nservice_account = default\n' > /etc/boto.cfg

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["/bin/bash", "/workspace/code/entrypoint.sh"]
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# Distributed training on CIFAR10 with GCP AI Platform

Helper scripts to run distributed training on [GCP AI Platform](https://cloud.google.com/ml-engine/docs/).

To use the scripts, the user needs to have:
- an account on GCP with AI Platform enabled, see [here for details](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-keras#set_up_your_project)
- `gcloud` installed and properly configured
- `docker`

## Setup

- Create the output bucket
```
gsutil mb -p your-project -l region gs://output-cifar10/
# e.g. gsutil mb -p ignite-distrib -l us-east1 gs://output-cifar10/
```

- Configure local docker to push to GCR

```
gcloud auth configure-docker
```

## Start training

By default, we use `n1-standard-4` machines and `nvidia-tesla-k80` accelerators for the training. For other configurations, please edit `submit_job.sh`.
To start training, simply execute:
```
sh gcp_ai_platform/submit_job.sh your-project region num_nodes num_gpus_per_node
# e.g. sh gcp_ai_platform/submit_job.sh ignite-distrib us-east1 2 4
```

### Logs visualization - Tensorboard

By default, AI Platform provides streamed logs in the web interface, see the docs [here](https://cloud.google.com/ml-engine/docs/monitor-training#checking_job_status).
In addition, the user can set up [Tensorboard locally or from Cloud Shell](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#tensorboard-local):
```
# in cloud shell
tensorboard --logdir=gs://output-cifar10
```
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Container entry point for a single AI Platform node.
# Parses CLUSTER_SPEC (set by AI Platform), launches distributed training via
# torch.distributed.launch and, on the master node only, copies the output to
# the GCS bucket.
#
# Arguments (forwarded by submit_job.sh after "--"):
#   $1 - number of nodes in the cluster
#   $2 - number of GPUs per node
#   $3 - GCS output path (gs://...)

# printf instead of echo: bash's echo prints "\n" literally without -e
printf '\nCLUSTER_SPEC=%s\n\n' "${CLUSTER_SPEC}"

export NUM_NODES=$1
export NUM_GPUS_PER_NODE=$2
export OUTPUT_PATH=$3

cd /workspace/code || exit 1

# Parse CLUSTER_SPEC into "master_addr,master_port,node_rank"
data=$(python parse_cluster_spec.py)
data=(${data//,/ })
master_addr=${data[0]}
master_port=${data[1]}
node_rank=${data[2]}

echo "- NUM_NODES=$NUM_NODES"
echo "- NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE"
echo "- OUTPUT_PATH=$OUTPUT_PATH"
echo "- master_addr=$master_addr"
echo "- master_port=$master_port"
echo "- node_rank=$node_rank"


python -m torch.distributed.launch \
    --nproc_per_node="$NUM_GPUS_PER_NODE" \
    --nnodes="$NUM_NODES" \
    --master_addr="$master_addr" \
    --master_port="$master_port" \
    --node_rank="$node_rank" \
    main.py --params="batch_size=512;dist_backend='nccl';data_path=/workspace/data/;output_path=/tmp/output;display_iters=False"


# Only the master node (rank 0) exports the results; quoting guards against
# an empty $node_rank if the parse step above failed.
if [ "$node_rank" == "0" ]; then
    gsutil -m cp -R /tmp/output/ "${OUTPUT_PATH}"
fi
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
import os
import json


def parse_cluster_spec(cluster_spec):
    """Extract distributed-training endpoints from an AI Platform CLUSTER_SPEC.

    Args:
        cluster_spec: dict parsed from the CLUSTER_SPEC env var, shaped like
            {"cluster": {"master": ["host:port"], ...},
             "task": {"type": "master"|"worker", "index": int}}

    Returns:
        Tuple (master_addr, master_port, rank). The master node gets rank 0;
        workers are indexed from 0 in their own pool, so their global rank is
        index + 1.
    """
    master_addr_port = cluster_spec['cluster']['master'][0].split(":")
    master_addr = master_addr_port[0]
    master_port = master_addr_port[1]

    rank = cluster_spec['task']['index']
    if cluster_spec['task']['type'] == "worker":
        # Shift worker ranks by one: the master already occupies rank 0.
        rank += 1

    return master_addr, master_port, rank


if __name__ == "__main__":
    assert "CLUSTER_SPEC" in os.environ

    addr, port, rank = parse_cluster_spec(json.loads(os.environ['CLUSTER_SPEC']))
    # entrypoint.sh splits this comma-separated line back into three fields.
    print("{},{},{}".format(addr, port, rank))
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Build the training image, push it to GCR and submit a distributed training
# job to GCP AI Platform.
#
# Usage: bash submit_job.sh <PROJECT> <REGION> <NUM_NODES> <NUM_GPUS_PER_NODE>

if [ -z "$4" ]; then
    # printf instead of echo: bash's echo prints \t and \n literally
    # without -e, while dash's echo interprets them — printf is consistent.
    printf 'Usage:\n'
    printf '\tbash submit_job.sh <PROJECT> <REGION> <NUM_NODES> <NUM_GPUS_PER_NODE>\n'
    printf '\te.g. bash submit_job.sh ignite-distrib us-east1 2 4\n'
    exit 0
fi

PROJECT=$1
REGION=$2
NUM_NODES=$3
NUM_GPUS_PER_NODE=$4

JOB_NAME="training-${NUM_NODES}n-${NUM_GPUS_PER_NODE}g-$(date +%Y%m%d-%H%M%S)"
OUTPUT_PATH="gs://output-cifar10/${JOB_NAME}"
IMAGE_URI="gcr.io/$PROJECT/ignite-distrib:latest"

accelerator_type=nvidia-tesla-k80
machine_type=n1-standard-4

printf '\nSetup and start training on %s nodes with %s total gpus\n\n' \
    "$NUM_NODES" "$(( NUM_GPUS_PER_NODE * NUM_NODES ))"

set -o errexit
set -o pipefail
set -u
set -x

echo "- Build and push docker images"

docker build -f gcp_ai_platform/Dockerfile -t "${IMAGE_URI}" .
docker push "${IMAGE_URI}"

# AI Platform job ids cannot contain dashes
job_id=${JOB_NAME//-/_}
echo "- Submit job : $job_id"

# Extra flags describing the worker pool; empty for single-node jobs.
workers_setup=""
if [ $(( NUM_NODES - 1 )) -gt 0 ]; then
    workers_setup="--worker-image-uri ${IMAGE_URI} --worker-count $(( NUM_NODES - 1 )) --worker-machine-type $machine_type --worker-accelerator count=$NUM_GPUS_PER_NODE,type=$accelerator_type"
fi


# $workers_setup is intentionally unquoted: it must word-split into
# separate gcloud flags (or into nothing when empty).
gcloud ai-platform jobs submit training "$job_id" \
    --project "$PROJECT" \
    --region "$REGION" \
    --scale-tier custom \
    --master-image-uri "${IMAGE_URI}" \
    --master-machine-type "$machine_type" \
    --master-accelerator count="$NUM_GPUS_PER_NODE",type="$accelerator_type" \
    $workers_setup -- \
    "$NUM_NODES" "$NUM_GPUS_PER_NODE" "$OUTPUT_PATH"

0 commit comments

Comments
 (0)