Skip to content

Commit acf00c4

Browse files
committed
Added gcp ai platform scripts to run trainings
1 parent 316480b commit acf00c4

5 files changed

Lines changed: 184 additions & 0 deletions

File tree

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
# Training image for distributed CIFAR10 runs on GCP AI Platform.
# Based on the upstream PyTorch image; bakes in the dataset, the training
# code, and the Cloud SDK (gsutil) used to export results to GCS.
FROM pytorch/pytorch:latest

WORKDIR /workspace

# Setup dependencies (ignite from the "distrib" branch for distributed helpers)
RUN pip install tqdm tensorboardX git+https://github.com/pytorch/ignite.git@distrib

# Setup dataset: pre-download CIFAR10 so training nodes do not hit the network
RUN mkdir /workspace/data && python -c 'from torchvision.datasets.cifar import CIFAR10; CIFAR10("/workspace/data", download=True)'

# Copy training code and cluster helper scripts
RUN mkdir /workspace/code
COPY fastresnet.py /workspace/code/fastresnet.py
COPY main.py /workspace/code/main.py
COPY utils.py /workspace/code/utils.py
COPY gcp_ai_platform/entrypoint.sh /workspace/code/entrypoint.sh
COPY gcp_ai_platform/parse_cluster_spec.py /workspace/code/parse_cluster_spec.py

# Installs google cloud sdk, this is mostly for using gsutil to export model.
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz -o google-cloud-sdk.tar.gz && \
    mkdir /root/tools && \
    tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
    rm google-cloud-sdk.tar.gz && \
    /root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
        --path-update=false --bash-completion=false \
        --disable-installation-options && \
    rm -rf /root/.config/* && \
    ln -s /root/.config /config && \
    # Remove the backup directory that gcloud creates
    rm -rf /root/tools/google-cloud-sdk/.install/.backup

# Path configuration (ENV key=value is the preferred, unambiguous form)
ENV PATH=$PATH:/root/tools/google-cloud-sdk/bin

# Make sure gsutil will use the default service account.
# printf is used instead of echo so the \n escape is interpreted regardless
# of which shell backs RUN (bash's echo would print a literal "\n").
RUN printf '[GoogleCompute]\nservice_account = default\n' > /etc/boto.cfg

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["/bin/bash", "/workspace/code/entrypoint.sh"]
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# Distributed training on CIFAR10 with GCP AI Platform

Helper scripts to run distributed training on [GCP AI Platform](https://cloud.google.com/ml-engine/docs/).

To use the scripts, the user needs to have:
- an account on GCP with AI Platform enabled, see [here for details](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-keras#set_up_your_project)
- `gcloud` installed and properly configured
- `docker`

## Setup

- Create the output bucket
```
gsutil mb -p your-project -l region gs://output-cifar10/
# e.g. gsutil mb -p ignite-distrib -l us-east1 gs://output-cifar10/
```

- Configure local docker to push to GCR

```
gcloud auth configure-docker
```

## Start training

By default, we use `n1-standard-4` machines and `nvidia-tesla-k80` accelerators for the training. For other configurations, please edit `submit_job.sh`.
To start training, simply execute:
```
sh gcp_ai_platform/submit_job.sh your-project region num_nodes num_gpus_per_node
# e.g. sh gcp_ai_platform/submit_job.sh ignite-distrib us-east1 2 4
```

### Logs visualization - Tensorboard

By default, AI Platform provides streamed logs in the web interface, see the docs [here](https://cloud.google.com/ml-engine/docs/monitor-training#checking_job_status).
In addition, the user can set up [Tensorboard locally or from Cloud Shell](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#tensorboard-local):
```
# in cloud shell
tensorboard --logdir=gs://output-cifar10
```
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Container entry point for a single AI Platform node.
# Parses CLUSTER_SPEC (set by AI Platform), launches distributed training via
# torch.distributed.launch and, on the master node only, copies the output to
# the GCS bucket.
#
# Arguments (forwarded by submit_job.sh after "--"):
#   $1 - number of nodes in the cluster
#   $2 - number of GPUs per node
#   $3 - GCS output path (gs://...)

# printf instead of echo: bash's echo prints "\n" literally without -e
printf '\nCLUSTER_SPEC=%s\n\n' "${CLUSTER_SPEC}"

export NUM_NODES=$1
export NUM_GPUS_PER_NODE=$2
export OUTPUT_PATH=$3

cd /workspace/code || exit 1

# Parse CLUSTER_SPEC into "master_addr,master_port,node_rank"
data=$(python parse_cluster_spec.py)
data=(${data//,/ })
master_addr=${data[0]}
master_port=${data[1]}
node_rank=${data[2]}

echo "- NUM_NODES=$NUM_NODES"
echo "- NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE"
echo "- OUTPUT_PATH=$OUTPUT_PATH"
echo "- master_addr=$master_addr"
echo "- master_port=$master_port"
echo "- node_rank=$node_rank"


python -m torch.distributed.launch \
    --nproc_per_node="$NUM_GPUS_PER_NODE" \
    --nnodes="$NUM_NODES" \
    --master_addr="$master_addr" \
    --master_port="$master_port" \
    --node_rank="$node_rank" \
    main.py --params="batch_size=512;dist_backend='nccl';data_path=/workspace/data/;output_path=/tmp/output;display_iters=False"


# Only the master node (rank 0) exports the results; quoting guards against
# an empty $node_rank if the parse step above failed.
if [ "$node_rank" == "0" ]; then
    gsutil -m cp -R /tmp/output/ "${OUTPUT_PATH}"
fi
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
import os
import json


def parse_cluster_spec(cluster_spec):
    """Extract distributed-training endpoints from an AI Platform CLUSTER_SPEC.

    Args:
        cluster_spec: dict parsed from the CLUSTER_SPEC env var, shaped like
            {"cluster": {"master": ["host:port"], ...},
             "task": {"type": "master"|"worker", "index": int}}

    Returns:
        Tuple (master_addr, master_port, rank). The master node gets rank 0;
        workers are indexed from 0 in their own pool, so their global rank is
        index + 1.
    """
    master_addr_port = cluster_spec['cluster']['master'][0].split(":")
    master_addr = master_addr_port[0]
    master_port = master_addr_port[1]

    rank = cluster_spec['task']['index']
    if cluster_spec['task']['type'] == "worker":
        # Shift worker ranks by one: the master already occupies rank 0.
        rank += 1

    return master_addr, master_port, rank


if __name__ == "__main__":
    assert "CLUSTER_SPEC" in os.environ

    addr, port, rank = parse_cluster_spec(json.loads(os.environ['CLUSTER_SPEC']))
    # entrypoint.sh splits this comma-separated line back into three fields.
    print("{},{},{}".format(addr, port, rank))
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Build the training image, push it to GCR and submit a distributed training
# job to GCP AI Platform.
#
# Usage: bash submit_job.sh <PROJECT> <REGION> <NUM_NODES> <NUM_GPUS_PER_NODE>

if [ -z "$4" ]; then
    # printf instead of echo: bash's echo prints \t and \n literally
    # without -e, while dash's echo interprets them — printf is consistent.
    printf 'Usage:\n'
    printf '\tbash submit_job.sh <PROJECT> <REGION> <NUM_NODES> <NUM_GPUS_PER_NODE>\n'
    printf '\te.g. bash submit_job.sh ignite-distrib us-east1 2 4\n'
    exit 0
fi

PROJECT=$1
REGION=$2
NUM_NODES=$3
NUM_GPUS_PER_NODE=$4

JOB_NAME="training-${NUM_NODES}n-${NUM_GPUS_PER_NODE}g-$(date +%Y%m%d-%H%M%S)"
OUTPUT_PATH="gs://output-cifar10/${JOB_NAME}"
IMAGE_URI="gcr.io/$PROJECT/ignite-distrib:latest"

accelerator_type=nvidia-tesla-k80
machine_type=n1-standard-4

printf '\nSetup and start training on %s nodes with %s total gpus\n\n' \
    "$NUM_NODES" "$(( NUM_GPUS_PER_NODE * NUM_NODES ))"

set -o errexit
set -o pipefail
set -u
set -x

echo "- Build and push docker images"

docker build -f gcp_ai_platform/Dockerfile -t "${IMAGE_URI}" .
docker push "${IMAGE_URI}"

# AI Platform job ids cannot contain dashes
job_id=${JOB_NAME//-/_}
echo "- Submit job : $job_id"

# Extra flags describing the worker pool; empty for single-node jobs.
workers_setup=""
if [ $(( NUM_NODES - 1 )) -gt 0 ]; then
    workers_setup="--worker-image-uri ${IMAGE_URI} --worker-count $(( NUM_NODES - 1 )) --worker-machine-type $machine_type --worker-accelerator count=$NUM_GPUS_PER_NODE,type=$accelerator_type"
fi


# $workers_setup is intentionally unquoted: it must word-split into
# separate gcloud flags (or into nothing when empty).
gcloud ai-platform jobs submit training "$job_id" \
    --project "$PROJECT" \
    --region "$REGION" \
    --scale-tier custom \
    --master-image-uri "${IMAGE_URI}" \
    --master-machine-type "$machine_type" \
    --master-accelerator count="$NUM_GPUS_PER_NODE",type="$accelerator_type" \
    $workers_setup -- \
    "$NUM_NODES" "$NUM_GPUS_PER_NODE" "$OUTPUT_PATH"

0 commit comments

Comments
 (0)