mlbench · Panaetius · Aug 17, 2018 · Aug 14, 2018 · Aug 14, 2018 · Aug 15, 2018
diff --git a/.gitignore b/.gitignore
@@ -108,5 +108,6 @@ ENV/
 
 # helm
 **/charts/*.tgz
+myvalues.yaml
 
 setup_telepresence.sh
diff --git a/.travis.yml b/.travis.yml
@@ -66,6 +66,8 @@ jobs:
           --set master.image.tag=travis-ci-test \
           --set worker.image.repository=${docker_url}/${docker_repository}/mlbench_worker \
           --set worker.image.tag=travis-ci-test \
+          --set worker.imagePullSecret=regcred \
+          --set master.imagePullSecret=regcred \
           --wait \
           charts/mlbench/
         - ./kubectl --context=dind get services |grep 'mlbench-master'

diff --git a/Makefile b/Makefile
@@ -83,4 +83,4 @@ publish-docker: ## Build, Tag and Publish a docker file to a local repository. U
 	docker push $(docker_registry)/mlbench_$(component):latest
 
 release: ## Install or upgrade a release with specified ${name}
-	helm upgrade --wait --recreate-pods --install ${name} charts/mlbench
+	helm upgrade --wait --recreate-pods ${args} --install ${name} charts/mlbench
diff --git a/charts/mlbench/templates/master-service-account.yaml b/charts/mlbench/templates/master-service-account.yaml
@@ -8,5 +8,7 @@ metadata:
     release: {{ .Release.Name }}
     heritage: {{ .Release.Service }}
   name: {{ template "mlbench.master.fullname" . }}
+{{if .Values.master.imagePullSecret}}
 imagePullSecrets:
-- name: {{.Values.master.imagePullSecret}}
+- name: {{.Values.master.imagePullSecret}}
+{{end}}
diff --git a/charts/mlbench/templates/worker-role.yaml b/charts/mlbench/templates/worker-role.yaml
@@ -0,0 +1,8 @@
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ template "mlbench.worker.fullname" . }}-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
diff --git a/charts/mlbench/templates/worker-rolebinding.yaml b/charts/mlbench/templates/worker-rolebinding.yaml
@@ -0,0 +1,11 @@
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: {{ template "mlbench.worker.fullname" . }}-rolebinding
+roleRef:
+  kind: Role
+  name: {{ template "mlbench.worker.fullname" . }}-role
+  apiGroup: rbac.authorization.k8s.io
+subjects:
+- kind: ServiceAccount
+  name: {{ template "mlbench.worker.fullname" . }}-sa
diff --git a/charts/mlbench/templates/worker-service-account.yaml b/charts/mlbench/templates/worker-service-account.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ template "mlbench.worker.fullname" . }}-sa
+{{if .Values.worker.imagePullSecret}}
+imagePullSecrets:
+- name: {{.Values.worker.imagePullSecret}}
+{{end}}
diff --git a/charts/mlbench/templates/worker-statefulset.yaml b/charts/mlbench/templates/worker-statefulset.yaml
@@ -24,6 +24,7 @@ spec:
         release: {{ .Release.Name }}
         role: worker
     spec:
+      serviceAccountName: {{ template "mlbench.worker.fullname" . }}-sa
       containers:
         - name: {{ template "mlbench.name" . }}-{{ .Values.worker.name }}
           image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}"

diff --git a/charts/mlbench/values.yaml b/charts/mlbench/values.yaml
@@ -5,10 +5,8 @@ master:
 
   name: master
 
-  imagePullSecret: regcred
-
   image:
-    repository: localhost:5000/mlbench_master
+    repository: mlbench/mlbench_master
     tag: latest
     pullPolicy: Always
 
@@ -23,7 +21,7 @@ worker:
   name: worker
 
   image:
-    repository: localhost:5000/mlbench_worker
+    repository: mlbench/mlbench_worker
     tag: latest
     pullPolicy: Always
 

diff --git a/compose/worker/Dockerfile b/compose/worker/Dockerfile
@@ -11,4 +11,7 @@ ADD ./mlbench/worker/ /app/
 RUN mkdir /codes
 ADD ./mlbench/refimpls/pytorch /codes
 
-ENV PYTHONPATH /codes
+ENV PYTHONPATH /codes
+
+ADD ./compose/worker/requirements.txt /requirements.txt
+RUN pip install --no-cache-dir -r /requirements.txt
diff --git a/compose/worker/requirements.txt b/compose/worker/requirements.txt
@@ -0,0 +1 @@
+kubernetes==6.0.0
diff --git a/docs/api.rst b/docs/api.rst
@@ -168,7 +168,7 @@ Metrics
 
 .. http:post:: /api/metrics
 
-   Save metrics. "pod_name" and "run_id" are mutually exclusive
+   Save metrics. "pod_name" and "run_id" are mutually exclusive. The fields of metrics and their types are defined in `mlbench/api/models/kubemetrics.py`.
 
    **Example request**:
 
@@ -184,6 +184,7 @@ Metrics
         "name": "accuracy",
         "date": "2018-08-03T09:21:44.331823Z",
         "value": "0.7845",
+        "cumulative": False,
         "metadata": "some additional data"
       }
 
@@ -200,6 +201,7 @@ Metrics
         "name": "accuracy",
         "date": "2018-08-03T09:21:44.331823Z",
         "value": "0.7845",
+        "cumulative": False,
         "metadata": "some additional data"
       }
 

diff --git a/mlbench/master/api/serializers.py b/mlbench/master/api/serializers.py
@@ -1,5 +1,5 @@
 from rest_framework import serializers
-from api.models import KubePod, ModelRun
+from api.models import KubePod, ModelRun, KubeMetric
 
 
 class KubePodSerializer(serializers.HyperlinkedModelSerializer):
@@ -18,3 +18,10 @@ class Meta:
             'state',
             'job_id',
             'job_metadata']
+
+
+class KubeMetricsSerializer(serializers.HyperlinkedModelSerializer):
+    # TODO:
+    class Meta:
+        model = KubeMetric
+        fields = ['name', 'value']
diff --git a/mlbench/master/api/views.py b/mlbench/master/api/views.py
@@ -2,7 +2,7 @@
 from rest_framework.response import Response
 from rest_framework import status
 from api.models import KubePod, KubeMetric, ModelRun
-from api.serializers import KubePodSerializer, ModelRunSerializer
+from api.serializers import KubePodSerializer, ModelRunSerializer, KubeMetricsSerializer
 import django_rq
 from rq.job import Job
 
@@ -93,18 +93,18 @@ def retrieve(self, request, pk=None, format=None):
             metrics = run.metrics.all()
 
         result = {
-                g[0]: [
-                    {
-                        'date': e.date,
-                        'value': e.value,
-                        'cumulative': e.cumulative
-                    }
-                    for e in sorted(g[1], key=lambda x: x.date)
-                    if since is None or e.date > since
-                ] for g in groupby(
-                    sorted(metrics, key=lambda m: m.name),
-                    key=lambda m: m.name)
-            }
+            g[0]: [
+                {
+                    'date': e.date,
+                    'value': e.value,
+                    'cumulative': e.cumulative
+                }
+                for e in sorted(g[1], key=lambda x: x.date)
+                if since is None or e.date > since
+            ] for g in groupby(
+                sorted(metrics, key=lambda m: m.name),
+                key=lambda m: m.name)
+        }
 
         return Response(result, status=status.HTTP_200_OK)
 
@@ -140,6 +140,10 @@ def create(self, request):
                 pod=pod)
             metric.save()
 
+            return Response(
+                metric, status=status.HTTP_201_CREATED
+            )
+
         elif 'run_id' in d:
             run = ModelRun.objects.get(pk=d['run_id'])
 
@@ -158,16 +162,19 @@ def create(self, request):
                 model_run=run)
             metric.save()
 
+            serializer = KubeMetricsSerializer(metric, many=False)
+
+            return Response(
+                serializer.data, status=status.HTTP_201_CREATED
+            )
+
         else:
             return Response({
                 'status': 'Bad Request',
-                'message': 'Pod Name or run id have to be supplied'
+                'message': 'Pod Name or run id have to be supplied',
+                'data': d,
             }, status=status.HTTP_400_BAD_REQUEST)
 
-        return Response(
-            metric, status=status.HTTP_201_CREATED
-        )
-
 
 class ModelRunView(ViewSet):
     """Handles Model Runs

diff --git a/mlbench/refimpls/pytorch/controlflow/base.py b/mlbench/refimpls/pytorch/controlflow/base.py
@@ -1,6 +1,7 @@
 import time
 import torch
 import torch.distributed as dist
+import datetime
 
 from utils import log
 from utils import checkpoint
@@ -31,6 +32,19 @@ def train_epoch(model, optimizer, criterion, context):
         optimizer.step()
 
         log.debug("Training Batch {:5}: loss={:.3f}".format(batch_idx, loss.item()))
+
+        log.post_metrics({
+            "run_id": context.meta.run_id,
+            "name": "train loss @ rank{}".format(context.meta.rank),
+            "value": loss.item(),
+            "date": str(datetime.datetime.now()),
+            "cumulative": False,
+            "metadata":
+            "Training loss at rank {}, epoch {} and batch {}".format(
+                context.meta.rank, context.runtime.current_epoch,
+                batch_idx
+            )
+        }, context.meta.rank)
         if context.meta.debug and batch_idx >= 10:
             break
 
@@ -106,6 +120,18 @@ def do_validate(model, optimizer, criterion, metrics, context):
             'best_prec1': context.runtime.best_prec1,
         }, is_best, context)
 
+    log.post_metrics({
+        "run_id": context.meta.run_id,
+        "name": "Prec@1",
+        "value": "{:.3f}".format(val_prec1),
+        "date": str(datetime.datetime.now()),
+        "cumulative": False,
+        "metadata":
+        "Validation Prec1 at epoch {}".format(
+            context.runtime.current_epoch
+        )
+    }, context.meta.rank)
+
 
 class TrainValidation(object):
     def __call__(self, model, optimizer, criterion, metrics, context):

diff --git a/mlbench/refimpls/pytorch/utils/log.py b/mlbench/refimpls/pytorch/utils/log.py
@@ -3,6 +3,9 @@
 """
 import logging
 import torch.distributed as dist
+import json
+import time
+import subprocess
 
 logger = logging.getLogger('mlbench')
 
@@ -31,17 +34,94 @@ def debug(content, who='all'):
 
 def warning(content, who='all'):
     if who == 'all' or who == dist.get_rank():
-        logger.warning("\033[0;31m{}\033[0m".format(content))
+        logger.warning("{}".format(content))
 
 
 def critical(content, who='all'):
     if who == 'all' or who == dist.get_rank():
-        logger.critical("\033[0;104m{}\033[0m".format(content))
+        logger.critical("{}".format(content))
+
+
+class AsyncMetricsPost(object):
+    """Post metrics payload to endpoint in an asynchronized way."""
+
+    def __init__(self):
+        self._initialized = False
+
+    def init(self):
+        from kubernetes import config, client
+        config.load_incluster_config()
+        configuration = client.Configuration()
+
+        class MyApiClient(client.ApiClient):
+            """
+            A bug introduced by a fix.
+
+            https://github.com/kubernetes-client/python/issues/411
+            https://github.com/swagger-api/swagger-codegen/issues/6392
+            """
+
+            def __del__(self):
+                pass
+
+        self.api_instance = client.CoreV1Api(MyApiClient(configuration))
+
+        # TODO: remove hardcoded part in the future.
+        self.namespace = 'default'
+        label_selector = 'component=master,app=mlbench'
+
+        try:
+            api_response = self.api_instance.list_namespaced_pod(
+                self.namespace, label_selector=label_selector)
+        except Exception as e:
+            print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
+
+        assert len(api_response.items) == 1
+        master_pod = api_response.items[0]
+        ip = master_pod.status.pod_ip
+        self.endpoint = "http://{ip}/api/metrics/".format(ip=ip)
+        self._initialized = True
+
+    def post(self, payload):
+        """Post information via kubernetes client.
+
+        Example:
+
+            payload = {
+                "run_id": "1",
+                "name": "accuracy",
+                "cumulative": False,
+                "date": "2018-08-14T09:21:44.331823Z",
+                "value": "1.0",
+                "metadata": "some additional data"
+            }
+
+        See `KubeMetric` in kubemetric.py for the fields and types.
+
+        """
+        if not self._initialized:
+            self.init()
+
+        command = [
+            "/usr/bin/curl",
+            "-d", json.dumps(payload),
+            "-H", "Content-Type: application/json",
+            "-X", "POST", self.endpoint
+        ]
+        subprocess.Popen(command)
+
+
+async_post = AsyncMetricsPost()
+
+
+def post_metrics(payload, rank):
+    if rank == 0:
+        async_post.post(payload)
 
 
 def todo(content, who='all'):
     if who == 'all' or who == dist.get_rank():
-        logger.warning("\033[0;33m{}\033[0m".format(content))
+        logger.warning("{}".format(content))
 
 
 def configuration_information(context):