Skip to content
This repository was archived by the owner on May 4, 2020. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,6 @@ ENV/

# helm
**/charts/*.tgz
myvalues.yaml

setup_telepresence.sh
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ jobs:
--set master.image.tag=travis-ci-test \
--set worker.image.repository=${docker_url}/${docker_repository}/mlbench_worker \
--set worker.image.tag=travis-ci-test \
--set worker.imagePullSecret=regcred \
--set master.imagePullSecret=regcred \
--wait \
charts/mlbench/
- ./kubectl --context=dind get services |grep 'mlbench-master'
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ publish-docker: ## Build, Tag and Publish a docker file to a local repository. U
docker push $(docker_registry)/mlbench_$(component):latest

release: ## Install or upgrade a release with specified ${name}
helm upgrade --wait --recreate-pods --install ${name} charts/mlbench
helm upgrade --wait --recreate-pods ${args} --install ${name} charts/mlbench
4 changes: 3 additions & 1 deletion charts/mlbench/templates/master-service-account.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ metadata:
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
name: {{ template "mlbench.master.fullname" . }}
{{if .Values.master.imagePullSecret}}
imagePullSecrets:
- name: {{.Values.master.imagePullSecret}}
- name: {{.Values.master.imagePullSecret}}
{{end}}
8 changes: 8 additions & 0 deletions charts/mlbench/templates/worker-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ template "mlbench.worker.fullname" . }}-role
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
11 changes: 11 additions & 0 deletions charts/mlbench/templates/worker-rolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ template "mlbench.worker.fullname" . }}-rolebinding
roleRef:
kind: Role
name: {{ template "mlbench.worker.fullname" . }}-role
apiGroup: rbac.authorization.k8s.io
subjects:
- kind: ServiceAccount
name: {{ template "mlbench.worker.fullname" . }}-sa
8 changes: 8 additions & 0 deletions charts/mlbench/templates/worker-service-account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "mlbench.worker.fullname" . }}-sa
{{if .Values.worker.imagePullSecret}}
imagePullSecrets:
- name: {{.Values.worker.imagePullSecret}}
{{end}}
1 change: 1 addition & 0 deletions charts/mlbench/templates/worker-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
release: {{ .Release.Name }}
role: worker
spec:
serviceAccountName: {{ template "mlbench.worker.fullname" . }}-sa
containers:
- name: {{ template "mlbench.name" . }}-{{ .Values.worker.name }}
image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}"
Expand Down
6 changes: 2 additions & 4 deletions charts/mlbench/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@ master:

name: master

imagePullSecret: regcred

image:
repository: localhost:5000/mlbench_master
repository: mlbench/mlbench_master
tag: latest
pullPolicy: Always

Expand All @@ -23,7 +21,7 @@ worker:
name: worker

image:
repository: localhost:5000/mlbench_worker
repository: mlbench/mlbench_worker
tag: latest
pullPolicy: Always

Expand Down
5 changes: 4 additions & 1 deletion compose/worker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ ADD ./mlbench/worker/ /app/
RUN mkdir /codes
ADD ./mlbench/refimpls/pytorch /codes

ENV PYTHONPATH /codes
ENV PYTHONPATH /codes

ADD ./compose/worker/requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt
1 change: 1 addition & 0 deletions compose/worker/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
kubernetes==6.0.0
4 changes: 3 additions & 1 deletion docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ Metrics

.. http:post:: /api/metrics

Save metrics. "pod_name" and "run_id" are mutually exclusive
Save metrics. "pod_name" and "run_id" are mutually exclusive. The fields of metrics and their types are defined in `mlbench/api/models/kubemetrics.py`.

**Example request**:

Expand All @@ -184,6 +184,7 @@ Metrics
"name": "accuracy",
"date": "2018-08-03T09:21:44.331823Z",
"value": "0.7845",
"cumulative": False,
"metadata": "some additional data"
}

Expand All @@ -200,6 +201,7 @@ Metrics
"name": "accuracy",
"date": "2018-08-03T09:21:44.331823Z",
"value": "0.7845",
"cumulative": False,
"metadata": "some additional data"
}

Expand Down
9 changes: 8 additions & 1 deletion mlbench/master/api/serializers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from rest_framework import serializers
from api.models import KubePod, ModelRun
from api.models import KubePod, ModelRun, KubeMetric


class KubePodSerializer(serializers.HyperlinkedModelSerializer):
Expand All @@ -18,3 +18,10 @@ class Meta:
'state',
'job_id',
'job_metadata']


class KubeMetricsSerializer(serializers.HyperlinkedModelSerializer):
# TODO:
class Meta:
model = KubeMetric
fields = ['name', 'value']
43 changes: 25 additions & 18 deletions mlbench/master/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from rest_framework.response import Response
from rest_framework import status
from api.models import KubePod, KubeMetric, ModelRun
from api.serializers import KubePodSerializer, ModelRunSerializer
from api.serializers import KubePodSerializer, ModelRunSerializer, KubeMetricsSerializer
import django_rq
from rq.job import Job

Expand Down Expand Up @@ -93,18 +93,18 @@ def retrieve(self, request, pk=None, format=None):
metrics = run.metrics.all()

result = {
g[0]: [
{
'date': e.date,
'value': e.value,
'cumulative': e.cumulative
}
for e in sorted(g[1], key=lambda x: x.date)
if since is None or e.date > since
] for g in groupby(
sorted(metrics, key=lambda m: m.name),
key=lambda m: m.name)
}
g[0]: [
{
'date': e.date,
'value': e.value,
'cumulative': e.cumulative
}
for e in sorted(g[1], key=lambda x: x.date)
if since is None or e.date > since
] for g in groupby(
sorted(metrics, key=lambda m: m.name),
key=lambda m: m.name)
}

return Response(result, status=status.HTTP_200_OK)

Expand Down Expand Up @@ -140,6 +140,10 @@ def create(self, request):
pod=pod)
metric.save()

return Response(
metric, status=status.HTTP_201_CREATED
)

elif 'run_id' in d:
run = ModelRun.objects.get(pk=d['run_id'])

Expand All @@ -158,16 +162,19 @@ def create(self, request):
model_run=run)
metric.save()

serializer = KubeMetricsSerializer(metric, many=False)

return Response(
serializer.data, status=status.HTTP_201_CREATED
)

else:
return Response({
'status': 'Bad Request',
'message': 'Pod Name or run id have to be supplied'
'message': 'Pod Name or run id have to be supplied',
'data': d,
}, status=status.HTTP_400_BAD_REQUEST)

return Response(
metric, status=status.HTTP_201_CREATED
)


class ModelRunView(ViewSet):
"""Handles Model Runs
Expand Down
26 changes: 26 additions & 0 deletions mlbench/refimpls/pytorch/controlflow/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time
import torch
import torch.distributed as dist
import datetime

from utils import log
from utils import checkpoint
Expand Down Expand Up @@ -31,6 +32,19 @@ def train_epoch(model, optimizer, criterion, context):
optimizer.step()

log.debug("Training Batch {:5}: loss={:.3f}".format(batch_idx, loss.item()))

log.post_metrics({
"run_id": context.meta.run_id,
"name": "train loss @ rank{}".format(context.meta.rank),
"value": loss.item(),
"date": str(datetime.datetime.now()),
"cumulative": False,
"metadata":
"Training loss at rank {}, epoch {} and batch {}".format(
context.meta.rank, context.runtime.current_epoch,
batch_idx
)
}, context.meta.rank)
if context.meta.debug and batch_idx >= 10:
break

Expand Down Expand Up @@ -106,6 +120,18 @@ def do_validate(model, optimizer, criterion, metrics, context):
'best_prec1': context.runtime.best_prec1,
}, is_best, context)

log.post_metrics({
"run_id": context.meta.run_id,
"name": "Prec@1",
"value": "{:.3f}".format(val_prec1),
"date": str(datetime.datetime.now()),
"cumulative": False,
"metadata":
"Validation Prec1 at epoch {}".format(
context.runtime.current_epoch
)
}, context.meta.rank)


class TrainValidation(object):
def __call__(self, model, optimizer, criterion, metrics, context):
Expand Down
86 changes: 83 additions & 3 deletions mlbench/refimpls/pytorch/utils/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
"""
import logging
import torch.distributed as dist
import json
import time
import subprocess

logger = logging.getLogger('mlbench')

Expand Down Expand Up @@ -31,17 +34,94 @@ def debug(content, who='all'):

def warning(content, who='all'):
if who == 'all' or who == dist.get_rank():
logger.warning("\033[0;31m{}\033[0m".format(content))
logger.warning("{}".format(content))


def critical(content, who='all'):
if who == 'all' or who == dist.get_rank():
logger.critical("\033[0;104m{}\033[0m".format(content))
logger.critical("{}".format(content))


class AsyncMetricsPost(object):
"""Post metrics payload to endpoint in an asynchronized way."""

def __init__(self):
self._initialized = False

def init(self):
from kubernetes import config, client
config.load_incluster_config()
configuration = client.Configuration()

class MyApiClient(client.ApiClient):
"""
A bug introduced by a fix.

https://github.com/kubernetes-client/python/issues/411
https://github.com/swagger-api/swagger-codegen/issues/6392
"""

def __del__(self):
pass

self.api_instance = client.CoreV1Api(MyApiClient(configuration))

# TODO: remove hardcoded part in the future.
self.namespace = 'default'
label_selector = 'component=master,app=mlbench'

try:
api_response = self.api_instance.list_namespaced_pod(
self.namespace, label_selector=label_selector)
except Exception as e:
print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)

assert len(api_response.items) == 1
master_pod = api_response.items[0]
ip = master_pod.status.pod_ip
self.endpoint = "http://{ip}/api/metrics/".format(ip=ip)
self._initialized = True

def post(self, payload):
"""Post information via kubernetes client.

Example:

payload = {
"run_id": "1",
"name": "accuracy",
"cumulative": False,
"date": "2018-08-14T09:21:44.331823Z",
"value": "1.0",
"metadata": "some additional data"
}

See `KubeMetric` in kubemetric.py for the fields and types.

"""
if not self._initialized:
self.init()

command = [
"/usr/bin/curl",
"-d", json.dumps(payload),
"-H", "Content-Type: application/json",
"-X", "POST", self.endpoint
]
subprocess.Popen(command)


async_post = AsyncMetricsPost()


def post_metrics(payload, rank):
if rank == 0:
async_post.post(payload)


def todo(content, who='all'):
if who == 'all' or who == dist.get_rank():
logger.warning("\033[0;33m{}\033[0m".format(content))
logger.warning("{}".format(content))


def configuration_information(context):
Expand Down