
Commit 2036075

Distrib (#573)
* [WIP] Added cifar10 distributed example
* [WIP] Metric with all reduce decorator and tests
* [WIP] Added tests for accumulation metric
* [WIP] Updated with reinit_is_reduced
* [WIP] Distrib adaptation for other metrics
* [WIP] Warnings for EpochMetric and Precision/Recall when distrib
* Updated metrics and tests to run on distributed configuration
  - Tested on 2 GPUs, single node
  - Added cmd in .travis.yml to indicate how to test locally
  - Updated travis to run tests in 4 processes
* Minor fixes and cosmetics
* Fixed bugs and improved contrib/cifar10 example
* Updated docs
* Fixes issue #543 (#572): the previous confusion-matrix implementation failed when the target
  contained non-contiguous indices. The new implementation is adapted from torchvision's
  https://github.com/pytorch/vision/blob/master/references/segmentation/utils.py#L75-L117.
  This commit also removes the case of targets shaped (batch_size, num_categories, ...) where
  num_categories excludes the background class. Confusion-matrix computation is possible almost
  similarly for (batch_size, ...), but when the target is all zeros (0, ..., 0), i.e. no classes except
  the background class, the confusion matrix does not count any true/false predictions.
* Update confusion_matrix.py
* Update metrics.rst
* Updated docs and set device as "cuda" in distributed instead of raising an error
* [WIP] Fix missing _is_reduced in precision/recall, with tests
* Updated other tests
* Added mlflow logger (#558)
  - Added mlflow logger without tests
  - Added mlflow tests, updated mlflow logger code and other tests
  - Updated docs and added mlflow in travis
  - Added tests for mlflow OptimizerParamsHandler; additionally added OptimizerParamsHandler for plx, with tests
* Update to PyTorch v1.2.0 (#580)
  - Update .travis.yml
  - Fixed tests and improved travis
* Fix SSL problem of failing travis (#581)
  - Fixes SSL problem to download model weights
  - Fixed travis for deploy and nightly
* Fixes #583 (#584)
* Fixes docs build warnings (#585)
* Return removable handle from Engine.add_event_handler() (#588)
  - Add feature tests for engine.add_event_handler returning removable event handles
  - Return RemovableEventHandle from Engine.add_event_handler
  - Fix up removable event handle test in Python 2.7: explicitly trigger gc, allowing cycle detection
    between engine and state, in the removable-handle weakref test (Python 2.7 cycle detection
    appears to be less aggressive than Python 3+)
  - Add removable event handler docs: autodoc configuration for RemovableEventHandle; expand the
    "concepts" documentation with an event-remove example following the event-add example
  - Update concepts.rst
* Updated travis and renamed tbptt test gpu -> cuda
1 parent 4d13db2 commit 2036075

56 files changed: 3,185 additions & 505 deletions


.travis.yml

Lines changed: 18 additions & 9 deletions
@@ -5,8 +5,8 @@ python:
   - "3.6"
 
 env:
-  - PYTORCH_PACKAGE=pytorch-cpu
-  - PYTORCH_PACKAGE=pytorch-nightly-cpu
+  - PYTORCH_CHANNEL=pytorch
+  - PYTORCH_CHANNEL=pytorch-nightly
 
 stages:
   - Lint check
@@ -25,25 +25,27 @@ before_install: &before_install
   - conda update -q conda
   # Useful for debugging any issues with conda
   - conda info -a
-  - conda create -q -n test-environment -c pytorch python=$TRAVIS_PYTHON_VERSION $PYTORCH_PACKAGE
+  - conda create -q -n test-environment pytorch cpuonly torchvision python=$TRAVIS_PYTHON_VERSION -c $PYTORCH_CHANNEL
   - source activate test-environment
   - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install enum34; fi
   # Test contrib dependencies
-  - pip install tqdm scikit-learn tensorboardX visdom polyaxon-client
+  - pip install tqdm scikit-learn tensorboardX visdom polyaxon-client mlflow
   # Futures should be already installed via visdom -> tornado -> futures
   # Let's reinstall it anyway to be sure
   - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install futures; fi
 
 install:
   - python setup.py install
-  - pip install numpy mock pytest codecov pytest-cov
+  - pip install numpy mock pytest codecov pytest-cov pytest-xdist
   # Examples dependencies
   - pip install matplotlib pandas
-  - conda install torchvision-cpu -c pytorch
   - pip install gym==0.10.11
 
 script:
-  - py.test --cov ignite --cov-report term-missing
+  - CUDA_VISIBLE_DEVICES="" py.test --tx 4*popen//python=python$TRAVIS_PYTHON_VERSION --cov ignite --cov-report term-missing -vvv tests/
+  # Run test on cuda device
+  # As no GPUs on travis -> all tests will be skipped
+  - CUDA_VISIBLE_DEVICES=0 py.test --cov ignite --cov-append --cov-report term-missing -vvv tests/ -k "on_cuda"
 
   # Smoke tests for the examples
   # Mnist
@@ -69,8 +71,15 @@ script:
 
   #fast-neural-style
   #train
+  - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then mkdir -p /home/travis/.cache/torch/checkpoints/ && wget "https://download.pytorch.org/models/vgg16-397923af.pth" -O/home/travis/.cache/torch/checkpoints/vgg16-397923af.pth; fi
   - python examples/fast_neural_style/neural_style.py train --epochs 1 --cuda 0 --dataset test --dataroot . --image_size 32 --style_image examples/fast_neural_style/images/style_images/mosaic.jpg --style_size 32
 
+  # tests for distributed ops
+  # As no GPUs on travis -> all tests will be skipped
+  # 2 is the number of processes <-> number of available GPUs
+  - export WORLD_SIZE=2
+  - py.test --cov ignite --cov-append --cov-report term-missing --dist=each --tx $WORLD_SIZE*popen//python=python$TRAVIS_PYTHON_VERSION tests -m distributed -vvv
+
 after_success:
   - codecov
 
@@ -114,7 +123,7 @@ jobs:
   - stage: Deploy
     python: "3.6"
     env:
-      - PYTORCH_PACKAGE=pytorch-cpu
+      - PYTORCH_CHANNEL=pytorch
     if: tag IS present
 
     # Use previously defined before_install
@@ -168,7 +177,7 @@ jobs:
   - stage: Nightly
     python: "3.6"
     env:
-      - PYTORCH_PACKAGE=pytorch-nightly-cpu
+      - PYTORCH_CHANNEL=pytorch-nightly
     if: branch = nightly
     # Use previously defined before_install
     before_install: *before_install
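The distributed stanza added above drives pytest-xdist: `--dist=each --tx N*popen` starts N local worker processes, one per simulated rank. As a sketch of how to reproduce this locally (assuming a 2-GPU machine with `pytest-xdist` installed; paths and markers taken from the diff above, the Python version is illustrative):

```shell
# Sketch: run the distributed metric tests locally (assumes 2 GPUs and pytest-xdist).
# WORLD_SIZE <-> number of processes <-> number of available GPUs.
export WORLD_SIZE=2
py.test --dist=each --tx $WORLD_SIZE*popen//python=python3.6 tests -m distributed -vvv
```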

docs/source/concepts.rst

Lines changed: 27 additions & 0 deletions
@@ -88,6 +88,33 @@ Attaching an event handler is simple using method :meth:`~ignite.engine.Engine.a
 
     trainer.add_event_handler(Events.COMPLETED, on_training_ended, mydata)
 
+Event handlers can be detached via :meth:`~ignite.engine.Engine.remove_event_handler` or via the :class:`~ignite.engine.RemovableEventHandler`
+reference returned by :meth:`~ignite.engine.Engine.add_event_handler`. This can be used to reuse a configured engine for multiple loops:
+
+.. code-block:: python
+
+    model = ...
+    train_loader, validation_loader, test_loader = ...
+
+    trainer = create_supervised_trainer(model, optimizer, loss)
+    evaluator = create_supervised_evaluator(model, metrics={'acc': Accuracy()})
+
+    def log_metrics(engine, title):
+        print("Epoch: {} - {} accuracy: {:.2f}"
+              .format(trainer.state.epoch, title, engine.state.metrics['acc']))
+
+    @trainer.on(Events.EPOCH_COMPLETED)
+    def evaluate(trainer):
+        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "train"):
+            evaluator.run(train_loader)
+
+        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "validation"):
+            evaluator.run(validation_loader)
+
+        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "test"):
+            evaluator.run(test_loader)
+
+    trainer.run(train_loader, max_epochs=100)
 
 .. Note ::
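The documentation added above relies on `add_event_handler` returning an object that can both remove the handler and act as a context manager. The mechanism can be sketched in plain Python, independent of ignite (this is a toy illustration, not ignite's actual implementation; `MiniEngine` and `RemovableHandle` are invented names):

```python
class RemovableHandle:
    """Sketch of a removable handler reference: remove() detaches the
    handler, and using it as a context manager detaches on exit."""

    def __init__(self, engine, event, handler):
        self._engine = engine
        self._event = event
        self._handler = handler

    def remove(self):
        self._engine.handlers[self._event].remove(self._handler)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.remove()


class MiniEngine:
    """Toy engine: registers handlers per event and can fire them."""

    def __init__(self):
        self.handlers = {}

    def add_event_handler(self, event, handler):
        self.handlers.setdefault(event, []).append(handler)
        return RemovableHandle(self, event, handler)

    def fire(self, event):
        for h in list(self.handlers.get(event, [])):
            h()


engine = MiniEngine()
calls = []

with engine.add_event_handler("completed", lambda: calls.append("train")):
    engine.fire("completed")   # handler attached -> records "train"

engine.fire("completed")       # handler was removed on exit -> records nothing
print(calls)                   # ['train']
```

The `with` form mirrors the evaluator example in the diff: the same engine is reused for several loops, each with its own temporary logging handler.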

docs/source/contrib/handlers.rst

Lines changed: 16 additions & 0 deletions
@@ -23,13 +23,29 @@ tensorboard_logger
     :members:
     :inherited-members:
 
+See `tensorboardX mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_tensorboard_logger.py>`_
+and `CycleGAN and EfficientNet notebooks <https://github.com/pytorch/ignite/tree/master/examples/notebooks>`_ for detailed usage.
+
+
 visdom_logger
 -------------
 
 .. automodule:: ignite.contrib.handlers.visdom_logger
     :members:
     :inherited-members:
 
+See `visdom mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_visdom_logger.py>`_
+for detailed usage.
+
+
+mlflow_logger
+-------------
+
+.. automodule:: ignite.contrib.handlers.mlflow_logger
+    :members:
+    :inherited-members:
+
+
 tqdm_logger
 -----------
 

docs/source/engine.rst

Lines changed: 4 additions & 0 deletions
@@ -14,3 +14,7 @@ ignite.engine
     :undoc-members:
 
 .. autoclass:: State
+
+.. autoclass:: RemovableEventHandler
+    :members:
+    :undoc-members:

docs/source/examples.rst

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@ to display how it helps to write compact and full-featured training loops in a f
 - `DCGAN <https://github.com/pytorch/ignite/tree/master/examples/gan>`_
 - `Reinforcement Learning <https://github.com/pytorch/ignite/tree/master/examples/reinforcement_learning>`_
 - `Fast Neural Style <https://github.com/pytorch/ignite/tree/master/examples/fast_neural_style>`_
+- `Distributed Cifar10 <https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10>`_
+
 
 Notebooks:

docs/source/faq.rst

Lines changed: 1 addition & 3 deletions
@@ -100,16 +100,14 @@ do this, the most simple is the following:
     def update_fn(engine, batch):
         model.train()
 
-        if engine.state.iteration % accumulation_steps == 0:
-            optimizer.zero_grad()
-
         x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
         y_pred = model(x)
         loss = criterion(y_pred, y) / accumulation_steps
         loss.backward()
 
         if engine.state.iteration % accumulation_steps == 0:
             optimizer.step()
+            optimizer.zero_grad()
 
         return loss.item()
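The fix above moves `optimizer.zero_grad()` to just after `optimizer.step()`, so gradients accumulated across `accumulation_steps` iterations are never cleared mid-accumulation. The corrected schedule can be checked with a framework-free sketch (plain Python counters stand in for gradients and the optimizer; all names here are illustrative):

```python
accumulation_steps = 4

class ToyOptimizer:
    """Stands in for a real optimizer: tracks an accumulated 'gradient'
    and how many times step() was applied."""
    def __init__(self):
        self.grad = 0.0
        self.steps = 0

    def step(self):
        self.steps += 1

    def zero_grad(self):
        self.grad = 0.0

optimizer = ToyOptimizer()
grads_at_step = []

for iteration in range(1, 9):  # engine.state.iteration is 1-based
    # loss / accumulation_steps, then backward() -> gradient accumulates
    optimizer.grad += 1.0 / accumulation_steps

    if iteration % accumulation_steps == 0:
        grads_at_step.append(optimizer.grad)
        optimizer.step()
        optimizer.zero_grad()  # reset only after stepping

print(optimizer.steps)    # 2 steps over 8 iterations
print(grads_at_step)      # [1.0, 1.0] -- the full accumulated gradient each time
```

With the original (pre-fix) ordering, `zero_grad()` ran at the start of the same iterations that call `step()`, wiping the gradients accumulated over the preceding iterations before they were applied.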

docs/source/metrics.rst

Lines changed: 150 additions & 33 deletions
@@ -7,65 +7,182 @@ fashion without having to store the entire output history of a model.
 In practice a user needs to attach the metric instance to an engine. The metric
 value is then computed using the output of the engine's `process_function`:
 
-.. code-block:: python
+.. code-block:: python
 
-    def process_function(engine, batch):
-        # ...
-        return y_pred, y
+    def process_function(engine, batch):
+        # ...
+        return y_pred, y
 
-    engine = Engine(process_function)
-    metric = Accuracy()
-    metric.attach(engine, "accuracy")
+    engine = Engine(process_function)
+    metric = Accuracy()
+    metric.attach(engine, "accuracy")
 
 If the engine's output is not in the format `y_pred, y`, the user can
 use the `output_transform` argument to transform it:
 
+.. code-block:: python
+
+    def process_function(engine, batch):
+        # ...
+        return {'y_pred': y_pred, 'y_true': y, ...}
+
+    engine = Engine(process_function)
+
+    def output_transform(output):
+        # `output` variable is returned by above `process_function`
+        y_pred = output['y_pred']
+        y = output['y_true']
+        return y_pred, y  # output format is according to `Accuracy` docs
+
+    metric = Accuracy(output_transform=output_transform)
+    metric.attach(engine, "accuracy")
+
+
+.. Note ::
+
+   Most of the implemented metrics are adapted to distributed computations and reduce their internal states across the GPUs
+   before computing the metric value. This can be helpful to run the evaluation on multiple nodes/GPU instances with a
+   distributed data sampler. The following code snippet shows in detail how to adapt metrics:
+
 .. code-block:: python
 
-    def process_function(engine, batch):
-        # ...
-        return {'y_pred': y_pred, 'y_true': y, ...}
+    device = "cuda:{}".format(local_rank)
+    model = torch.nn.parallel.DistributedDataParallel(model,
+                                                      device_ids=[local_rank, ],
+                                                      output_device=local_rank)
+    test_sampler = DistributedSampler(test_dataset)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler,
+                             num_workers=num_workers, pin_memory=True)
 
-    engine = Engine(process_function)
+    evaluator = create_supervised_evaluator(model, metrics={'accuracy': Accuracy(device=device)}, device=device)
 
-    def output_transform(output):
-        # `output` variable is returned by above `process_function`
-        y_pred = output['y_pred']
-        y = output['y_true']
-        return y_pred, y  # output format is according to `Accuracy` docs
 
-    metric = Accuracy(output_transform=output_transform)
-    metric.attach(engine, "accuracy")
+Metric arithmetics
+------------------
 
 Metrics could be combined together to form new metrics. This could be done through arithmetics, such
 as ``metric1 + metric2``, use PyTorch operators, such as ``(metric1 + metric2).pow(2).mean()``,
 or use a lambda function, such as ``MetricsLambda(lambda a, b: torch.mean(a + b), metric1, metric2)``.
 
 For example:
 
-.. code-block:: python
+.. code-block:: python
 
-    precision = Precision(average=False)
-    recall = Recall(average=False)
-    F1 = (precision * recall * 2 / (precision + recall)).mean()
+    precision = Precision(average=False)
+    recall = Recall(average=False)
+    F1 = (precision * recall * 2 / (precision + recall)).mean()
 
-.. note:: This example computes the mean of F1 across classes. To combine
-    precision and recall to get F1 or other F metrics, we have to be careful
-    that `average=False`, i.e. to use the unaveraged precision and recall,
-    otherwise we will not be computing F-beta metrics.
+.. note:: This example computes the mean of F1 across classes. To combine
+   precision and recall to get F1 or other F metrics, we have to be careful
+   that `average=False`, i.e. to use the unaveraged precision and recall,
+   otherwise we will not be computing F-beta metrics.
 
 Metrics also support indexing operation (if metric's result is a vector/matrix/tensor). For example, this can be useful to compute mean metric (e.g. precision, recall or IoU) ignoring the background:
 
-.. code-block:: python
+.. code-block:: python
+
+    cm = ConfusionMatrix(num_classes=10)
+    iou_metric = IoU(cm)
+    iou_no_bg_metric = iou_metric[:9]  # We assume that the background index is 9
+    mean_iou_no_bg_metric = iou_no_bg_metric.mean()
+    # mean_iou_no_bg_metric.compute() -> tensor(0.12345)
+
+How to create a custom metric
+-----------------------------
+
+To create a custom metric one needs to create a new class inheriting from :class:`~ignite.metrics.Metric` and override
+three methods:
+
+- `reset()` : resets internal variables and accumulators
+- `update(output)` : updates internal variables and accumulators with provided batch output `(y_pred, y)`
+- `compute()` : computes the custom metric and returns the result
+
+For example, we would like to implement for illustration purposes a multi-class accuracy metric with some
+specific condition (e.g. ignore user-defined classes):
+
+.. code-block:: python
+
+    from ignite.metrics import Metric
+    from ignite.exceptions import NotComputableError
+
+    # These decorators help with distributed settings
+    from ignite.metrics.metric import sync_all_reduce, reinit_is_reduced
+
+
+    class CustomAccuracy(Metric):
+
+        def __init__(self, ignored_class, output_transform=lambda x: x, device=None):
+            self.ignored_class = ignored_class
+            self._num_correct = None
+            self._num_examples = None
+            super(CustomAccuracy, self).__init__(output_transform=output_transform, device=device)
+
+        @reinit_is_reduced
+        def reset(self):
+            self._num_correct = 0
+            self._num_examples = 0
+            super(CustomAccuracy, self).reset()
+
+        @reinit_is_reduced
+        def update(self, output):
+            y_pred, y = output
+
+            indices = torch.argmax(y_pred, dim=1)
+
+            mask = (y != self.ignored_class)
+            mask &= (indices != self.ignored_class)
+            y = y[mask]
+            indices = indices[mask]
+            correct = torch.eq(indices, y).view(-1)
+
+            self._num_correct += torch.sum(correct).item()
+            self._num_examples += correct.shape[0]
+
+        @sync_all_reduce("_num_examples", "_num_correct")
+        def compute(self):
+            if self._num_examples == 0:
+                raise NotComputableError('CustomAccuracy must have at least one example before it can be computed.')
+            return self._num_correct / self._num_examples
+
+
+We imported the necessary classes :class:`~ignite.metrics.Metric`, :class:`~ignite.exceptions.NotComputableError` and
+decorators to adapt the metric for the distributed setting. In the `reset` method, we reset internal variables `_num_correct`
+and `_num_examples` which are used to compute the custom metric. In the `update` method we define how to update
+the internal variables. And finally in the `compute` method, we compute the metric value.
+
+We can check this implementation in a simple case:
+
+.. code-block:: python
+
+    import torch
+    torch.manual_seed(8)
+
+    m = CustomAccuracy(ignored_class=3)
+
+    batch_size = 4
+    num_classes = 5
+
+    y_pred = torch.rand(batch_size, num_classes)
+    y = torch.randint(0, num_classes, size=(batch_size, ))
+
+    m.update((y_pred, y))
+    res = m.compute()
+
+    print(y, torch.argmax(y_pred, dim=1))
+    # Out: tensor([2, 2, 2, 3]) tensor([2, 1, 0, 0])
+
+    print(m._num_correct, m._num_examples, res)
+    # Out: 1 3 0.3333333333333333
+
+
+Metrics and distributed computations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-    cm = ConfusionMatrix(num_classes=10)
-    iou_metric = IoU(cm)
-    iou_no_bg_metric = iou_metric[:9]  # We assume that the background index is 9
-    mean_iou_no_bg_metric = iou_no_bg_metric.mean()
-    # mean_iou_no_bg_metric.compute() -> tensor(0.12345)
+In the above example, the `CustomAccuracy` constructor has a `device` argument and the `reset`, `update`, `compute` methods are decorated with `reinit_is_reduced`, `sync_all_reduce`. The purpose of these features is to adapt metrics to distributed computations on CUDA devices, assuming the backend supports the `"all_reduce" operation <https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_reduce>`_. The user can specify the device (by default, `cuda`) at the metric's initialization. This device can be used to store internal variables and to collect results from all participating devices. More precisely, in the above example we added `@sync_all_reduce("_num_examples", "_num_correct")` over the `compute` method. This means that when `compute` is called, the metric's internal variables `self._num_examples` and `self._num_correct` are summed up over all participating devices. Therefore, once collected, these internal variables can be used to compute the final metric value.
 
 
-Complete list of metrics:
+Complete list of metrics
+------------------------
 
 - :class:`~ignite.metrics.Accuracy`
 - :class:`~ignite.metrics.Average`
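The reset/update/compute contract documented in the new metrics.rst section is independent of torch. As a dependency-free sketch of the same accumulation pattern (an accuracy that ignores one class; `SimpleIgnoringAccuracy` is an invented illustrative name, not ignite's API):

```python
class SimpleIgnoringAccuracy:
    """Sketch of the Metric contract: reset() clears accumulators,
    update() folds in one batch, compute() returns the final value."""

    def __init__(self, ignored_class):
        self.ignored_class = ignored_class
        self.reset()

    def reset(self):
        self._num_correct = 0
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output  # lists of predicted / true class indices
        for p, t in zip(y_pred, y):
            # mirror the masking in CustomAccuracy: skip pairs touching the ignored class
            if p == self.ignored_class or t == self.ignored_class:
                continue
            self._num_correct += int(p == t)
            self._num_examples += 1

    def compute(self):
        if self._num_examples == 0:
            raise ValueError("at least one example is required")
        return self._num_correct / self._num_examples


m = SimpleIgnoringAccuracy(ignored_class=3)
# same predictions / targets as the torch example above
m.update(([2, 1, 0, 0], [2, 2, 2, 3]))
print(m._num_correct, m._num_examples, m.compute())  # 1 3 0.3333333333333333
```

Only the (0, 3) pair is masked out, leaving one correct prediction out of three counted examples, which matches the documented output of `CustomAccuracy`.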
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+output
+cifar10

0 commit comments