Commit 3dffb1b

mannatsingh authored and facebook-github-bot committed
Make blocks attachable automatically when needed (#461)
Summary: Pull Request resolved: #461

I was frustrated by the fact that I needed to change the code for every model if I needed to be able to attach heads to it - something we need to do with most trained models. Ideally, users should be able to write their models without writing anything Classy Vision specific and still be able to attach heads. I've made a couple of diffs which get us there and wanted to see what everyone thought about them. Please look at the second diff to see the end result.

In this diff, I make the following changes -
- `build_attachable_block` becomes a private function (`_build_attachable_block`), and models don't need to call it anymore.
- Removed `_should_cache_output` and `set_cache_output` from `ClassyBlock`.
- Added a redundant `_attachable_block_names` attribute - this is needed for reading the block names inside TorchScript (`_attachable_blocks` is inaccessible) - T64918869
- Instead, when someone tries to attach a head to a module called `my_block`, we recursively search for it and wrap it in a `ClassyBlock` on the fly.
- Updated `get_classy_state` and `set_classy_state` to be compatible with the changes.
- Users can attach heads to any block which has a unique name (like `block3-2`).
- `models_classy_model_test` wasn't being run internally; renamed `classy_block_test` to `models_classy_block_test`.
- Added additional test cases to test the changes.

NOTE: This breaks all checkpoints since the model definitions have changed. We still handle old checkpoints for `ResNeXt` models to allow for a smoother transition - T61141249

Differential Revision: D20714865

fbshipit-source-id: f48e768aede1c10d25754f0fad0f24ccac9a1503
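
To make the end state concrete, here is a minimal usage sketch of the workflow this enables. The model body is plain PyTorch with no `build_attachable_block` calls; the model class and head parameters below are illustrative assumptions rather than code from this commit, and the `set_heads` argument follows the `Dict[str, Dict[str, ClassyHead]]` structure from `classy_model.py`.

import torch.nn as nn
from classy_vision.heads import FullyConnectedHead
from classy_vision.models import ClassyModel

class MyModel(ClassyModel):
    # nothing Classy Vision specific in the model definition
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        return self.pool(self.features(x))

model = MyModel()
head = FullyConnectedHead(unique_id="default_head", num_classes=10, in_plane=8)
# "pool" is any uniquely named module; it is found by a recursive search and
# wrapped in a ClassyBlock on the fly when the head is attached
model.set_heads({"pool": {head.unique_id: head}})
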
1 parent 2ed4394 commit 3dffb1b

8 files changed

Lines changed: 213 additions & 103 deletions

classy_vision/models/classy_block.py

Lines changed: 3 additions & 8 deletions
@@ -19,16 +19,11 @@ def __init__(self, name, module):
         self.name = name
         self.output = torch.zeros(0)
         self._module = module
-        self._should_cache_output = False
 
-    def set_cache_output(self, should_cache_output: bool = True):
-        """
-        Whether to cache the output of wrapped module for head execution.
-        """
-        self._should_cache_output = should_cache_output
+    def wrapped_module(self):
+        return self._module
 
     def forward(self, input):
         output = self._module(input)
-        if self._should_cache_output:
-            self.output = output
+        self.output = output
        return output
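
Assembled from the hunk above, the wrapper after this change reduces to roughly the following (the nn.Module base and the super().__init__() call sit outside the shown context and are assumed):

import torch
import torch.nn as nn

class ClassyBlock(nn.Module):
    def __init__(self, name, module):
        super().__init__()
        self.name = name
        self.output = torch.zeros(0)
        self._module = module

    def wrapped_module(self):
        return self._module

    def forward(self, input):
        # caching is now unconditional: a module is only wrapped in a
        # ClassyBlock once a head actually needs its output
        output = self._module(input)
        self.output = output
        return output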

classy_vision/models/classy_model.py

Lines changed: 51 additions & 17 deletions
@@ -6,7 +6,7 @@
 
 import copy
 from enum import Enum
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import torch
 import torch.nn as nn
@@ -39,11 +39,13 @@ class ClassyModel(nn.Module):
 
     """
 
+    _attachable_block_names: List[str]
+
     def __init__(self):
         """Constructor for ClassyModel."""
         super().__init__()
-
         self._attachable_blocks = {}
+        self._attachable_block_names = []
         self._heads = nn.ModuleDict()
         self._head_outputs = {}
 
@@ -72,20 +74,23 @@ def get_classy_state(self, deep_copy=False):
 
         The returned state is used for checkpointing.
 
+        NOTE: For advanced users, the structure of the returned dict is -
+        `{"model": {"trunk": trunk_state, "heads": heads_state}}`.
+        The trunk state is the state of the model when no heads are attached.
+
         Args:
             deep_copy: If True, creates a deep copy of the state Dict. Otherwise, the
                 returned Dict's state will be tied to the object's.
 
         Returns:
             A state dictionary containing the state of the model.
         """
-        # If the model doesn't have head for fine-tuning, all of model's state
-        # live in the trunk
         attached_heads = self.get_heads()
-        # clear heads to get trunk only states. There shouldn't be any component
-        # states depend on heads
+        # clear heads to get the state of the model without any heads, which we refer to
+        # as the trunk state. If the model doesn't have heads attached, all of the
+        # model's state lives in the trunk.
        self._clear_heads()
-        trunk_state_dict = super().state_dict()
+        trunk_state_dict = self.state_dict()
         self.set_heads(attached_heads)
 
         head_state_dict = {}
@@ -124,11 +129,19 @@ def set_classy_state(self, state):
 
         This is used to load the state of the model from a checkpoint.
         """
+        # load the state for heads
         self.load_head_states(state)
 
-        current_state = self.state_dict()
-        current_state.update(state["model"]["trunk"])
-        super().load_state_dict(current_state)
+        # clear the heads to set the trunk's state. This is done because when heads are
+        # attached to modules, we wrap them by ClassyBlocks, thereby changing the
+        # structure of the model and its state dict. So, the trunk state is always
+        # fetched / set when there are no blocks attached.
+        attached_heads = self.get_heads()
+        self._clear_heads()
+        self.load_state_dict(state["model"]["trunk"])
+
+        # set the heads back again
+        self.set_heads(attached_heads)
 
     def forward(self, x):
         """
@@ -145,27 +158,51 @@ def extract_features(self, x):
         """
         return self.forward(x)
 
-    def build_attachable_block(self, name, module):
+    def _build_attachable_block(self, name, module):
         """
         Add a wrapper to the module to allow to attach heads to the module.
         """
         if name in self._attachable_blocks:
             raise ValueError("Found duplicated block name {}".format(name))
         block = ClassyBlock(name, module)
         self._attachable_blocks[name] = block
+        self._attachable_block_names.append(name)
         return block
 
     @property
     def attachable_block_names(self):
         """
         Return names of all attachable blocks.
         """
-        return self._attachable_blocks.keys()
+        return self._attachable_block_names
 
     def _clear_heads(self):
         # clear all existing heads
         self._heads.clear()
         self._head_outputs.clear()
+        self._strip_classy_blocks(self)
+        self._attachable_blocks = {}
+        self._attachable_block_names = []
+
+    def _strip_classy_blocks(self, module):
+        for name, child_module in module.named_children():
+            if isinstance(child_module, ClassyBlock):
+                module.add_module(name, child_module.wrapped_module())
+            self._strip_classy_blocks(child_module)
+
+    def _make_module_attachable(self, module, module_name):
+        found = False
+        for name, child_module in module.named_children():
+            if name == module_name:
+                module.add_module(
                    name, self._build_attachable_block(name, child_module)
+                )
+                found = True
+                # do not exit - we will check all possible modules and raise an
+                # exception if there are duplicates
+            found_in_child = self._make_module_attachable(child_module, module_name)
+            found = found or found_in_child
+        return found
 
     def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
         """Attach all the heads to corresponding blocks.
@@ -190,11 +227,8 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
 
         head_ids = set()
         for block_name, block_heads in heads.items():
-            if block_name not in self._attachable_blocks:
-                raise ValueError(
-                    "block {} does not exist or can not be attached".format(block_name)
-                )
-            self._attachable_blocks[block_name].set_cache_output()
+            if not self._make_module_attachable(self, block_name):
+                raise KeyError(f"{block_name} not found in the model")
             for head in block_heads.values():
                 if head.unique_id in head_ids:
                     raise ValueError("head id {} already exists".format(head.unique_id))
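
The recursive search above is the heart of the change. Here is a self-contained sketch of the same mechanism on a plain module tree (Wrapper stands in for ClassyBlock, and all names below are illustrative, not the library's code):

import torch
import torch.nn as nn

class Wrapper(nn.Module):
    def __init__(self, name, module):
        super().__init__()
        self.name = name
        self._module = module
        self.output = torch.zeros(0)

    def forward(self, x):
        self.output = self._module(x)  # cache the output for head execution
        return self.output

def make_attachable(module, target):
    found = False
    for name, child in module.named_children():
        if name == target:
            # add_module replaces the existing child with the wrapped one
            module.add_module(name, Wrapper(name, child))
            found = True
        # no early exit: every subtree is scanned so duplicate names surface
        found = make_attachable(child, target) or found
    return found

model = nn.Sequential()
model.add_module("stem", nn.Conv2d(3, 8, 3, padding=1))
stage = nn.Sequential()
stage.add_module("block3-2", nn.Conv2d(8, 8, 3, padding=1))
model.add_module("stage3", stage)

assert make_attachable(model, "block3-2")  # found despite being nested
model(torch.randn(2, 3, 16, 16))
block = dict(model.stage3.named_children())["block3-2"]
print(block.output.shape)  # torch.Size([2, 8, 16, 16])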

classy_vision/models/densenet.py

Lines changed: 17 additions & 19 deletions
@@ -8,6 +8,7 @@
 
 # dependencies:
 import math
+from collections import OrderedDict
 from typing import Any, Dict
 
 import torch
@@ -166,7 +167,7 @@ def __init__(
         )
         # loop over spatial resolutions:
         num_planes = init_planes
-        blocks = []
+        blocks = nn.Sequential()
         for idx, num_layers in enumerate(num_blocks):
             # add dense block
             block = self._make_dense_block(
@@ -178,18 +179,20 @@ def __init__(
                 use_se=use_se,
                 se_reduction_ratio=se_reduction_ratio,
             )
-            blocks.append(block)
+            blocks.add_module(f"block_{idx}", block)
             num_planes = num_planes + num_layers * growth_rate
 
             # add transition layer:
             if idx != len(num_blocks) - 1:
                 trans = _Transition(num_planes, num_planes // 2)
-                blocks.append(self.build_attachable_block(f"transition-{idx}", trans))
+                blocks.add_module(f"transition-{idx}", trans)
                 num_planes = num_planes // 2
 
-        blocks.append(self._make_trunk_output_block(num_planes, final_bn_relu))
+        blocks.add_module(
+            "trunk_output", self._make_trunk_output_block(num_planes, final_bn_relu)
+        )
 
-        self.features = nn.Sequential(*blocks)
+        self.features = blocks
 
         # initialize weights of convolutional and batchnorm layers:
         for m in self.modules():
@@ -208,7 +211,7 @@ def _make_trunk_output_block(self, num_planes, final_bn_relu):
         # final batch normalization:
         layers.add_module("norm-final", nn.BatchNorm2d(num_planes))
         layers.add_module("relu-final", nn.ReLU(inplace=INPLACE))
-        return self.build_attachable_block("trunk_output", layers)
+        return layers
 
     def _make_dense_block(
         self,
@@ -225,21 +228,16 @@ def _make_dense_block(
         assert is_pos_int(expansion)
 
         # create a block of dense layers at same resolution:
-        layers = []
+        layers = OrderedDict()
         for idx in range(num_layers):
-            layers.append(
-                self.build_attachable_block(
-                    f"block{block_idx}-{idx}",
-                    _DenseLayer(
-                        in_planes + idx * growth_rate,
-                        growth_rate=growth_rate,
-                        expansion=expansion,
-                        use_se=use_se,
-                        se_reduction_ratio=se_reduction_ratio,
-                    ),
-                )
+            layers[f"block{block_idx}-{idx}"] = _DenseLayer(
+                in_planes + idx * growth_rate,
+                growth_rate=growth_rate,
+                expansion=expansion,
+                use_se=use_se,
+                se_reduction_ratio=se_reduction_ratio,
             )
-        return nn.Sequential(*layers)
+        return nn.Sequential(layers)
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "DenseNet":
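
The move from Python lists to named children is what makes the automatic attachment possible: an nn.Sequential built from an OrderedDict (or via add_module) exposes stable, human-readable names that the recursive search can match. A minimal illustration:

from collections import OrderedDict
import torch.nn as nn

layers = OrderedDict()
layers["block1-0"] = nn.Conv2d(8, 8, 3, padding=1)
layers["block1-1"] = nn.Conv2d(8, 8, 3, padding=1)
block = nn.Sequential(layers)
# named children instead of "0", "1", ... so a head can target "block1-1"
print([name for name, _ in block.named_children()])  # ['block1-0', 'block1-1']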

classy_vision/models/resnext.py

Lines changed: 59 additions & 18 deletions
@@ -10,6 +10,9 @@
 
 import copy
 import math
+import re
+import warnings
+from collections import OrderedDict
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch.nn as nn
@@ -20,6 +23,8 @@
 from .squeeze_and_excitation_layer import SqueezeAndExcitationLayer
 
 
+# version number for the current implementation
+VERSION = 0.2
 # global setting for in-place ReLU:
 INPLACE = True
 
@@ -327,7 +332,7 @@ def __init__(
                 use_se=use_se,
                 se_reduction_ratio=se_reduction_ratio,
             )
-            blocks.append(nn.Sequential(*new_block))
+            blocks.append(new_block)
         self.blocks = nn.Sequential(*blocks)
 
         self.out_planes = out_planes[-1]
@@ -371,26 +376,21 @@ def _make_resolution_block(
         use_se=False,
         se_reduction_ratio=16,
     ):
-
         # add the desired number of residual blocks:
-        blocks = []
+        blocks = OrderedDict()
         for idx in range(num_blocks):
-            blocks.append(
-                self.build_attachable_block(
-                    "block{}-{}".format(resolution_idx, idx),
-                    self.layer_type(
-                        in_planes if idx == 0 else out_planes,
-                        out_planes,
-                        stride=stride if idx == 0 else 1,  # only first block has stride
-                        mid_planes_and_cardinality=mid_planes_and_cardinality,
-                        reduction=reduction,
-                        final_bn_relu=final_bn_relu or (idx != (num_blocks - 1)),
-                        use_se=use_se,
-                        se_reduction_ratio=se_reduction_ratio,
-                    ),
-                )
+            block_name = "block{}-{}".format(resolution_idx, idx)
+            blocks[block_name] = self.layer_type(
+                in_planes if idx == 0 else out_planes,
+                out_planes,
+                stride=stride if idx == 0 else 1,  # only first block has stride
+                mid_planes_and_cardinality=mid_planes_and_cardinality,
+                reduction=reduction,
+                final_bn_relu=final_bn_relu or (idx != (num_blocks - 1)),
+                use_se=use_se,
+                se_reduction_ratio=se_reduction_ratio,
             )
-        return blocks
+        return nn.Sequential(blocks)
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "ResNeXt":
@@ -459,6 +459,47 @@ def output_shape(self):
     def model_depth(self):
         return sum(self.num_blocks)
 
+    def _convert_model_state(self, state):
+        """Convert model state from the old implementation to the current format.
+
+        Updates the state dict in place and returns True if the state dict was updated.
+        """
+        pattern = r"blocks\.(?P<block_id_0>[0-9])\.(?P<block_id_1>[0-9])\._module\."
+        repl = r"blocks.\g<block_id_0>.block\g<block_id_0>-\g<block_id_1>."
+        trunk_dict = state["model"]["trunk"]
+        new_trunk_dict = {}
+        replaced_keys = False
+        for key, value in trunk_dict.items():
+            new_key = re.sub(pattern, repl, key)
+            if new_key != key:
+                replaced_keys = True
+            new_trunk_dict[new_key] = value
+        state["model"]["trunk"] = new_trunk_dict
+        state["version"] = VERSION
+        return replaced_keys
+
+    def get_classy_state(self):
+        state = super().get_classy_state()
+        state["version"] = VERSION
+
+    def set_classy_state(self, state):
+        version = state.get("version")
+        if version is None:
+            # convert the weights from the previous implementation of ResNeXt to the
+            # current one
+            if not self._convert_model_state(state):
+                raise RuntimeError("ResNeXt state conversion failed")
+            message = (
+                "Provided state dict is from an old implementation of ResNeXt. "
+                "This has been deprecated and will be removed soon."
+            )
+            warnings.warn(message, DeprecationWarning, stacklevel=2)
+        elif version != VERSION:
+            raise ValueError(
+                f"Unsupported ResNeXt version: {version}. Expected: {VERSION}"
+            )
+        super().set_classy_state(state)
+
 
 class _ResNeXt(ResNeXt):
     @classmethod
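
The checkpoint conversion in _convert_model_state is easiest to see on a single key; the suffix `convs.0.weight` below is a made-up example of a parameter inside a block:

import re

pattern = r"blocks\.(?P<block_id_0>[0-9])\.(?P<block_id_1>[0-9])\._module\."
repl = r"blocks.\g<block_id_0>.block\g<block_id_0>-\g<block_id_1>."

# old checkpoints addressed blocks positionally through the ClassyBlock
# wrapper; new ones address them by name, with no _module level
old_key = "blocks.3.2._module.convs.0.weight"
print(re.sub(pattern, repl, old_key))  # blocks.3.block3-2.convs.0.weight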

classy_vision/models/resnext3d.py

Lines changed: 9 additions & 2 deletions
@@ -178,6 +178,11 @@ def set_classy_state(self, state):
         # We need to support both regular checkpoint loading and 2D conv weight
         # inflation into 3D conv weight in this function.
         self.load_head_states(state)
+
+        # clear the heads to set the trunk state
+        attached_heads = self.get_heads()
+        self._clear_heads()
+
         current_state = self.state_dict()
         for name, weight_src in state["model"]["trunk"].items():
             assert name in current_state, (
@@ -217,7 +222,10 @@ def set_classy_state(self, state):
             )
 
             current_state[name] = weight_src.clone()
-        super().load_state_dict(current_state)
+        self.load_state_dict(current_state)
+
+        # set the heads back again
+        self.set_heads(attached_heads)
 
     def forward(self, x):
         """
@@ -400,7 +408,6 @@ def __init__(
                 [num_groups],
                 skip_transformation_type,
                 residual_transformation_type,
-                block_callback=self.build_attachable_block,
                 disable_pre_activation=(s == 0),
                 final_stage=(s == (num_stages - 1)),
             )
