Merge remote-tracking branch 'upstream/develop' into numpy-generic-kernels

danieldk · danieldk · commit f3645da905a4 · 2022-03-31T13:20:07.000+02:00
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ Also see the [`/examples`](examples) directory and [usage documentation](https:/
 
 ### 📖 Documentation & usage guides
 
-|                                                                                   |                                                       |
+| Documentation                                                                     | Description                                           |
 | --------------------------------------------------------------------------------- | ----------------------------------------------------- |
 | [Introduction](https://thinc.ai/docs)                                             | Everything you need to know.                          |
 | [Concept & Design](https://thinc.ai/docs/concept)                                 | Thinc's conceptual model and how it works.            |
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -20,7 +20,7 @@ jobs:
   strategy:
     matrix:
       Python36Windows:
-        imageName: 'windows-latest'
+        imageName: 'windows-2019'
         python.version: '3.6'
       Python37Mac:
         imageName: 'macos-10.15'
diff --git a/requirements.txt b/requirements.txt
@@ -17,7 +17,7 @@ contextvars>=2.4,<3; python_version < "3.7"
 # Development dependencies
 cython>=0.25.0,<3.0
 hypothesis>=3.27.0,<7.0.0
-pytest>=5.2.0,<7.1.0
+pytest>=5.2.0,!=7.1.0
 pytest-cov>=2.7.0,<2.8.0
 coverage>=5.0.0,<6.0.0
 mock>=2.0.0,<3.0.0
diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py
@@ -126,6 +126,7 @@ def compile_mmh(src):
 
 def clipped_linear(
     X,
+    *,
     inplace=False,
     slope=1.0,
     offset=0.0,
@@ -154,7 +155,7 @@ def clipped_linear(
     return out
 
 
-def gelu(X, inplace=False, threshold=6.0, threads_per_block=128, num_blocks=128):
+def gelu(X, *, inplace=False, threshold=6.0, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     out = X
@@ -179,32 +180,32 @@ def check_seq2col_lengths(lengths, B):
     return lengths
 
 
-def seq2col(X, nW, *, lengths=None, threads_per_block=128, num_blocks=128):
-    _is_float_array(X)
+def seq2col(seq, nW, *, lengths=None, threads_per_block=128, num_blocks=128):
+    _is_float_array(seq)
 
-    B = X.shape[0]
+    B = seq.shape[0]
     nF = nW * 2 + 1
-    I = X.shape[1]
+    I = seq.shape[1]
 
     lengths = check_seq2col_lengths(lengths, B)
     nL = lengths.shape[0]
 
-    out = cupy.zeros((B, I * nF), dtype=X.dtype)
+    out = cupy.zeros((B, I * nF), dtype=seq.dtype)
 
-    if X.size != 0 and lengths.size != 0:
-        if X.dtype == "float32":
+    if seq.size != 0 and lengths.size != 0:
+        if seq.dtype == "float32":
             seq2col_kernel_float(
-                (num_blocks,), (threads_per_block,), (out, X, lengths, nW, B, I, nL)
+                (num_blocks,), (threads_per_block,), (out, seq, lengths, nW, B, I, nL)
             )
         else:
             seq2col_kernel_double(
-                (num_blocks,), (threads_per_block,), (out, X, lengths, nW, B, I, nL)
+                (num_blocks,), (threads_per_block,), (out, seq, lengths, nW, B, I, nL)
             )
 
     return out
 
 
-def maxout(X, threads_per_block=128, num_blocks=128):
+def maxout(X, *, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     B, I, P = X.shape
@@ -225,7 +226,7 @@ def maxout(X, threads_per_block=128, num_blocks=128):
     return best, which
 
 
-def mish(X, inplace=False, threshold=5, threads_per_block=128, num_blocks=128):
+def mish(X, *, inplace=False, threshold=5, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     out = X
@@ -244,7 +245,7 @@ def mish(X, inplace=False, threshold=5, threads_per_block=128, num_blocks=128):
     return out
 
 
-def reduce_sum(X, lengths, threads_per_block=128, num_blocks=128):
+def reduce_sum(X, lengths, *, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     B = len(lengths)
@@ -267,7 +268,7 @@ def reduce_sum(X, lengths, threads_per_block=128, num_blocks=128):
     return out
 
 
-def reduce_mean(X, lengths, threads_per_block=128, num_blocks=128):
+def reduce_mean(X, lengths, *, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     B = len(lengths)
@@ -292,7 +293,7 @@ def reduce_mean(X, lengths, threads_per_block=128, num_blocks=128):
     return out
 
 
-def reduce_max(X, lengths, threads_per_block=128, num_blocks=128):
+def reduce_max(X, lengths, *, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     B = len(lengths)
@@ -317,7 +318,7 @@ def reduce_max(X, lengths, threads_per_block=128, num_blocks=128):
     return maxes, which
 
 
-def swish(X, inplace=False, threshold=17.0, threads_per_block=128, num_blocks=128):
+def swish(X, *, inplace=False, threshold=17.0, threads_per_block=128, num_blocks=128):
     _is_float_array(X)
 
     out = X
@@ -362,6 +363,7 @@ def backprop_seq2col(dY, nW, *, lengths=None, threads_per_block=128, num_blocks=
 def backprop_clipped_linear(
     dY,
     X,
+    *,
     slope: float = 1.0,
     offset: float = 0.0,
     min_val: float = 0.0,
@@ -394,7 +396,7 @@ def backprop_clipped_linear(
 
 
 def backprop_hard_swish(
-    dY, X, inplace: bool = False, threads_per_block=128, num_blocks=128
+    dY, X, *, inplace: bool = False, threads_per_block=128, num_blocks=128
 ):
     _is_float_array(dY)
     _is_float_array(X, shape=dY.shape)
@@ -416,7 +418,7 @@ def backprop_hard_swish(
 
 
 def backprop_hard_swish_mobilenet(
-    dY, X, inplace: bool = False, threads_per_block=128, num_blocks=128
+    dY, X, *, inplace: bool = False, threads_per_block=128, num_blocks=128
 ):
     _is_float_array(dY)
     _is_float_array(X, shape=dY.shape)
@@ -438,7 +440,13 @@ def backprop_hard_swish_mobilenet(
 
 
 def backprop_gelu(
-    dY, X, inplace: bool = False, threshold=6.0, threads_per_block=128, num_blocks=128
+    dY,
+    X,
+    *,
+    inplace: bool = False,
+    threshold=6.0,
+    threads_per_block=128,
+    num_blocks=128,
 ):
     _is_float_array(dY)
     _is_float_array(X, shape=dY.shape)
@@ -459,7 +467,7 @@ def backprop_gelu(
     return out
 
 
-def backprop_maxout(dY, which, P, threads_per_block=128, num_blocks=128):
+def backprop_maxout(dY, which, P, *, threads_per_block=128, num_blocks=128):
     _is_float_array(dY)
 
     B = dY.shape[0]
@@ -482,7 +490,7 @@ def backprop_maxout(dY, which, P, threads_per_block=128, num_blocks=128):
 
 
 def backprop_mish(
-    dY, X, inplace: bool = False, threshold=5, threads_per_block=128, num_blocks=128
+    dY, X, *, inplace: bool = False, threshold=5, threads_per_block=128, num_blocks=128
 ):
     _is_float_array(dY)
     _is_float_array(X, shape=dY.shape)
@@ -503,51 +511,53 @@ def backprop_mish(
     return out
 
 
-def backprop_reduce_sum(d_sum, lengths, threads_per_block=128, num_blocks=128):
-    _is_float_array(d_sum)
+def backprop_reduce_sum(d_sums, lengths, *, threads_per_block=128, num_blocks=128):
+    _is_float_array(d_sums)
 
     B = len(lengths)
     T = int(lengths.sum())
-    O = d_sum.shape[1]
+    O = d_sums.shape[1]
     _check_lengths(lengths, T)
 
-    out = cupy.zeros((T, O), dtype=d_sum.dtype)
+    out = cupy.zeros((T, O), dtype=d_sums.dtype)
 
-    if d_sum.dtype == "float32":
+    if d_sums.dtype == "float32":
         backprop_reduce_sum_kernel_float(
-            (num_blocks,), (threads_per_block,), (out, d_sum, lengths, B, T, O)
+            (num_blocks,), (threads_per_block,), (out, d_sums, lengths, B, T, O)
         )
     else:
         backprop_reduce_sum_kernel_double(
-            (num_blocks,), (threads_per_block,), (out, d_sum, lengths, B, T, O)
+            (num_blocks,), (threads_per_block,), (out, d_sums, lengths, B, T, O)
         )
 
     return out
 
 
-def backprop_reduce_mean(d_mean, lengths, threads_per_block=128, num_blocks=128):
-    _is_float_array(d_mean)
+def backprop_reduce_mean(d_means, lengths, *, threads_per_block=128, num_blocks=128):
+    _is_float_array(d_means)
 
     B = len(lengths)
     T = int(lengths.sum())
-    O = d_mean.shape[1]
+    O = d_means.shape[1]
     _check_lengths(lengths, T)
 
-    out = cupy.zeros((T, O), dtype=d_mean.dtype)
+    out = cupy.zeros((T, O), dtype=d_means.dtype)
 
-    if d_mean.dtype == "float32":
+    if d_means.dtype == "float32":
         backprop_reduce_mean_kernel_float(
-            (num_blocks,), (threads_per_block,), (out, d_mean, lengths, B, T, O)
+            (num_blocks,), (threads_per_block,), (out, d_means, lengths, B, T, O)
         )
     else:
         backprop_reduce_mean_kernel_double(
-            (num_blocks,), (threads_per_block,), (out, d_mean, lengths, B, T, O)
+            (num_blocks,), (threads_per_block,), (out, d_means, lengths, B, T, O)
         )
 
     return out
 
 
-def backprop_reduce_max(d_maxes, which, lengths, threads_per_block=128, num_blocks=128):
+def backprop_reduce_max(
+    d_maxes, which, lengths, *, threads_per_block=128, num_blocks=128
+):
     _is_float_array(d_maxes)
 
     B = len(lengths)
@@ -572,7 +582,7 @@ def backprop_reduce_max(d_maxes, which, lengths, threads_per_block=128, num_bloc
 
 
 def backprop_swish(
-    dY, X, Y, inplace=False, threshold=17.0, threads_per_block=128, num_blocks=128
+    dY, X, Y, *, inplace=False, threshold=17.0, threads_per_block=128, num_blocks=128
 ):
     _is_float_array(dY)
     _is_float_array(X, shape=dY.shape)
@@ -594,7 +604,7 @@ def backprop_swish(
     return out
 
 
-def hash(ids, seed, threads_per_block=128, num_blocks=128):
+def hash(ids, seed, *, threads_per_block=128, num_blocks=128):
     out = cupy.zeros((ids.shape[0], 4), dtype="uint32")
 
     # sizeof(uint32_t) * 4
diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py
@@ -150,8 +150,8 @@ def backprop_clipped_linear(
     ):
         if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
             return _custom_kernels.backprop_clipped_linear(
-                dY=dY,
-                X=X,
+                dY,
+                X,
                 slope=slope,
                 offset=offset,
                 min_val=min_val,
@@ -243,7 +243,7 @@ def backprop_seq2col(self, dY, nW, *, lengths=None):
 
     def reduce_mean(self, X, lengths):
         if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
-            return _custom_kernels.reduce_mean(X, lengths)
+            return _custom_kernels.reduce_mean(X, lengths=lengths)
         else:
             super().reduce_mean(X, lengths)
 
diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py
@@ -256,18 +256,17 @@ def unflatten(self, X: Floats2d, lengths: Ints1d, pad: int = 0) -> List[Floats2d
         """The reverse/backward operation of the `flatten` function: unflatten
         a large array into a list of arrays according to the given lengths.
         """
-        unflat = []
-        pad = int(pad)
-        for length in lengths:
-            length = int(length)
-            if pad >= 1 and length != 0:
-                X = X[pad:]
-            unflat.append(X[:length])
-            X = X[length:]
-        if pad >= 1:
-            X = X[pad:]
-        assert len(X) == 0
+        # cupy.split requires lengths to be in CPU memory.
+        lengths = to_numpy(lengths)
+
+        if pad > 0:
+            lengths = numpy.where(lengths > 0, lengths + pad, 0)  # type: ignore
+        unflat = self.xp.split(X, numpy.cumsum(lengths))[:-1]  # type: ignore
+        if pad > 0:
+            unflat = [a[pad:] for a in unflat]
+
         assert len(unflat) == len(lengths)
+
         return unflat
 
     @overload
diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py
@@ -43,11 +43,6 @@ def __init__(
         mixed_precision: bool = False,
         grad_scaler: Optional[PyTorchGradScaler] = None,
     ):
-        if mixed_precision and not has_torch_amp:
-            raise ValueError(
-                "Mixed-precision training is not supported, requires capable GPU and torch>=1.9.0"
-            )
-
         super().__init__(model, config, optimizer)
 
         if grad_scaler is None:
diff --git a/thinc/shims/pytorch_grad_scaler.py b/thinc/shims/pytorch_grad_scaler.py
@@ -50,11 +50,6 @@ def __init__(
             When no overflows were found for this number of steps, the scale will
             be multiplied by "growth_factor".
         """
-        if enabled and not has_torch_amp:
-            raise ValueError(
-                "Gradient scaling is not supported, requires capable GPU and torch>=1.9.0"
-            )
-
         self._enabled = enabled
         self._growth_factor = growth_factor
         self._backoff_factor = backoff_factor
@@ -107,7 +102,18 @@ def _scale_tensor(
         scale_per_device: Dict["torch.device", "torch.Tensor"],
         inplace: bool,
     ):
-        assert tensor.is_cuda, "Gradient scaling is only supported for CUDA tensors"
+        if not has_torch_amp:
+            raise ValueError(
+                "Gradient scaling is not supported, requires capable GPU and torch>=1.9.0"
+            )
+
+        if not tensor.is_cuda:
+            msg = (
+                "Gradient scaling is only supported for CUDA tensors. "
+                "If you are using PyTorch models, you can avoid this "
+                "error by disabling mixed-precision support."
+            )
+            raise ValueError(msg)
 
         device = tensor.device
 
diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py
@@ -148,18 +148,3 @@ def test_pytorch_convert_inputs(data, n_args, kwargs_keys):
     convert_inputs = model.attrs["convert_inputs"]
     Y, backprop = convert_inputs(model, data, is_train=True)
     check_input_converters(Y, backprop, data, n_args, kwargs_keys, torch.Tensor)
-
-
-@pytest.mark.skipif(not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU")
-@pytest.mark.skipif(
-    has_torch_amp, reason="needs PyTorch without mixed-precision support"
-)
-def test_raises_on_old_pytorch():
-    import torch.nn
-
-    pytorch_layer = torch.nn.Linear(5, 5)
-    with pytest.raises(ValueError, match=r"not supported.*1.9.0"):
-        PyTorchWrapper_v2(
-            pytorch_layer.cuda(),
-            mixed_precision=True,
-        )
diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py