diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py
index 2386e21cc..a7015d44d 100644
--- a/thinc/backends/ops.py
+++ b/thinc/backends/ops.py
@@ -229,56 +229,56 @@ def affine(self, X: Floats2d, W: Floats2d, b: Floats1d) -> Floats2d:
         Y += b
         return Y
 
-    @overload 
+    @overload
     def flatten(
         self,
         X: List[Floats2d],
         dtype: Optional[DTypes] = None,
         pad: int = 0,
         ndim_if_empty: int = 2,
-    ) -> Floats2d: 
+    ) -> Floats2d:
         ...
 
-    @overload 
+    @overload
     def flatten(
         self,
         X: List[Ints1d],
         dtype: Optional[DTypes] = None,
         pad: int = 0,
         ndim_if_empty: int = 2,
-    ) -> Ints1d: 
+    ) -> Ints1d:
         ...
 
-    @overload 
+    @overload
     def flatten(
         self,
        X: List2d,
         dtype: Optional[DTypes] = None,
         pad: int = 0,
         ndim_if_empty: int = 2,
-    ) -> Array2d: 
+    ) -> Array2d:
         ...
 
     # further specific typed signatures can be added as necessary
 
-    @overload 
+    @overload
     def flatten(
         self,
         X: ListXd,
         dtype: Optional[DTypes] = None,
         pad: int = 0,
         ndim_if_empty: int = 2,
-    ) -> ArrayXd: 
+    ) -> ArrayXd:
         ...
 
-    @overload 
+    @overload
     def flatten(
         self,
         X: Sequence[ArrayXd],
         dtype: Optional[DTypes] = None,
         pad: int = 0,
         ndim_if_empty: int = 2,
-    ) -> ArrayXd: 
+    ) -> ArrayXd:
         ...
 
     def flatten(
diff --git a/thinc/layers/gelu.py b/thinc/layers/gelu.py
index d49ac77a9..cdb0fb6ee 100644
--- a/thinc/layers/gelu.py
+++ b/thinc/layers/gelu.py
@@ -34,8 +34,9 @@ def Gelu(
     return model
 
 
-def forward(model: Model[Floats2d, Floats2d],
-            X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]:
+def forward(
+    model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool
+) -> Tuple[Floats2d, Callable]:
     W = cast(Floats2d, model.get_param("W"))
     b = cast(Floats1d, model.get_param("b"))
     Y_preact = model.ops.affine(X, W, b)
diff --git a/thinc/layers/hard_swish.py b/thinc/layers/hard_swish.py
index 81b1ad8dd..0478fd270 100644
--- a/thinc/layers/hard_swish.py
+++ b/thinc/layers/hard_swish.py
@@ -34,8 +34,9 @@ def HardSwish(
     return model
 
 
-def forward(model: Model[Floats2d, Floats2d],
-            X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]:
+def forward(
+    model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool
+) -> Tuple[Floats2d, Callable]:
     W = cast(Floats2d, model.get_param("W"))
     b = cast(Floats1d, model.get_param("b"))
     Y_preact = model.ops.affine(X, W, b)
diff --git a/thinc/layers/hard_swish_mobilenet.py b/thinc/layers/hard_swish_mobilenet.py
index 38004c848..6a5dce388 100644
--- a/thinc/layers/hard_swish_mobilenet.py
+++ b/thinc/layers/hard_swish_mobilenet.py
@@ -34,17 +34,16 @@ def HardSwishMobilenet(
     return model
 
 
-def forward(model: Model[Floats2d, Floats2d],
-            X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]:
+def forward(
+    model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool
+) -> Tuple[Floats2d, Callable]:
     W = cast(Floats2d, model.get_param("W"))
     b = cast(Floats1d, model.get_param("b"))
     Y_preact = model.ops.affine(X, W, b)
     Y = model.ops.hard_swish_mobilenet(Y_preact)
 
     def backprop(dY: Floats2d) -> Floats2d:
-        dY = model.ops.backprop_hard_swish_mobilenet(dY,
-                                                     Y_preact,
-                                                     inplace=False)
+        dY = model.ops.backprop_hard_swish_mobilenet(dY, Y_preact, inplace=False)
         model.inc_grad("b", dY.sum(axis=0))
         model.inc_grad("W", model.ops.gemm(dY, X, trans1=True))
         return model.ops.gemm(dY, W)
diff --git a/thinc/layers/layernorm.py b/thinc/layers/layernorm.py
index cf22015ed..684489c54 100644
--- a/thinc/layers/layernorm.py
+++ b/thinc/layers/layernorm.py
@@ -17,7 +17,7 @@ def LayerNorm(nI: Optional[int] = None) -> Model[InT, InT]:
         forward,
         init=init,
         dims={"nI": nI, "nO": nI},
-        params={"G": None, "b": None}
+        params={"G": None, "b": None},
     )
 
 
diff --git a/thinc/layers/swish.py b/thinc/layers/swish.py
index ea5444b49..a05a0dc72 100644
--- a/thinc/layers/swish.py
+++ b/thinc/layers/swish.py
@@ -34,8 +34,9 @@ def Swish(
     return model
 
 
-def forward(model: Model[Floats2d, Floats2d],
-            X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]:
+def forward(
+    model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool
+) -> Tuple[Floats2d, Callable]:
     W = cast(Floats2d, model.get_param("W"))
     b = cast(Floats1d, model.get_param("b"))
     Y_preact = model.ops.affine(X, W, b)
diff --git a/thinc/model.py b/thinc/model.py
index 261858658..08366523e 100644
--- a/thinc/model.py
+++ b/thinc/model.py
@@ -464,7 +464,9 @@ def copy(self: SelfT) -> SelfT:
         """
         return self._copy()
 
-    def _copy(self: SelfT, seen: Optional[Dict[int, Union["Model", Shim]]] = None) -> SelfT:
+    def _copy(
+        self: SelfT, seen: Optional[Dict[int, Union["Model", Shim]]] = None
+    ) -> SelfT:
         if seen is None:
             seen = {}
         params = {}
diff --git a/thinc/optimizers.py b/thinc/optimizers.py
index c8e38e84b..f34cd2ff8 100644
--- a/thinc/optimizers.py
+++ b/thinc/optimizers.py
@@ -279,7 +279,7 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd):
 
         # exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
         exp_avg_sq *= beta2
-        exp_avg_sq += (1 - beta2) * (gradient_1D ** 2)
+        exp_avg_sq += (1 - beta2) * (gradient_1D**2)
         # exp_avg.mul_(beta1).add_(1 - beta1, grad)
         exp_avg *= beta1
         exp_avg += (1 - beta1) * gradient_1D
@@ -338,9 +338,9 @@ def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd):
         mom2 = self.mom2[key]
         b1 = self.b1
         b2 = self.b2
-        fix1 = 1.0 - (b1 ** nr_upd)
-        fix2 = 1.0 - (b2 ** nr_upd)
-        lr = self.learn_rate * fix2 ** 0.5 / fix1
+        fix1 = 1.0 - (b1**nr_upd)
+        fix2 = 1.0 - (b2**nr_upd)
+        lr = self.learn_rate * fix2**0.5 / fix1
         eps = self.eps
         # needs to be 1D going into the adam function
         weights_1D, gradient_1D, mom1, mom2 = ops.adam(
diff --git a/thinc/tests/layers/test_combinators.py b/thinc/tests/layers/test_combinators.py
index ed9a2992a..ea5583108 100644
--- a/thinc/tests/layers/test_combinators.py
+++ b/thinc/tests/layers/test_combinators.py
@@ -271,10 +271,7 @@ def test_concatenate():
 def test_map_list():
     nI = 4
     nO = 9
-    Xs = [
-        numpy.zeros((6, nI), dtype="f"),
-        numpy.ones((3, nI), dtype="f")
-    ]
+    Xs = [numpy.zeros((6, nI), dtype="f"), numpy.ones((3, nI), dtype="f")]
     Y_shapes = [(x.shape[0], nO) for x in Xs]
     model = map_list(Linear())
     model.initialize(X=Xs, Y=[numpy.zeros(shape, dtype="f") for shape in Y_shapes])
diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py
index e6f4edfb4..fc4396370 100644
--- a/thinc/tests/layers/test_pytorch_wrapper.py
+++ b/thinc/tests/layers/test_pytorch_wrapper.py
@@ -64,7 +64,9 @@ def test_pytorch_wrapper(nN, nI, nO):
     assert isinstance(model.predict(X), numpy.ndarray)
 
 
-@pytest.mark.skipif(not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU")
+@pytest.mark.skipif(
+    not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU"
+)
 @pytest.mark.parametrize("nN,nI,nO", [(2, 3, 4)])
 @pytest.mark.parametrize("mixed_precision", TORCH_MIXED_PRECISION)
 def test_pytorch_wrapper_thinc_input(nN, nI, nO, mixed_precision):
diff --git a/thinc/tests/layers/test_reduce.py b/thinc/tests/layers/test_reduce.py
index ba829f779..d26065c4a 100644
--- a/thinc/tests/layers/test_reduce.py
+++ b/thinc/tests/layers/test_reduce.py
@@ -92,6 +92,7 @@ def test_reduce_mean(Xs):
     dX = backprop(Y)
     assert dX.dataXd.shape == X.dataXd.shape
 
+
 def test_reduce_sum(Xs):
     model = reduce_sum()
     lengths = model.ops.asarray([x.shape[0] for x in Xs], dtype="i")
@@ -107,6 +108,7 @@ def test_reduce_sum(Xs):
     dX = backprop(Y)
     assert dX.dataXd.shape == X.dataXd.shape
 
+
 def test_size_mismatch(Xs):
     for reduce in [reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum]:
         model = reduce()
diff --git a/thinc/tests/layers/test_with_transforms.py b/thinc/tests/layers/test_with_transforms.py
index a01e20567..c23db1463 100644
--- a/thinc/tests/layers/test_with_transforms.py
+++ b/thinc/tests/layers/test_with_transforms.py
@@ -26,8 +26,8 @@ def list_input(shapes):
     for i, x in enumerate(data):
         # Give values that make it easy to see where rows or columns mismatch.
         x += i * 100
-        x += numpy.arange(x.shape[0]).reshape((-1, 1)) * 10 
-        x += numpy.arange(x.shape[1]).reshape((1, -1)) 
+        x += numpy.arange(x.shape[0]).reshape((-1, 1)) * 10
+        x += numpy.arange(x.shape[1]).reshape((1, -1))
     return data
 
 
@@ -68,8 +68,10 @@ def noop_models():
         with_array(noop()),
         with_array2d(noop()),
         with_list(noop()),
-        with_ragged(noop())
+        with_ragged(noop()),
     ]
+
+
 # As an example operation, lets just trim the last dimension. That
 # should catch stuff that confuses the input and output.
 
@@ -180,14 +182,14 @@ def test_noop_transforms(noop_models, ragged_input, padded_input, list_input):
     d_ragged = Ragged(ragged_input.data + 1, ragged_input.lengths)
     d_padded = padded_input.copy()
     d_padded.data += 1
-    d_list = [dx+1 for dx in list_input]
+    d_list = [dx + 1 for dx in list_input]
     for model in noop_models:
         print(model.name)
         check_transform_doesnt_change_noop_values(model, padded_input, d_padded)
         check_transform_doesnt_change_noop_values(model, list_input, d_list)
         check_transform_doesnt_change_noop_values(model, ragged_input, d_ragged)
-    
+
 
 def test_with_array_initialize(ragged_input, padded_input, list_input, array_input):
     for inputs in (ragged_input, padded_input, list_input, array_input):
         check_initialize(get_array_model(), inputs)
diff --git a/thinc/tests/mypy/test_mypy.py b/thinc/tests/mypy/test_mypy.py
index 287043578..e03d1c874 100644
--- a/thinc/tests/mypy/test_mypy.py
+++ b/thinc/tests/mypy/test_mypy.py
@@ -23,6 +23,7 @@ def test_mypy_results(
 ):
     pytest.importorskip("mypy")
     from mypy import api as mypy_api
+
     os.chdir(tmpdir)
     root_dir = Path(__file__).parent
     thinc_root_dir = Path(__file__).parent.parent.parent.parent
diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py
index 710a88d61..75206d240 100644
--- a/thinc/tests/test_loss.py
+++ b/thinc/tests/test_loss.py
@@ -168,7 +168,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names):
     assert d_scores1[1][0] == pytest.approx(0.4, eps)
     assert d_scores1[1][1] == pytest.approx(-0.4, eps)
     # The normalization divides the difference (e.g. 0.4) by the number of seqs
-    d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad(guesses, labels)
+    d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad(
+        guesses, labels
+    )
     d_scores1 = d_scores[0]
     d_scores2 = d_scores[1]
 
@@ -189,7 +191,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names):
     assert d_scores2[0][0] == pytest.approx(0.1, eps)
     assert d_scores2[0][1] == pytest.approx(-0.35, eps)
 
-    loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss(guesses, labels)
+    loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss(
+        guesses, labels
+    )
     assert loss == pytest.approx(1.09, eps)
 
 
@@ -200,9 +204,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names):
     ],
 )
 def test_sequence_categorical_missing_negative(guesses, labels, names):
-    d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names, neg_prefix="!", missing_value="").get_grad(
-        guesses, labels
-    )
+    d_scores = SequenceCategoricalCrossentropy(
+        normalize=False, names=names, neg_prefix="!", missing_value=""
+    ).get_grad(guesses, labels)
     d_scores0 = d_scores[0]
 
     # [0.1, 0.5, 0.6] should be A
@@ -292,8 +296,16 @@ def test_cosine_unmatched():
         ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])),
         ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)),
         ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)),
-        ("SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, ([scores0], [labels0])),
-        ("SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, ([scores0], [labels0])),
+        (
+            "SequenceCategoricalCrossentropy.v2",
+            {"neg_prefix": "!"},
+            ([scores0], [labels0]),
+        ),
+        (
+            "SequenceCategoricalCrossentropy.v3",
+            {"neg_prefix": "!"},
+            ([scores0], [labels0]),
+        ),
         ("L2Distance.v1", {}, (scores0, scores0)),
         (
             "CosineDistance.v1",
diff --git a/thinc/tests/test_serialize.py b/thinc/tests/test_serialize.py
index f3a937c34..b89fc2d94 100644
--- a/thinc/tests/test_serialize.py
+++ b/thinc/tests/test_serialize.py
@@ -55,7 +55,7 @@ def test_simple_model_roundtrip_bytes():
 
 
 def test_simple_model_roundtrip_bytes_length():
-    """ Ensure that serialization of non-initialized weight matrices goes fine """
+    """Ensure that serialization of non-initialized weight matrices goes fine"""
     model1 = Maxout(5, 10, nP=2)
     model2 = Maxout(5, 10, nP=2)
 
@@ -186,7 +186,7 @@ def test_simple_model_can_from_dict():
     assert model.can_from_dict(model_dict)
     # Test check without initialize
     assert Maxout(5, 10, nP=2).can_from_dict(model_dict)
-    # Test not-strict check 
+    # Test not-strict check
     assert not Maxout(10, 5, nP=2).can_from_dict(model_dict)
     assert Maxout(5, nP=2).can_from_dict(model_dict)