From 4b3e6358b6d05cc961fbab80babce14fd613a022 Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Fri, 31 Mar 2023 00:20:02 -0400
Subject: [PATCH 1/4] [*.py] Move `Defaults to` to end of arg docstring and standardise values

---
 keras/applications/convnext.py               |  8 +++----
 keras/applications/efficientnet.py           |  8 +++----
 keras/applications/efficientnet_v2.py        |  8 +++----
 keras/applications/mobilenet_v3.py           |  8 +++----
 keras/applications/regnet.py                 |  8 +++----
 keras/applications/resnet_rs.py              | 14 +++++------
 keras/datasets/imdb.py                       | 12 +++++-----
 keras/datasets/reuters.py                    | 16 ++++++-------
 keras/engine/base_preprocessing_layer.py     |  8 +++----
 keras/engine/training.py                     | 21 +++++++++-------
 keras/estimator/__init__.py                  |  8 +++----
 keras/layers/preprocessing/integer_lookup.py |  2 +-
 keras/layers/preprocessing/normalization.py  |  2 +-
 keras/losses.py                              | 25 ++++++++------------
 keras/metrics/iou_metrics.py                 | 10 ++++----
 keras/metrics/probabilistic_metrics.py       |  4 ++--
 keras/metrics/regression_metrics.py          |  4 ++--
 keras/regularizers.py                        |  6 ++---
 18 files changed, 85 insertions(+), 87 deletions(-)

diff --git a/keras/applications/convnext.py b/keras/applications/convnext.py
index 8304d776e5d7..7915e3339bde 100644
--- a/keras/applications/convnext.py
+++ b/keras/applications/convnext.py
@@ -754,10 +754,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
diff --git a/keras/applications/efficientnet.py b/keras/applications/efficientnet.py
index 619499e671ac..2f699f9d0bc1 100644
--- a/keras/applications/efficientnet.py
+++ b/keras/applications/efficientnet.py
@@ -852,10 +852,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
diff --git a/keras/applications/efficientnet_v2.py b/keras/applications/efficientnet_v2.py
index 910ba4602a07..8a7ed0b7a7b3 100644
--- a/keras/applications/efficientnet_v2.py
+++ b/keras/applications/efficientnet_v2.py
@@ -1342,10 +1342,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
diff --git a/keras/applications/mobilenet_v3.py b/keras/applications/mobilenet_v3.py
index ac61c9970e16..a68a67385746 100644
--- a/keras/applications/mobilenet_v3.py
+++ b/keras/applications/mobilenet_v3.py
@@ -679,10 +679,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
diff --git a/keras/applications/regnet.py b/keras/applications/regnet.py
index b12956e514a7..97d9e3af428c 100644
--- a/keras/applications/regnet.py
+++ b/keras/applications/regnet.py
@@ -1819,10 +1819,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
diff --git a/keras/applications/resnet_rs.py b/keras/applications/resnet_rs.py
index 2aad806b0940..5830ff44852c 100644
--- a/keras/applications/resnet_rs.py
+++ b/keras/applications/resnet_rs.py
@@ -582,9 +582,9 @@ def ResNetRS(
       use on the "top" layer. Ignored unless `include_top=True`. Set
       `classifier_activation=None` to return the logits of the "top" layer.
     include_preprocessing: Boolean, whether to include the preprocessing
-      layer (`Rescaling`) at the bottom of the network. Defaults to `True`.
-      Note- Input image is normalized by ImageNet mean and standard
-      deviation.
+      layer (`Rescaling`) at the bottom of the network. Note: Input image
+      is normalized by ImageNet mean and standard deviation.
+      Defaults to `True`.
 
   Returns:
     A `tf.keras.Model` instance.
@@ -958,10 +958,10 @@ def preprocess_input(x, data_format=None):
 
   Args:
     x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    data_format: Optional data format of the image tensor/array. `None` means
+      the global setting `tf.keras.backend.image_data_format()` is used
+      (unless you changed it, it defaults to "channels_last").{mode}.
+      Defaults to `None`.
 
   Returns:
     Unchanged `numpy.array` or `tf.Tensor`.
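The six hunks above all document the same `preprocess_input` contract. As a quick illustration of the documented `data_format=None` fallback (a sketch, not part of the patch; assumes a standard TensorFlow 2.x install and uses ConvNeXt, whose preprocessing is a pass-through per the docstring above):

import numpy as np
import tensorflow as tf

# data_format=None falls back to tf.keras.backend.image_data_format(),
# which is "channels_last" unless the global setting was changed.
images = np.random.uniform(0, 255, size=(2, 224, 224, 3)).astype("float32")
out = tf.keras.applications.convnext.preprocess_input(images)
print(out.shape)  # (2, 224, 224, 3); returned unchanged for ConvNeXt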
diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py
index ad0f1dca70ec..3d489d3d1c43 100644
--- a/keras/datasets/imdb.py
+++ b/keras/datasets/imdb.py
@@ -58,17 +58,17 @@ def load_data(
       ranked by how often they occur (in the training set) and only
       the `num_words` most frequent words are kept. Any less frequent word
       will appear as `oov_char` value in the sequence data. If None,
-      all words are kept. Defaults to None, so all words are kept.
+      all words are kept. Defaults to `None`.
     skip_top: skip the top N most frequently occurring words
       (which may not be informative). These words will appear as
-      `oov_char` value in the dataset. Defaults to 0, so no words are
-      skipped.
+      `oov_char` value in the dataset. When 0, no words are
+      skipped. Defaults to `0`.
     maxlen: int or None. Maximum sequence length.
-      Any longer sequence will be truncated. Defaults to None, which
-      means no truncation.
+      Any longer sequence will be truncated. None means no truncation.
+      Defaults to `None`.
     seed: int. Seed for reproducible data shuffling.
     start_char: int. The start of a sequence will be marked with this
-      character. Defaults to 1 because 0 is usually the padding character.
+      character. 0 is usually the padding character. Defaults to `1`.
     oov_char: int. The out-of-vocabulary character. Words that were cut
       out because of the `num_words` or `skip_top` limits will
       be replaced with this character.
diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py
index fbc431c068c3..82e1603f624f 100644
--- a/keras/datasets/reuters.py
+++ b/keras/datasets/reuters.py
@@ -65,20 +65,20 @@ def load_data(
       ranked by how often they occur (in the training set) and only
       the `num_words` most frequent words are kept. Any less frequent word
       will appear as `oov_char` value in the sequence data. If None,
-      all words are kept. Defaults to None, so all words are kept.
+      all words are kept. Defaults to `None`.
     skip_top: skip the top N most frequently occurring words
       (which may not be informative). These words will appear as
-      `oov_char` value in the dataset. Defaults to 0, so no words are
-      skipped.
+      `oov_char` value in the dataset. 0 means no words are
+      skipped. Defaults to `0`.
     maxlen: int or None. Maximum sequence length.
-      Any longer sequence will be truncated. Defaults to None, which
-      means no truncation.
+      Any longer sequence will be truncated. None means no truncation.
+      Defaults to `None`.
     test_split: Float between 0 and 1. Fraction of the dataset to be used
-      as test data. Defaults to 0.2, meaning 20% of the dataset is used as
-      test data.
+      as test data. 0.2 means that 20% of the dataset is used as
+      test data. Defaults to `0.2`.
     seed: int. Seed for reproducible data shuffling.
     start_char: int. The start of a sequence will be marked with this
-      character. Defaults to 1 because 0 is usually the padding character.
+      character. 0 is usually the padding character. Defaults to `1`.
     oov_char: int. The out-of-vocabulary character. Words that were cut
       out because of the `num_words` or `skip_top` limits will
       be replaced with this character.
diff --git a/keras/engine/base_preprocessing_layer.py b/keras/engine/base_preprocessing_layer.py
index 56e648ef5251..767f3581efad 100644
--- a/keras/engine/base_preprocessing_layer.py
+++ b/keras/engine/base_preprocessing_layer.py
@@ -140,14 +140,14 @@ def compile(self, run_eagerly=None, steps_per_execution=None):
         """Configures the layer for `adapt`.
 
         Arguments:
-          run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
+          run_eagerly: Bool. If `True`, this `Model`'s
             logic will not be wrapped in a `tf.function`. Recommended to leave
             this as `None` unless your `Model` cannot be run inside a
-            `tf.function`.
-          steps_per_execution: Int. Defaults to 1. The number of batches to run
+            `tf.function`. Defaults to `False`.
+          steps_per_execution: Int. The number of batches to run
             during each `tf.function` call. Running multiple batches inside a
            single `tf.function` call can greatly improve performance on TPUs or
-            small models with a large Python overhead.
+            small models with a large Python overhead. Defaults to 1.
         """
         if steps_per_execution is None:
             steps_per_execution = 1
diff --git a/keras/engine/training.py b/keras/engine/training.py
index fe9c6e0f02fb..6d117f942bcb 100644
--- a/keras/engine/training.py
+++ b/keras/engine/training.py
@@ -673,12 +673,13 @@ def compile(
                 coefficients.
             weighted_metrics: List of metrics to be evaluated and weighted by
                 `sample_weight` or `class_weight` during training and testing.
-            run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s
-                logic will not be wrapped in a `tf.function`. Recommended to leave
-                this as `None` unless your `Model` cannot be run inside a
-                `tf.function`. `run_eagerly=True` is not supported when using
-                `tf.distribute.experimental.ParameterServerStrategy`.
-            steps_per_execution: Int. Defaults to 1. The number of batches to
+            run_eagerly: Bool. If `True`, this `Model`'s logic will not be
+                wrapped in a `tf.function`. Recommended to leave this as `None`
+                unless your `Model` cannot be run inside a `tf.function`.
+                `run_eagerly=True` is not supported when using
+                `tf.distribute.experimental.ParameterServerStrategy`. Defaults to
+                `False`.
+            steps_per_execution: Int. The number of batches to
                 run during each `tf.function` call. Running multiple batches
                 inside a single `tf.function` call can greatly improve performance
                 on TPUs or small models with a large Python overhead. At most, one
@@ -687,7 +688,7 @@ def compile(
                 the size of the epoch. Note that if `steps_per_execution` is set
                 to `N`, `Callback.on_batch_begin` and `Callback.on_batch_end`
                 methods will only be called every `N` batches (i.e. before/after
-                each `tf.function` execution).
+                each `tf.function` execution). Defaults to 1.
             jit_compile: If `True`, compile the model training step with XLA.
                 [XLA](https://www.tensorflow.org/xla) is an optimizing compiler
                 for machine learning.
@@ -708,9 +709,10 @@ def compile(
                 not process the same data. The number of shards should be at
                 least the number of workers for good performance. A value of
                 'auto' turns on exact evaluation and uses a heuristic for the
-                number of shards based on the number of workers. Defaults to 0,
-                meaning no visitation guarantee is provided.
+                number of shards based on the number of workers. A value of 0
+                means no visitation guarantee is provided.
                 NOTE: Custom implementations of `Model.test_step` will be
                 ignored when doing exact evaluation.
+                Defaults to `0`.
             **kwargs: Arguments supported for backwards compatibility only.
         """
         if jit_compile and not tf_utils.can_jit_compile(warn=True):
@@ -3942,7 +3944,8 @@ def _get_compile_args(self, user_metrics=True):
 
         Args:
             user_metrics: Whether to return user-supplied metrics or `Metric`
-                objects. Defaults to returning the user-supplied metrics.
+                objects. If `True`, returns the user-supplied metrics.
+                Defaults to `True`.
 
         Returns:
             Dictionary of arguments that were used when compiling the model.
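The `compile` hunks above move the `run_eagerly` and `steps_per_execution` defaults to the end of their descriptions. A minimal sketch of what those defaults mean in practice (illustrative toy model, not part of the patch; assumes a standard TensorFlow 2.x install):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
# Spelling out the documented defaults: run_eagerly=False keeps train_step
# wrapped in a tf.function, and steps_per_execution=1 runs one batch per
# tf.function call, so this is equivalent to omitting both arguments.
model.compile(
    optimizer="sgd",
    loss="mse",
    run_eagerly=False,
    steps_per_execution=1,
)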
diff --git a/keras/estimator/__init__.py b/keras/estimator/__init__.py index a48cb6df2aa3..dc8a6bc468bf 100644 --- a/keras/estimator/__init__.py +++ b/keras/estimator/__init__.py @@ -114,9 +114,9 @@ def input_fn(): `tempfile.mkdtemp` config: `RunConfig` to config `Estimator`. Allows setting up things in `model_fn` based on configuration such as `num_ps_replicas`, or - `model_dir`. Defaults to `None`. If both `config.model_dir` and the + `model_dir`. If both `config.model_dir` and the `model_dir` argument (above) are specified the `model_dir` **argument** - takes precedence. + takes precedence. Defaults to `None`. checkpoint_format: Sets the format of the checkpoint saved by the estimator when training. May be `saver` or `checkpoint`, depending on whether to save checkpoints from `tf.train.Saver` or @@ -312,9 +312,9 @@ def input_fn(): `tempfile.mkdtemp` config: `RunConfig` to config `Estimator`. Allows setting up things in `model_fn` based on configuration such as `num_ps_replicas`, or - `model_dir`. Defaults to `None`. If both `config.model_dir` and the + `model_dir`. If both `config.model_dir` and the `model_dir` argument (above) are specified the `model_dir` **argument** - takes precedence. + takes precedence. Defaults to `None`. checkpoint_format: Sets the format of the checkpoint saved by the estimator when training. May be `saver` or `checkpoint`, depending on whether to save checkpoints from `tf.compat.v1.train.Saver` or diff --git a/keras/layers/preprocessing/integer_lookup.py b/keras/layers/preprocessing/integer_lookup.py index 8b250c3aabe0..64bee3b48542 100644 --- a/keras/layers/preprocessing/integer_lookup.py +++ b/keras/layers/preprocessing/integer_lookup.py @@ -82,7 +82,7 @@ class IntegerLookup(index_lookup.IndexLookup): vocabulary and instances of the mask token in the input will be dropped. If set to None, no mask term will be added. Defaults to None. oov_token: Only used when `invert` is True. The token to return for OOV - indices. Defaults to -1. + indices. Defaults to `-1`. vocabulary: Optional. Either an array of integers or a string path to a text file. If passing an array, can pass a tuple, list, 1D numpy array, or 1D tensor containing the integer vocbulary terms. If passing a file diff --git a/keras/layers/preprocessing/normalization.py b/keras/layers/preprocessing/normalization.py index 2ff1bb1af0ce..2c4077c6c546 100644 --- a/keras/layers/preprocessing/normalization.py +++ b/keras/layers/preprocessing/normalization.py @@ -52,7 +52,7 @@ class Normalization(base_preprocessing_layer.PreprocessingLayer): example, if shape is `(None, 5)` and `axis=1`, the layer will track 5 separate mean and variance values for the last axis. If `axis` is set to `None`, the layer will normalize all elements in the input by a - scalar mean and variance. Defaults to -1, where the last axis of the + scalar mean and variance. Defaults to `-1`., where the last axis of the input is assumed to be a feature dimension and is normalized per index. Note that in the specific case of batched scalar inputs where the only axis is the batch axis, the default will normalize each index diff --git a/keras/losses.py b/keras/losses.py index adf918a5102d..16f5307fa799 100644 --- a/keras/losses.py +++ b/keras/losses.py @@ -637,7 +637,7 @@ def __init__( towards 0.5. Larger values of `label_smoothing` correspond to heavier smoothing. axis: The axis along which to compute crossentropy (the features - axis). Defaults to -1. + axis). Defaults to `-1`. reduction: Type of `tf.keras.losses.Reduction` to apply to loss. 
Default value is `AUTO`. `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases @@ -899,7 +899,7 @@ def __init__( `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. axis: The axis along which to compute crossentropy (the features - axis). Defaults to -1. + axis). Defaults to `-1`. reduction: Type of `tf.keras.losses.Reduction` to apply to loss. Default value is `AUTO`. `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases @@ -1012,7 +1012,7 @@ class CategoricalFocalCrossentropy(LossFunctionWrapper): `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. axis: The axis along which to compute crossentropy (the features - axis). Defaults to -1. + axis). Defaults to `-1`. reduction: Type of `tf.keras.losses.Reduction` to apply to loss. Default value is `AUTO`. `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases @@ -2084,8 +2084,7 @@ def categorical_crossentropy( label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For example, if `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. - axis: Defaults to -1. The dimension along which the entropy is - computed. + axis: The dimension along which the entropy is computed. Defaults to `-1`. Returns: Categorical crossentropy loss value. @@ -2138,8 +2137,7 @@ def _ragged_tensor_categorical_crossentropy( label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For example, if `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to -1. + axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`. Returns: Categorical crossentropy loss value. @@ -2204,8 +2202,7 @@ def categorical_focal_crossentropy( label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For example, if `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. - axis: Defaults to -1. The dimension along which the entropy is - computed. + axis: The dimension along which the entropy is computed. Defaults to `-1`. Returns: Categorical focal crossentropy loss value. @@ -2284,8 +2281,7 @@ def _ragged_tensor_categorical_focal_crossentropy( label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For example, if `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. - axis: Defaults to -1. The dimension along which the entropy is - computed. + axis: The dimension along which the entropy is computed. Defaults to `-1`. Returns: Categorical focal crossentropy loss value. @@ -2341,8 +2337,7 @@ def sparse_categorical_crossentropy( y_pred: The predicted values. from_logits: Whether `y_pred` is expected to be a logits tensor. By default, we assume that `y_pred` encodes a probability distribution. - axis: Defaults to -1. The dimension along which the entropy is - computed. + axis: The dimension along which the entropy is computed. Defaults to `-1`. ignore_class: Optional integer. The ID of a class to be ignored during loss computation. 
This is useful, for example, in segmentation problems featuring a "void" class (commonly -1 or 255) in segmentation @@ -2412,7 +2407,7 @@ def binary_crossentropy( squeezing them towards 0.5 That is, using `1. - 0.5 * label_smoothing` for the target class and `0.5 * label_smoothing` for the non-target class. - axis: The axis along which the mean is computed. Defaults to -1. + axis: The axis along which the mean is computed. Defaults to `-1`. Returns: Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. @@ -2792,7 +2787,7 @@ class CosineSimilarity(LossFunctionWrapper): Args: axis: The axis along which the cosine similarity is computed - (the features axis). Defaults to -1. + (the features axis). Defaults to `-1`. reduction: Type of `tf.keras.losses.Reduction` to apply to loss. Default value is `AUTO`. `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases this defaults diff --git a/keras/metrics/iou_metrics.py b/keras/metrics/iou_metrics.py index 83aac5b94a18..a78c0f3183bb 100644 --- a/keras/metrics/iou_metrics.py +++ b/keras/metrics/iou_metrics.py @@ -67,7 +67,7 @@ class _IoUBase(base_metric.Metric): sparse_y_pred: Whether predictions are encoded using integers or dense floating point vectors. If `False`, the `tf.argmax` function will be used to determine each sample's most likely associated label. - axis: (Optional) Defaults to -1. The dimension containing the logits. + axis: (Optional) Defaults to `-1`. The dimension containing the logits. """ def __init__( @@ -197,7 +197,7 @@ class IoU(_IoUBase): sparse_y_pred: Whether predictions are encoded using integers or dense floating point vectors. If `False`, the `tf.argmax` function will be used to determine each sample's most likely associated label. - axis: (Optional) Defaults to -1. The dimension containing the logits. + axis: (Optional) Defaults to `-1`. The dimension containing the logits. Standalone usage: @@ -465,7 +465,7 @@ class MeanIoU(IoU): sparse_y_pred: Whether predictions are encoded using integers or dense floating point vectors. If `False`, the `tf.argmax` function will be used to determine each sample's most likely associated label. - axis: (Optional) Defaults to -1. The dimension containing the logits. + axis: (Optional) Defaults to `-1`. The dimension containing the logits. Standalone usage: @@ -581,7 +581,7 @@ class OneHotIoU(IoU): sparse_y_pred: Whether predictions are encoded using natural numbers or probability distribution vectors. If `False`, the `tf.argmax` function will be used to determine each sample's most likely associated label. - axis: (Optional) Defaults to -1. The dimension containing the logits. + axis: (Optional) Defaults to `-1`. The dimension containing the logits. Standalone usage: @@ -695,7 +695,7 @@ class apply. sparse_y_pred: Whether predictions are encoded using natural numbers or probability distribution vectors. If `False`, the `tf.argmax` function will be used to determine each sample's most likely associated label. - axis: (Optional) Defaults to -1. The dimension containing the logits. + axis: (Optional) Defaults to `-1`. The dimension containing the logits. 
Standalone usage: diff --git a/keras/metrics/probabilistic_metrics.py b/keras/metrics/probabilistic_metrics.py index 123b011b9867..7e7b7f1c7bf2 100644 --- a/keras/metrics/probabilistic_metrics.py +++ b/keras/metrics/probabilistic_metrics.py @@ -183,7 +183,7 @@ class CategoricalCrossentropy(base_metric.MeanMetricWrapper): smoothed, meaning the confidence on label values are relaxed. e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for label `0` and `0.9` for label `1`" - axis: (Optional) Defaults to -1. The dimension along which entropy is + axis: (Optional) Defaults to `-1`. The dimension along which entropy is computed. Standalone usage: @@ -261,7 +261,7 @@ class SparseCategoricalCrossentropy(base_metric.MeanMetricWrapper): metric computation. This is useful, for example, in segmentation problems featuring a "void" class (commonly -1 or 255) in segmentation maps. By default (`ignore_class=None`), all classes are considered. - axis: (Optional) Defaults to -1. The dimension along which entropy is + axis: (Optional) Defaults to `-1`. The dimension along which entropy is computed. Standalone usage: diff --git a/keras/metrics/regression_metrics.py b/keras/metrics/regression_metrics.py index 637706432d54..8a148cf60d3c 100644 --- a/keras/metrics/regression_metrics.py +++ b/keras/metrics/regression_metrics.py @@ -138,7 +138,7 @@ class CosineSimilarity(base_metric.MeanMetricWrapper): Args: name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. - axis: (Optional) Defaults to -1. The dimension along which the cosine + axis: (Optional) Defaults to `-1`. The dimension along which the cosine similarity is computed. Standalone usage: @@ -614,7 +614,7 @@ def cosine_similarity(y_true, y_pred, axis=-1): Args: y_true: The ground truth values. y_pred: The prediction values. - axis: (Optional) Defaults to -1. The dimension along which the cosine + axis: (Optional) Defaults to `-1`. The dimension along which the cosine similarity is computed. Returns: diff --git a/keras/regularizers.py b/keras/regularizers.py index f50fc0a6c8bf..1017b58b00cc 100644 --- a/keras/regularizers.py +++ b/keras/regularizers.py @@ -342,11 +342,11 @@ class OrthogonalRegularizer(Regularizer): be proportional to `factor` times the mean of the dot products between the L2-normalized rows (if `mode="rows"`, or columns if `mode="columns"`) of the inputs, excluding the product of each - row/column with itself. Defaults to 0.01. - mode: String, one of `{"rows", "columns"}`. Defaults to `"rows"`. In rows + row/column with itself. Defaults to `0.01`. + mode: String, one of `{"rows", "columns"}`. In rows mode, the regularization effect seeks to make the rows of the input orthogonal to each other. In columns mode, it seeks to make the columns - of the input orthogonal to each other. + of the input orthogonal to each other. 
Defaults to `"rows"`.
 
   Example:

From 977f03daa51aed704eb02663b296e07d1f805cd3 Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Fri, 31 Mar 2023 21:36:20 -0400
Subject: [PATCH 2/4] [*.py] Move `Defaults to` to end of arg docstring and standardise values

---
 keras/applications/mobilenet.py                   |  2 +-
 keras/engine/base_preprocessing_layer.py          |  2 +-
 keras/engine/training.py                          |  2 +-
 keras/initializers/initializers_v1.py             |  2 +-
 keras/layers/activation/softmax.py                |  5 +-
 .../layers/attention/multi_head_attention.py      |  2 +-
 .../layers/convolutional/conv1d_transpose.py      |  2 +-
 .../normalization/group_normalization.py          |  2 +-
 .../normalization/layer_normalization.py          |  5 +-
 .../normalization/unit_normalization.py           |  6 +-
 .../layers/preprocessing/category_encoding.py     |  3 +-
 keras/layers/preprocessing/discretization.py      |  5 +-
 keras/layers/preprocessing/hashed_crossing.py     |  5 +-
 keras/layers/preprocessing/hashing.py             | 16 ++---
 .../preprocessing/image_preprocessing.py          | 22 +++----
 keras/layers/preprocessing/index_lookup.py        |  7 ++-
 keras/layers/preprocessing/integer_lookup.py      |  9 +--
 keras/layers/preprocessing/normalization.py       |  3 +-
 keras/layers/preprocessing/string_lookup.py       |  8 +--
 keras/legacy_tf_layers/migration_utils.py         |  7 ++-
 keras/legacy_tf_layers/variable_scope_shim.py     |  9 +--
 keras/metrics/accuracy_metrics.py                 |  8 +--
 keras/metrics/base_metric.py                      |  4 +-
 keras/metrics/confusion_metrics.py                | 63 ++++++++++---------
 keras/metrics/iou_metrics.py                      | 20 +++---
 keras/metrics/probabilistic_metrics.py            |  8 +--
 keras/metrics/regression_metrics.py               | 19 +++---
 keras/mixed_precision/loss_scale_optimizer.py     | 16 ++---
 28 files changed, 139 insertions(+), 123 deletions(-)

diff --git a/keras/applications/mobilenet.py b/keras/applications/mobilenet.py
index 5e4daa174ec3..84826c1d3d02 100644
--- a/keras/applications/mobilenet.py
+++ b/keras/applications/mobilenet.py
@@ -154,7 +154,7 @@ def MobileNet(
       - `max` means that global max pooling will be applied.
     classes: Optional number of classes to classify images into, only to be
       specified if `include_top` is True, and if no `weights` argument is
-      specified. Defaults to 1000.
+      specified. Defaults to `1000`.
     classifier_activation: A `str` or callable. The activation function to
       use on the "top" layer. Ignored unless `include_top=True`. Set
       `classifier_activation=None` to return the logits of the "top" layer.
diff --git a/keras/engine/base_preprocessing_layer.py b/keras/engine/base_preprocessing_layer.py
index 767f3581efad..bdd32405ee0f 100644
--- a/keras/engine/base_preprocessing_layer.py
+++ b/keras/engine/base_preprocessing_layer.py
@@ -147,7 +147,7 @@ def compile(self, run_eagerly=None, steps_per_execution=None):
           steps_per_execution: Int. The number of batches to run
             during each `tf.function` call. Running multiple batches inside a
             single `tf.function` call can greatly improve performance on TPUs or
-            small models with a large Python overhead. Defaults to 1.
+            small models with a large Python overhead. Defaults to `1`.
         """
         if steps_per_execution is None:
             steps_per_execution = 1
diff --git a/keras/engine/training.py b/keras/engine/training.py
index 6d117f942bcb..5da718f12b2c 100644
--- a/keras/engine/training.py
+++ b/keras/engine/training.py
@@ -688,7 +688,7 @@ def compile(
                 the size of the epoch. Note that if `steps_per_execution` is set
                 to `N`, `Callback.on_batch_begin` and `Callback.on_batch_end`
                 methods will only be called every `N` batches (i.e. before/after
-                each `tf.function` execution). Defaults to 1.
+                each `tf.function` execution). Defaults to `1`.
             jit_compile: If `True`, compile the model training step with XLA.
                 [XLA](https://www.tensorflow.org/xla) is an optimizing compiler
                 for machine learning.
diff --git a/keras/initializers/initializers_v1.py b/keras/initializers/initializers_v1.py
index 9d2d3996e93c..4606cdb2b965 100644
--- a/keras/initializers/initializers_v1.py
+++ b/keras/initializers/initializers_v1.py
@@ -191,7 +191,7 @@ class RandomUniform(tf.compat.v1.random_uniform_initializer):
     minval: A python scalar or a scalar tensor. Lower bound of the range of
       random values to generate.
     maxval: A python scalar or a scalar tensor. Upper bound of the range of
-      random values to generate. Defaults to 1 for float types.
+      random values to generate. Defaults to `1` for float types.
     seed: A Python integer. Used to create random seeds. See
       `tf.compat.v1.set_random_seed` for behavior.
     dtype: Default data type, used if no `dtype` argument is provided when
diff --git a/keras/layers/activation/softmax.py b/keras/layers/activation/softmax.py
index d1c0e04aca99..ee3d19e0662c 100644
--- a/keras/layers/activation/softmax.py
+++ b/keras/layers/activation/softmax.py
@@ -72,8 +72,9 @@ class Softmax(Layer):
       normalization is applied.
   Call arguments:
     inputs: The inputs, or logits to the softmax layer.
-    mask: A boolean mask of the same shape as `inputs`. Defaults to `None`.
-      The mask specifies 1 to keep and 0 to mask.
+    mask: A boolean mask of the same shape as `inputs`. The mask
+      specifies 1 to keep and 0 to mask. Defaults to `None`.
+
 
   Returns:
     softmaxed output with the same shape as `inputs`.
diff --git a/keras/layers/attention/multi_head_attention.py b/keras/layers/attention/multi_head_attention.py
index 0ba235b955b0..e11538c7b780 100644
--- a/keras/layers/attention/multi_head_attention.py
+++ b/keras/layers/attention/multi_head_attention.py
@@ -215,7 +215,7 @@ class MultiHeadAttention(Layer):
       `attention_output` if `False`. Defaults to `False`.
     training: Python boolean indicating whether the layer should behave in
       training mode (adding dropout) or in inference mode (no dropout).
-      Defaults to either using the training mode of the parent
+      Falls back to either the training mode of the parent
       layer/model, or False (inference) if there is no parent layer.
    use_causal_mask: A boolean to indicate whether to apply a causal mask to
      prevent tokens from attending to future tokens (e.g., used in a
diff --git a/keras/layers/convolutional/conv1d_transpose.py b/keras/layers/convolutional/conv1d_transpose.py
index 026ae1d6bc60..e74cff0332c6 100644
--- a/keras/layers/convolutional/conv1d_transpose.py
+++ b/keras/layers/convolutional/conv1d_transpose.py
@@ -54,7 +54,7 @@ class Conv1DTranspose(Conv1D):
     kernel_size: An integer length of the 1D convolution window.
     strides: An integer specifying the stride of the convolution along the
       time dimension. Specifying a stride value != 1 is incompatible with
-      specifying a `dilation_rate` value != 1. Defaults to 1.
+      specifying a `dilation_rate` value != 1. Defaults to `1`.
     padding: one of `"valid"` or `"same"` (case-insensitive).
       `"valid"` means no padding.
`"same"` results in padding with zeros evenly to the left/right or up/down of the input such that output has diff --git a/keras/layers/normalization/group_normalization.py b/keras/layers/normalization/group_normalization.py index 0a4c0cdde2ed..8b71fbb4dd35 100644 --- a/keras/layers/normalization/group_normalization.py +++ b/keras/layers/normalization/group_normalization.py @@ -50,7 +50,7 @@ class GroupNormalization(Layer): Args: groups: Integer, the number of groups for Group Normalization. Can be in the range [1, N] where N is the input dimension. The input dimension - must be divisible by the number of groups. Defaults to 32. + must be divisible by the number of groups. Defaults to `32`. axis: Integer or List/Tuple. The axis or axes to normalize across. Typically this is the features axis/axes. The left-out axes are typically the batch axis/axes. This argument defaults to `-1`, the last diff --git a/keras/layers/normalization/layer_normalization.py b/keras/layers/normalization/layer_normalization.py index 9a07c65b7bf0..9b080dc7eb43 100644 --- a/keras/layers/normalization/layer_normalization.py +++ b/keras/layers/normalization/layer_normalization.py @@ -128,8 +128,9 @@ class LayerNormalization(Layer): center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. Defaults to True. scale: If True, multiply by `gamma`. If False, `gamma` is not used. - Defaults to True. When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling will be done by the next layer. + When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling will be done by the next layer. + Defaults to True. beta_initializer: Initializer for the beta weight. Defaults to zeros. gamma_initializer: Initializer for the gamma weight. Defaults to ones. beta_regularizer: Optional regularizer for the beta weight. None by diff --git a/keras/layers/normalization/unit_normalization.py b/keras/layers/normalization/unit_normalization.py index 843ecb88c4b9..eb1746fdde15 100644 --- a/keras/layers/normalization/unit_normalization.py +++ b/keras/layers/normalization/unit_normalization.py @@ -40,9 +40,9 @@ class UnitNormalization(base_layer.Layer): Args: axis: Integer or list/tuple. The axis or axes to normalize across. - Typically this is the features axis or axes. The left-out axes are - typically the batch axis or axes. Defaults to `-1`, the last dimension - in the input. + Typically, this is the features axis or axes. The left-out axes are + typically the batch axis or axes. `-1` is the last dimension + in the input. Defaults to `-1`. """ def __init__(self, axis=-1, **kwargs): diff --git a/keras/layers/preprocessing/category_encoding.py b/keras/layers/preprocessing/category_encoding.py index 305caa0da420..5b606616f02e 100644 --- a/keras/layers/preprocessing/category_encoding.py +++ b/keras/layers/preprocessing/category_encoding.py @@ -90,7 +90,7 @@ class CategoryEncoding(base_layer.Layer): inputs to the layer must integers in the range `0 <= value < num_tokens`, or an error will be thrown. output_mode: Specification for the output of the layer. - Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or + Values can be `"one_hot"`, `"multi_hot"` or `"count"`, configuring the layer as follows: - `"one_hot"`: Encodes each individual element in the input into an array of `num_tokens` size, containing a 1 at the element index. 
If @@ -105,6 +105,7 @@ class CategoryEncoding(base_layer.Layer): - `"count"`: Like `"multi_hot"`, but the int array contains a count of the number of times the token at that index appeared in the sample. For all output modes, currently only output up to rank 2 is supported. + Defaults to `"multi_hot"`. sparse: Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. diff --git a/keras/layers/preprocessing/discretization.py b/keras/layers/preprocessing/discretization.py index a9693b99e705..eec86b12c3fc 100644 --- a/keras/layers/preprocessing/discretization.py +++ b/keras/layers/preprocessing/discretization.py @@ -164,8 +164,8 @@ class Discretization(base_preprocessing_layer.PreprocessingLayer): 0.01). Higher values of epsilon increase the quantile approximation, and hence result in more unequal buckets, but could improve performance and resource consumption. - output_mode: Specification for the output of the layer. Defaults to - `"int"`. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, or + output_mode: Specification for the output of the layer. Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, or `"count"` configuring the layer as follows: - `"int"`: Return the discretized bin indices directly. - `"one_hot"`: Encodes each individual element in the input into an @@ -180,6 +180,7 @@ class Discretization(base_preprocessing_layer.PreprocessingLayer): will be `(..., num_tokens)`. - `"count"`: As `"multi_hot"`, but the int array contains a count of the number of times the bin index appeared in the sample. + Defaults to `"int"`. sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and `"count"` output modes. If True, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to False. diff --git a/keras/layers/preprocessing/hashed_crossing.py b/keras/layers/preprocessing/hashed_crossing.py index b64e0313261e..06f7bc2190f2 100644 --- a/keras/layers/preprocessing/hashed_crossing.py +++ b/keras/layers/preprocessing/hashed_crossing.py @@ -51,13 +51,14 @@ class HashedCrossing(base_layer.Layer): Args: num_bins: Number of hash bins. - output_mode: Specification for the output of the layer. Defaults to - `"int"`. Values can be `"int"`, or `"one_hot"` configuring the layer as + output_mode: Specification for the output of the layer. Values can be `"int"`, + or `"one_hot"` configuring the layer as follows: - `"int"`: Return the integer bin indices directly. - `"one_hot"`: Encodes each individual element in the input into an array the same size as `num_bins`, containing a 1 at the input's bin index. + Defaults to `"int"`. sparse: Boolean. Only applicable to `"one_hot"` mode. If True, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to False. **kwargs: Keyword arguments to construct a layer. diff --git a/keras/layers/preprocessing/hashing.py b/keras/layers/preprocessing/hashing.py index 84755929dd57..54815ec181b4 100644 --- a/keras/layers/preprocessing/hashing.py +++ b/keras/layers/preprocessing/hashing.py @@ -109,17 +109,16 @@ class Hashing(base_layer.Layer): bin, so the effective number of bins is `(num_bins - 1)` if `mask_value` is set. mask_value: A value that represents masked inputs, which are mapped to - index 0. Defaults to None, meaning no mask term will be added and the - hashing will start at index 0. + index 0. None means no mask term will be added and the + hashing will start at index 0. Defaults to None. salt: A single unsigned integer or None. 
If passed, the hash function used will be SipHash64, with these values
      used as an additional input (known as a "salt" in cryptography).
-      These should be non-zero. Defaults to `None` (in that
-      case, the FarmHash64 hash function is used). It also supports
-      tuple/list of 2 unsigned integer numbers, see reference paper for
-      details.
+      These should be non-zero. If None, uses the FarmHash64 hash function.
+      It also supports tuple/list of 2 unsigned integer numbers, see
+      reference paper for details. Defaults to `None`.
-    output_mode: Specification for the output of the layer. Defaults to
-      `"int"`. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, or
+    output_mode: Specification for the output of the layer. Values can be
+      `"int"`, `"one_hot"`, `"multi_hot"`, or
       `"count"` configuring the layer as follows:
       - `"int"`: Return the integer bin indices directly.
       - `"one_hot"`: Encodes each individual element in the input into an
         array the same size as `num_bins`, containing a 1 at the input's bin
         index. If the last dimension is size 1, will encode on that
         dimension. If the last dimension is not size 1, will append a new
         dimension for the encoded output.
      - `"multi_hot"`: Encodes each sample in the input into a single array
        the same size as `num_bins`, containing a 1 for each bin index
        index present in the sample. Treats the last dimension as the sample
        dimension, if input shape is `(..., sample_length)`, output shape
        will be `(..., num_tokens)`.
      - `"count"`: As `"multi_hot"`, but the int array contains a count of
        the number of times the bin index appeared in the sample.
+      Defaults to `"int"`.
     sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and
       `"count"` output modes. If True, returns a `SparseTensor` instead of
       a dense `Tensor`. Defaults to False.
diff --git a/keras/layers/preprocessing/image_preprocessing.py b/keras/layers/preprocessing/image_preprocessing.py
index c81b3f6e3aec..cf3c8faa81e8 100644
--- a/keras/layers/preprocessing/image_preprocessing.py
+++ b/keras/layers/preprocessing/image_preprocessing.py
@@ -65,9 +65,9 @@ class Resizing(base_layer.Layer):
     height: Integer, the height of the output shape.
     width: Integer, the width of the output shape.
     interpolation: String, the interpolation method.
-      Defaults to `"bilinear"`.
       Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`,
       `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`.
+      Defaults to `"bilinear"`.
     crop_to_aspect_ratio: If True, resize the images without aspect
       ratio distortion. When the original aspect ratio differs from the
       target aspect ratio, the output image will be
@@ -420,9 +420,9 @@ class RandomFlip(base_layer.BaseRandomLayer):
 
   Args:
     mode: String indicating which flip mode to use. Can be `"horizontal"`,
-      `"vertical"`, or `"horizontal_and_vertical"`. Defaults to
-      `"horizontal_and_vertical"`. `"horizontal"` is a left-right flip and
-      `"vertical"` is a top-bottom flip.
+      `"vertical"`, or `"horizontal_and_vertical"`. `"horizontal"` is a
+      left-right flip and `"vertical"` is a top-bottom flip. Defaults to
+      `"horizontal_and_vertical"`.
     seed: Integer. Used to create a random seed.
@@ -1055,9 +1055,9 @@ class RandomZoom(base_layer.BaseRandomLayer):
       result in an output zooming out between 20% to 30%.
       `width_factor=(-0.3, -0.2)` result in an
-      output zooming in between 20% to 30%. Defaults to `None`,
-      i.e., zooming vertical and horizontal directions
-      by preserving the aspect ratio.
+      output zooming in between 20% to 30%. `None` means zooming vertical
+      and horizontal directions by preserving the aspect ratio.
+      Defaults to `None`.
     fill_mode: Points outside the boundaries of the input are filled
       according to the given mode
       (one of `{"constant", "reflect", "wrap", "nearest"}`).
@@ -1377,9 +1377,9 @@ class RandomBrightness(base_layer.BaseRandomLayer):
       will be used for upper bound.
     value_range: Optional list/tuple of 2 floats for the lower and upper limit
-      of the values of the input data. Defaults to [0.0, 255.0].
-      Can be changed to e.g. [0.0, 1.0] if the image input
-      has been scaled before this layer.
+      of the values of the input data.
+      Can be changed to, e.g., [0.0, 1.0] if the image input
+      has been scaled before this layer. Defaults to [0.0, 255.0].
      The brightness adjustment will be scaled to this range, and the
      output values will be clipped to this range.
    seed: optional integer, for fixed RNG behavior.
@@ -1539,9 +1539,9 @@ class RandomHeight(base_layer.BaseRandomLayer):
      `factor=0.2` results in an output with
      height changed by a random amount in the range `[-20%, +20%]`.
    interpolation: String, the interpolation method.
-      Defaults to `"bilinear"`.
      Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`,
      `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`.
+      Defaults to `"bilinear"`.
    seed: Integer. Used to create a random seed.
 
   Input shape:
@@ -1661,9 +1661,9 @@ class RandomWidth(base_layer.BaseRandomLayer):
      `factor=0.2` results in an output with
      width changed by a random amount in the range `[-20%, +20%]`.
    interpolation: String, the interpolation method.
-      Defaults to `bilinear`.
      Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`,
      `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`.
+      Defaults to `bilinear`.
    seed: Integer. Used to create a random seed.
 
   Input shape:
diff --git a/keras/layers/preprocessing/index_lookup.py b/keras/layers/preprocessing/index_lookup.py
index c1c68ecf66af..c57740b087d7 100644
--- a/keras/layers/preprocessing/index_lookup.py
+++ b/keras/layers/preprocessing/index_lookup.py
@@ -135,9 +135,9 @@ class IndexLookup(base_preprocessing_layer.PreprocessingLayer):
     invert: Only valid when `output_mode` is `"int"`. If True, this layer
       will map indices to vocabulary items instead of mapping vocabulary
       items to indices. Default to False.
-    output_mode: Specification for the output of the layer. Defaults to
-      `"int"`. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`,
-      or `"tf_idf"` configuring the layer as follows:
+    output_mode: Specification for the output of the layer. Values can be
+      `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` configuring
+      the layer as follows:
       - `"int"`: Return the raw integer indices of the input tokens.
      - `"one_hot"`: Encodes each individual element in the input into an
        array the same size as the vocabulary, containing a 1 at the element
       index. If the last dimension is size 1, will encode on that
       dimension. If the last dimension is not size 1, will append a new
       dimension for the encoded output.
      - `"multi_hot"`: Encodes each sample in the input into a single array
        the same size as the vocabulary, containing a 1 for each vocabulary
        term present in the sample.
      - `"count"`: Like `"multi_hot"`, but the int array contains a count of
        the number of times the token at that index appeared in the sample.
      - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
        find the value in each token slot.
+      Defaults to `"int"`.
     pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
       `"count"`, or `"tf_idf"`. If True, the output will have its feature
       axis padded to `max_tokens` even if the number of unique tokens in the
diff --git a/keras/layers/preprocessing/integer_lookup.py b/keras/layers/preprocessing/integer_lookup.py
index 64bee3b48542..832ea1338542 100644
--- a/keras/layers/preprocessing/integer_lookup.py
+++ b/keras/layers/preprocessing/integer_lookup.py
@@ -75,7 +75,7 @@ class IntegerLookup(index_lookup.IndexLookup):
     num_oov_indices: The number of out-of-vocabulary tokens to use. If this
       value is more than 1, OOV inputs are modulated to determine their OOV
       value. If this value is 0, OOV inputs will cause an error when calling
-      the layer. Defaults to 1.
+      the layer. Defaults to `1`.
     mask_token: An integer token that represents masked inputs. When
       `output_mode` is `"int"`, the token is included in vocabulary and
       mapped to index 0. In other output modes, the token will not appear in the
      vocabulary and instances of the mask token in the input will be dropped.
      If set to None, no mask term will be added. Defaults to None.
@@ -99,9 +99,9 @@ class IntegerLookup(index_lookup.IndexLookup):
     invert: Only valid when `output_mode` is `"int"`. If True, this layer
       will map indices to vocabulary items instead of mapping vocabulary
       items to indices. Default to False.
-    output_mode: Specification for the output of the layer. Defaults to
-      `"int"`. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`,
-      or `"tf_idf"` configuring the layer as follows:
+    output_mode: Specification for the output of the layer. Values can be
+      `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"`
+      configuring the layer as follows:
       - `"int"`: Return the vocabulary indices of the input tokens.
      - `"one_hot"`: Encodes each individual element in the input into an
        array the same size as the vocabulary, containing a 1 at the element
       index. If the last dimension is size 1, will encode on that
       dimension. If the last dimension is not size 1, will append a new
       dimension for the encoded output.
      - `"multi_hot"`: Encodes each sample in the input into a single array
        the same size as the vocabulary, containing a 1 for each vocabulary
        term present in the sample.
      - `"count"`: Like `"multi_hot"`, but the int array contains a count of
        the number of times the token at that index appeared in the sample.
      - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
        find the value in each token slot.
      For `"int"` output, any shape of input and output is supported. For all
      other output modes, currently only output up to rank 2 is supported.
+      Defaults to `"int"`.
     pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
       `"count"`, or `"tf_idf"`. If True, the output will have its feature
       axis padded to `max_tokens` even if the number of unique tokens in the
diff --git a/keras/layers/preprocessing/normalization.py b/keras/layers/preprocessing/normalization.py
index 2c4077c6c546..c105877d8d64 100644
--- a/keras/layers/preprocessing/normalization.py
+++ b/keras/layers/preprocessing/normalization.py
@@ -52,11 +52,12 @@ class Normalization(base_preprocessing_layer.PreprocessingLayer):
       example, if shape is `(None, 5)` and `axis=1`, the layer will track 5
       separate mean and variance values for the last axis. If `axis` is set
       to `None`, the layer will normalize all elements in the input by a
-      scalar mean and variance. Defaults to `-1`., where the last axis of the
+      scalar mean and variance. When `-1`, the last axis of the
       input is assumed to be a feature dimension and is normalized per
       index. Note that in the specific case of batched scalar inputs where
       the only axis is the batch axis, the default will normalize each index
       in the batch separately. In this case, consider passing `axis=None`.
+      Defaults to `-1`.
     mean: The mean value(s) to use during normalization. The passed value(s)
       will be broadcast to the shape of the kept axes above; if the value(s)
       cannot be broadcast, an error will be raised when this layer's
diff --git a/keras/layers/preprocessing/string_lookup.py b/keras/layers/preprocessing/string_lookup.py
index 4b16dca6f636..d345fe89ff04 100644
--- a/keras/layers/preprocessing/string_lookup.py
+++ b/keras/layers/preprocessing/string_lookup.py
@@ -72,7 +72,7 @@ class StringLookup(index_lookup.IndexLookup):
     num_oov_indices: The number of out-of-vocabulary tokens to use. If this
       value is more than 1, OOV inputs are hashed to determine their OOV
       value. If this value is 0, OOV inputs will cause an error when calling
-      the layer. Defaults to 1.
+      the layer. Defaults to `1`.
     mask_token: A token that represents masked inputs. When `output_mode` is
       `"int"`, the token is included in vocabulary and mapped to index 0. In
       other output modes, the token will not appear in the vocabulary and
@@ -94,9 +94,9 @@ class StringLookup(index_lookup.IndexLookup):
     invert: Only valid when `output_mode` is `"int"`. If True, this layer
       will map indices to vocabulary items instead of mapping vocabulary
       items to indices. Default to False.
-    output_mode: Specification for the output of the layer. Defaults to
-      `"int"`. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`,
-      or `"tf_idf"` configuring the layer as follows:
+    output_mode: Specification for the output of the layer. Values can be
+      `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"`
+      configuring the layer as follows:
       - `"int"`: Return the raw integer indices of the input tokens.
      - `"one_hot"`: Encodes each individual element in the input into an
        array the same size as the vocabulary, containing a 1 at the element
       index. If the last dimension is size 1, will encode on that
       dimension. If the last dimension is not size 1, will append a new
       dimension for the encoded output.
      - `"multi_hot"`: Encodes each sample in the input into a single array
        the same size as the vocabulary, containing a 1 for each vocabulary
        term present in the sample.
      - `"count"`: Like `"multi_hot"`, but the int array contains a count of
        the number of times the token at that index appeared in the sample.
      - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
        find the value in each token slot.
      For `"int"` output, any shape of input and output is supported. For all
      other output modes, currently only output up to rank 2 is supported.
+      Defaults to `"int"`.
     pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
       `"count"`, or `"tf_idf"`. If True, the output will have its feature
       axis padded to `max_tokens` even if the number of unique tokens in the
diff --git a/keras/legacy_tf_layers/migration_utils.py b/keras/legacy_tf_layers/migration_utils.py
index 61dfcf6b9340..932cd51e619e 100644
--- a/keras/legacy_tf_layers/migration_utils.py
+++ b/keras/legacy_tf_layers/migration_utils.py
@@ -46,8 +46,11 @@ class DeterministicRandomTestTool(object):
     """
 
     def __init__(self, seed: int = 42, mode="constant"):
-        """Set mode to 'constant' or 'num_random_ops'. Defaults to
-        'constant'."""
+        """
+        Args:
+          mode: Set mode to 'constant' or 'num_random_ops'. Defaults to
+            'constant'.
+        """
         if mode not in {"constant", "num_random_ops"}:
             raise ValueError(
                 "Mode arg must be 'constant' or 'num_random_ops'. "
diff --git a/keras/legacy_tf_layers/variable_scope_shim.py b/keras/legacy_tf_layers/variable_scope_shim.py
index ed08ac542e32..dd4211e43f22 100644
--- a/keras/legacy_tf_layers/variable_scope_shim.py
+++ b/keras/legacy_tf_layers/variable_scope_shim.py
@@ -231,11 +231,11 @@ def get_variable(
       collections: List of graph collections keys to add the `Variable` to.
         Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`).
       caching_device: Optional device string or function describing where
-        the Variable should be cached for reading. Defaults to the
+        the Variable should be cached for reading. `None` means the
        Variable's device. If not `None`, caches on another device. Typical
        use is to cache on the device where the Ops using the `Variable`
        reside, to deduplicate copying through `Switch` and other
-        conditional statements.
+        conditional statements. Defaults to `None`.
      partitioner: Optional callable that accepts a fully defined
        `TensorShape` and dtype of the `Variable` to be created, and returns
        a list of partitions for each axis (currently only one axis can be
       partitioned).
      validate_shape: If False, allows the variable to be initialized with a
        value of unknown shape. If True, the default, the shape of
        initial_value must be known.
      use_resource: If False, creates a regular Variable. If True, creates
        instead an experimental ResourceVariable which has well-defined
-        semantics. Defaults to False (will later change to True). When eager
-        execution is enabled this argument is always forced to be true.
+        semantics. When eager execution is enabled this argument is always
+        forced to be True. Defaults to False (this default will later
+        change to True).
      custom_getter: Callable that takes as a first argument the true getter,
        and allows overwriting the internal get_variable method.
The signature of `custom_getter` should match that of this method, but diff --git a/keras/metrics/accuracy_metrics.py b/keras/metrics/accuracy_metrics.py index 17cb1849e015..98e130a8efc7 100644 --- a/keras/metrics/accuracy_metrics.py +++ b/keras/metrics/accuracy_metrics.py @@ -261,7 +261,7 @@ class TopKCategoricalAccuracy(base_metric.MeanMetricWrapper): Args: k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. + Defaults to `5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -307,7 +307,7 @@ class SparseTopKCategoricalAccuracy(base_metric.MeanMetricWrapper): Args: k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. + Defaults to `5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -482,7 +482,7 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5): y_true: The ground truth values. y_pred: The prediction values. k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. + Defaults to `5`. Returns: Top K categorical accuracy value. @@ -514,7 +514,7 @@ def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): y_true: tensor of true targets. y_pred: tensor of predicted targets. k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. + Defaults to `5`. Returns: Sparse top K categorical accuracy value. diff --git a/keras/metrics/base_metric.py b/keras/metrics/base_metric.py index af0aa318c99d..7a56b4d13815 100644 --- a/keras/metrics/base_metric.py +++ b/keras/metrics/base_metric.py @@ -471,7 +471,7 @@ def update_state(self, values, sample_weight=None): Args: values: Per-example value. - sample_weight: Optional weighting of each example. Defaults to 1. + sample_weight: Optional weighting of each example. Defaults to `1`. Returns: Update op. @@ -828,7 +828,7 @@ def update_state(self, values, sample_weight=None): Args: values: Per-example value. - sample_weight: Optional weighting of each example. Defaults to 1. + sample_weight: Optional weighting of each example. Defaults to `1`. Returns: Update op. diff --git a/keras/metrics/confusion_metrics.py b/keras/metrics/confusion_metrics.py index 6a1af4ea22fa..80b90622be97 100644 --- a/keras/metrics/confusion_metrics.py +++ b/keras/metrics/confusion_metrics.py @@ -36,11 +36,11 @@ class _ConfusionMatrixConditionCount(base_metric.Metric): Args: confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions. - thresholds: (Optional) Defaults to 0.5. A float value or a python - list/tuple of float threshold values in [0, 1]. A threshold is compared - with prediction values to determine the truth value of predictions + thresholds: (Optional) A float value or a python list/tuple of float + threshold values in [0, 1]. A threshold is compared with prediction + values to determine the truth value of predictions (i.e., above the threshold is `true`, below is `false`). One metric - value is generated for each threshold value. + value is generated for each threshold value. Defaults to 0.5. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. """ @@ -67,9 +67,9 @@ def update_state(self, y_true, y_pred, sample_weight=None): Args: y_true: The ground truth values. y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can + sample_weight: Optional weighting of each example. 
Can be a `Tensor` whose rank is either 0, or the same rank as `y_true`, - and must be broadcastable to `y_true`. + and must be broadcastable to `y_true`. Defaults to `1`. Returns: Update op. @@ -113,13 +113,13 @@ class FalsePositives(_ConfusionMatrixConditionCount): Use `sample_weight` of 0 to mask values. Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python + thresholds: (Optional) A float value, or a Python list/tuple of float threshold values in [0, 1]. A threshold is compared with prediction values to determine the truth value of predictions (i.e., above the threshold is `true`, below is `false`). If used with a loss function that sets `from_logits=True` (i.e. no sigmoid applied to predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. + generated for each threshold value. Defaults to `0.5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -174,13 +174,13 @@ class FalseNegatives(_ConfusionMatrixConditionCount): Use `sample_weight` of 0 to mask values. Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python + thresholds: (Optional) A float value, or a Python list/tuple of float threshold values in [0, 1]. A threshold is compared with prediction values to determine the truth value of predictions (i.e., above the threshold is `true`, below is `false`). If used with a loss function that sets `from_logits=True` (i.e. no sigmoid applied to predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. + generated for each threshold value. Defaults to `0.5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -235,13 +235,13 @@ class TrueNegatives(_ConfusionMatrixConditionCount): Use `sample_weight` of 0 to mask values. Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python + thresholds: (Optional) A float value, or a Python list/tuple of float threshold values in [0, 1]. A threshold is compared with prediction values to determine the truth value of predictions (i.e., above the threshold is `true`, below is `false`). If used with a loss function that sets `from_logits=True` (i.e. no sigmoid applied to predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. + generated for each threshold value. Defaults to `0.5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -296,13 +296,13 @@ class TruePositives(_ConfusionMatrixConditionCount): Use `sample_weight` of 0 to mask values. Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python + thresholds: (Optional) A float value, or a Python list/tuple of float threshold values in [0, 1]. A threshold is compared with prediction values to determine the truth value of predictions (i.e., above the threshold is `true`, below is `false`). If used with a loss function that sets `from_logits=True` (i.e. no sigmoid applied to predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. + generated for each threshold value. Defaults to `0.5`. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -460,9 +460,9 @@ def update_state(self, y_true, y_pred, sample_weight=None): Will be cast to `bool`. y_pred: The predicted values. Each element must be in the range `[0, 1]`. 
- sample_weight: Optional weighting of each example. Defaults to 1. Can + sample_weight: Optional weighting of each example. Can be a `Tensor` whose rank is either 0, or the same rank as `y_true`, - and must be broadcastable to `y_true`. + and must be broadcastable to `y_true`. Defaults to `1`. Returns: Update op. @@ -606,9 +606,9 @@ def update_state(self, y_true, y_pred, sample_weight=None): Will be cast to `bool`. y_pred: The predicted values. Each element must be in the range `[0, 1]`. - sample_weight: Optional weighting of each example. Defaults to 1. Can + sample_weight: Optional weighting of each example. Can be a `Tensor` whose rank is either 0, or the same rank as `y_true`, - and must be broadcastable to `y_true`. + and must be broadcastable to `y_true`. Defaults to `1`. Returns: Update op. @@ -702,9 +702,9 @@ def update_state(self, y_true, y_pred, sample_weight=None): Args: y_true: The ground truth values. y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can + sample_weight: Optional weighting of each example. Can be a `Tensor` whose rank is either 0, or the same rank as `y_true`, - and must be broadcastable to `y_true`. + and must be broadcastable to `y_true`. Defaults to `1`. Returns: Update op. @@ -798,8 +798,8 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): Args: specificity: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given specificity. + num_thresholds: (Optional) The number of thresholds to + use for matching the given specificity. Defaults to `200`. class_id: (Optional) Integer class ID for which we want binary metrics. This must be in the half-open interval `[0, num_classes)`, where `num_classes` is the last dimension of predictions. @@ -903,8 +903,8 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): Args: sensitivity: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given sensitivity. + num_thresholds: (Optional) The number of thresholds to + use for matching the given sensitivity. Defaults to `200`. class_id: (Optional) Integer class ID for which we want binary metrics. This must be in the half-open interval `[0, num_classes)`, where `num_classes` is the last dimension of predictions. @@ -999,8 +999,8 @@ class PrecisionAtRecall(SensitivitySpecificityBase): Args: recall: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given recall. + num_thresholds: (Optional) The number of thresholds to + use for matching the given recall. Defaults to `200`. class_id: (Optional) Integer class ID for which we want binary metrics. This must be in the half-open interval `[0, num_classes)`, where `num_classes` is the last dimension of predictions. @@ -1090,8 +1090,8 @@ class RecallAtPrecision(SensitivitySpecificityBase): Args: precision: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given precision. + num_thresholds: (Optional) The number of thresholds to + use for matching the given precision. Defaults to `200`. class_id: (Optional) Integer class ID for which we want binary metrics. This must be in the half-open interval `[0, num_classes)`, where `num_classes` is the last dimension of predictions. 
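Throughout `confusion_metrics.py` the `thresholds` argument behaves the same way: one metric value is generated per threshold. A minimal sketch against the public `tf.keras` API (the expected output in the comment is derived from the documented semantics, not from this patch):

    import tensorflow as tf

    # Predictions above a threshold count as positive, below as negative,
    # so a list of three thresholds yields three false-positive counts.
    m = tf.keras.metrics.FalsePositives(thresholds=[0.25, 0.5, 0.75])
    m.update_state(y_true=[0, 0, 1, 1], y_pred=[0.1, 0.6, 0.3, 0.9])
    print(m.result().numpy())  # expected: [1. 1. 0.]
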
@@ -1209,8 +1209,9 @@ class AUC(base_metric.Metric):
  Use `sample_weight` of 0 to mask values.

  Args:
-    num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+    num_thresholds: (Optional) The number of thresholds to
      use when discretizing the roc curve. Values must be > 1.
+      Defaults to `200`.
    curve: (Optional) Specifies the name of the curve to be computed, 'ROC'
      [default] or 'PR' for the Precision-Recall-curve.
    summation_method: (Optional) Specifies the [Riemann summation method](
@@ -1442,9 +1443,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
    Args:
      y_true: The ground truth values.
      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can
+      sample_weight: Optional weighting of each example. Can
        be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-        and must be broadcastable to `y_true`.
+        and must be broadcastable to `y_true`. Defaults to `1`.

    Returns:
      Update op.
diff --git a/keras/metrics/iou_metrics.py b/keras/metrics/iou_metrics.py
index a78c0f3183bb..b3fe12fa2af0 100644
--- a/keras/metrics/iou_metrics.py
+++ b/keras/metrics/iou_metrics.py
@@ -67,7 +67,8 @@ class _IoUBase(base_metric.Metric):
    sparse_y_pred: Whether predictions are encoded using integers or dense
      floating point vectors. If `False`, the `tf.argmax` function will be
      used to determine each sample's most likely associated label.
-    axis: (Optional) Defaults to `-1`. The dimension containing the logits.
+    axis: (Optional) The dimension containing the logits.
+      Defaults to `-1`.
  """

  def __init__(
@@ -100,9 +101,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
    Args:
      y_true: The ground truth values.
      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can
+      sample_weight: Optional weighting of each example. Can
        be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-        and must be broadcastable to `y_true`.
+        and must be broadcastable to `y_true`. Defaults to `1`.

    Returns:
      Update op.
@@ -197,7 +198,8 @@ class IoU(_IoUBase):
    sparse_y_pred: Whether predictions are encoded using integers or dense
      floating point vectors. If `False`, the `tf.argmax` function will be
      used to determine each sample's most likely associated label.
-    axis: (Optional) Defaults to `-1`. The dimension containing the logits.
+    axis: (Optional) The dimension containing the logits.
+      Defaults to `-1`.

  Standalone usage:
@@ -405,9 +407,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
    Args:
      y_true: The ground truth values.
      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can
+      sample_weight: Optional weighting of each example. Can
        be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-        and must be broadcastable to `y_true`.
+        and must be broadcastable to `y_true`. Defaults to `1`.

    Returns:
      Update op.
@@ -465,7 +467,7 @@ class MeanIoU(IoU):
    sparse_y_pred: Whether predictions are encoded using integers or dense
      floating point vectors. If `False`, the `tf.argmax` function will be
      used to determine each sample's most likely associated label.
-    axis: (Optional) Defaults to `-1`. The dimension containing the logits.
+    axis: (Optional) The dimension containing the logits. Defaults to `-1`.

  Standalone usage:
@@ -581,7 +583,7 @@ class OneHotIoU(IoU):
    sparse_y_pred: Whether predictions are encoded using natural numbers or
      probability distribution vectors.
      If `False`, the `tf.argmax` function will be used to determine each
      sample's most likely associated label.
-    axis: (Optional) Defaults to `-1`. The dimension containing the logits.
+    axis: (Optional) The dimension containing the logits. Defaults to `-1`.

  Standalone usage:
@@ -695,7 +697,7 @@ class apply.
    sparse_y_pred: Whether predictions are encoded using natural numbers or
      probability distribution vectors. If `False`, the `tf.argmax` function
      will be used to determine each sample's most likely associated label.
-    axis: (Optional) Defaults to `-1`. The dimension containing the logits.
+    axis: (Optional) The dimension containing the logits. Defaults to `-1`.

  Standalone usage:
diff --git a/keras/metrics/probabilistic_metrics.py b/keras/metrics/probabilistic_metrics.py
index 7e7b7f1c7bf2..3be4f43e3f12 100644
--- a/keras/metrics/probabilistic_metrics.py
+++ b/keras/metrics/probabilistic_metrics.py
@@ -183,8 +183,8 @@ class CategoricalCrossentropy(base_metric.MeanMetricWrapper):
      smoothed, meaning the confidence on label values are relaxed. e.g.
      `label_smoothing=0.2` means that we will use a value of `0.1` for label
      `0` and `0.9` for label `1`"
-    axis: (Optional) Defaults to `-1`. The dimension along which entropy is
-      computed.
+    axis: (Optional) The dimension along which entropy is
+      computed. Defaults to `-1`.

  Standalone usage:
@@ -261,8 +261,8 @@ class SparseCategoricalCrossentropy(base_metric.MeanMetricWrapper):
      metric computation. This is useful, for example, in segmentation
      problems featuring a "void" class (commonly -1 or 255) in segmentation
      maps. By default (`ignore_class=None`), all classes are considered.
-    axis: (Optional) Defaults to `-1`. The dimension along which entropy is
-      computed.
+    axis: (Optional) The dimension along which entropy is
+      computed. Defaults to `-1`.

  Standalone usage:
diff --git a/keras/metrics/regression_metrics.py b/keras/metrics/regression_metrics.py
index 8a148cf60d3c..4e2528ca5cfc 100644
--- a/keras/metrics/regression_metrics.py
+++ b/keras/metrics/regression_metrics.py
@@ -84,9 +84,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
    Args:
      y_true: The ground truth values.
      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can
+      sample_weight: Optional weighting of each example. Can
        be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-        and must be broadcastable to `y_true`.
+        and must be broadcastable to `y_true`. Defaults to `1`.

    Returns:
      Update op.
@@ -138,8 +138,8 @@ class CosineSimilarity(base_metric.MeanMetricWrapper):
  Args:
    name: (Optional) string name of the metric instance.
    dtype: (Optional) data type of the metric result.
-    axis: (Optional) Defaults to `-1`. The dimension along which the cosine
-      similarity is computed.
+    axis: (Optional) The dimension along which the cosine
+      similarity is computed. Defaults to `-1`.

  Standalone usage:
@@ -357,9 +357,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
    Args:
      y_true: The ground truth values.
      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can
+      sample_weight: Optional weighting of each example. Can
        be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-        and must be broadcastable to `y_true`.
+        and must be broadcastable to `y_true`. Defaults to `1`.

    Returns:
      Update op.
@@ -443,7 +443,8 @@ class R2Score(base_metric.Metric):
      `None` (no aggregation), `"uniform_average"`,
      `"variance_weighted_average"`.
      num_regressors: Number of independent regressors used
-        ("Adjusted R2" score). Defaults to 0 (standard R2 score).
+        ("Adjusted R2" score). `0` gives the standard R2 score.
+        Defaults to `0`.
      name: Optional. string name of the metric instance.
      dtype: Optional. data type of the metric result.
@@ -614,8 +615,8 @@ def cosine_similarity(y_true, y_pred, axis=-1):
  Args:
    y_true: The ground truth values.
    y_pred: The prediction values.
-    axis: (Optional) Defaults to `-1`. The dimension along which the cosine
-      similarity is computed.
+    axis: (Optional) The dimension along which the cosine
+      similarity is computed. Defaults to `-1`.

  Returns:
    Cosine similarity value.
diff --git a/keras/mixed_precision/loss_scale_optimizer.py b/keras/mixed_precision/loss_scale_optimizer.py
index b1a95abae279..52d7f968a5ab 100644
--- a/keras/mixed_precision/loss_scale_optimizer.py
+++ b/keras/mixed_precision/loss_scale_optimizer.py
@@ -406,14 +406,14 @@ class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass):
  Args:
    inner_optimizer: The `tf.keras.optimizers.Optimizer` or
      `tf.keras.optimizers.experimental.Optimizer` instance to wrap.
-    dynamic: Bool indicating whether dynamic loss scaling is used. Defaults to
-      True. If True, the loss scale will be dynamically updated over time
-      using an algorithm that keeps the loss scale at approximately its
-      optimal value. If False, a single fixed loss scale is used and
-      `initial_scale` must be specified, which is used as the loss scale.
+    dynamic: Bool indicating whether dynamic loss scaling is used. If True,
+      the loss scale will be dynamically updated over time using an algorithm
+      that keeps the loss scale at approximately its optimal value. If False,
+      a single fixed loss scale is used and `initial_scale` must be specified,
+      which is used as the loss scale.
      Recommended to keep as True, as choosing a fixed loss scale can be
      tricky. Currently, there is a small performance overhead to dynamic loss
-      scaling compared to fixed loss scaling.
+      scaling compared to fixed loss scaling. Defaults to True.
    initial_scale: The initial loss scale. If `dynamic` is True, this defaults
      to `2 ** 15`. If `dynamic` is False, this must be specified and acts as
      the sole loss scale, as the loss scale does not change over time. When
@@ -422,11 +422,11 @@ class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass):
      quickly than a loss scale that is too low gets raised.
    dynamic_growth_steps: With dynamic loss scaling, every
      `dynamic_growth_steps` steps with finite gradients, the loss scale is
-      doubled. Defaults to 2000. If a nonfinite gradient is encountered, the
+      doubled. If a nonfinite gradient is encountered, the
      count is reset back to zero, gradients are skipped that step, and the
      loss scale is halved. The count can be queried with
      `LossScaleOptimizer.dynamic_counter`. This argument can only be
-      specified if `dynamic` is True.
+      specified if `dynamic` is True. Defaults to `2000`.

  `LossScaleOptimizer` will occasionally skip applying gradients to the
  variables, in which case the trainable variables will not change that step.
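The dynamic loss-scaling behaviour documented above can be checked directly; a minimal sketch, assuming TensorFlow 2.x with the public `tf.keras.mixed_precision` API:

    import tensorflow as tf

    # dynamic=True (the default): the scale starts at 2 ** 15, doubles after
    # every `dynamic_growth_steps` consecutive finite steps, and is halved
    # (with the counter reset) whenever a nonfinite gradient appears.
    opt = tf.keras.mixed_precision.LossScaleOptimizer(
        tf.keras.optimizers.SGD(), dynamic=True, dynamic_growth_steps=2000
    )
    print(opt.initial_scale)    # 32768.0, i.e. 2 ** 15
    print(opt.dynamic_counter)  # 0: no finite steps counted yet
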
From cd053cc6a42202ad36667c2609c7f3465b0ffe35 Mon Sep 17 00:00:00 2001 From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com> Date: Sat, 1 Apr 2023 22:24:14 -0400 Subject: [PATCH 3/4] [*.py] Move `Defaults to` to end of arg docstring and standardise values --- keras/applications/convnext.py | 12 ++++----- keras/applications/efficientnet.py | 8 +++--- keras/applications/efficientnet_v2.py | 11 ++++---- keras/applications/imagenet_utils.py | 11 ++++---- keras/applications/inception_v3.py | 8 +++--- keras/applications/mobilenet.py | 15 ++++++----- keras/applications/regnet.py | 12 ++++----- keras/applications/resnet_rs.py | 11 ++++---- keras/backend.py | 8 +++--- keras/datasets/imdb.py | 2 +- keras/datasets/reuters.py | 2 +- keras/engine/base_layer.py | 6 ++--- keras/engine/base_layer_v1.py | 2 +- keras/engine/data_adapter.py | 4 +-- keras/engine/input_layer.py | 4 +-- keras/engine/training.py | 10 +++---- keras/layers/activation/leaky_relu.py | 2 +- keras/layers/activation/relu.py | 6 ++--- keras/layers/activation/softmax.py | 4 +-- keras/layers/attention/additive_attention.py | 4 +-- keras/layers/attention/attention.py | 2 +- .../normalization/group_normalization.py | 7 ++--- .../normalization/layer_normalization.py | 4 +-- keras/layers/preprocessing/discretization.py | 2 +- keras/layers/preprocessing/hashed_crossing.py | 7 +++-- keras/layers/preprocessing/hashing.py | 4 +-- .../preprocessing/image_preprocessing_test.py | 4 +-- keras/layers/preprocessing/index_lookup.py | 8 +++--- keras/layers/preprocessing/integer_lookup.py | 8 +++--- keras/layers/preprocessing/string_lookup.py | 9 ++++--- .../preprocessing/text_vectorization.py | 6 ++--- keras/legacy_tf_layers/variable_scope_shim.py | 2 +- keras/losses.py | 9 ++++--- keras/mixed_precision/loss_scale_optimizer.py | 6 ++--- keras/optimizers/adadelta.py | 7 ++--- keras/optimizers/adafactor.py | 22 ++++++++-------- keras/optimizers/adagrad.py | 2 +- keras/optimizers/adam.py | 9 ++++--- keras/optimizers/adamax.py | 2 +- keras/optimizers/adamw.py | 11 ++++---- keras/optimizers/ftrl.py | 6 ++--- keras/optimizers/legacy/adadelta.py | 2 +- keras/optimizers/legacy/adagrad.py | 2 +- keras/optimizers/legacy/adam.py | 17 ++++++------ keras/optimizers/legacy/ftrl.py | 6 ++--- keras/optimizers/legacy/gradient_descent.py | 6 ++--- keras/optimizers/legacy/optimizer_v2.py | 4 +-- keras/optimizers/legacy/rmsprop.py | 15 ++++++----- .../optimizers/legacy_learning_rate_decay.py | 9 ++++--- keras/optimizers/lion.py | 2 +- keras/optimizers/nadam.py | 9 ++++--- keras/optimizers/optimizer.py | 4 +-- keras/optimizers/rmsprop.py | 6 ++--- .../schedules/learning_rate_schedule.py | 5 ++-- keras/optimizers/sgd.py | 7 ++--- keras/preprocessing/image.py | 4 +-- keras/saving/legacy/save.py | 6 ++--- keras/saving/legacy/saved_model/save.py | 6 ++--- keras/saving/saving_api.py | 10 +++---- keras/saving/serialization_lib.py | 2 +- keras/testing_infra/test_combinations.py | 15 ++++++----- keras/testing_infra/test_utils.py | 5 ++-- keras/utils/audio_dataset.py | 2 +- keras/utils/conv_utils.py | 4 +-- keras/utils/dataset_utils.py | 8 +++--- keras/utils/feature_space.py | 6 ++--- keras/utils/image_dataset.py | 6 ++--- keras/utils/image_utils.py | 26 +++++++++---------- keras/utils/layer_utils.py | 4 +-- keras/utils/metrics_utils.py | 2 +- keras/utils/text_dataset.py | 2 +- 71 files changed, 250 insertions(+), 231 deletions(-) diff --git a/keras/applications/convnext.py b/keras/applications/convnext.py index 7915e3339bde..5eb983d90a09 100644 --- 
a/keras/applications/convnext.py
+++ b/keras/applications/convnext.py
@@ -124,7 +124,7 @@
  Args:
    include_top: Whether to include the fully-connected
-      layer at the top of the network. Defaults to True.
+      layer at the top of the network. Defaults to `True`.
    weights: One of `None` (random initialization),
      `"imagenet"` (pre-training on ImageNet-1k), or the path to the weights
      file to be loaded. Defaults to `"imagenet"`.
@@ -135,7 +135,7 @@
      if `include_top` is False.
      It should have exactly 3 inputs channels.
    pooling: Optional pooling mode for feature extraction
-      when `include_top` is `False`. Defaults to None.
+      when `include_top` is `False`.
      - `None` means that the output of the model will be
          the 4D tensor output of the last convolutional layer.
      - `avg` means that global average pooling
@@ -144,16 +144,16 @@
          the output of the model will be a 2D tensor.
      - `max` means that global max pooling will
          be applied.
+      Defaults to `None`.
    classes: Optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified. Defaults to 1000 (number of
-      ImageNet classes).
+      if no `weights` argument is specified. 1000 is the number of
+      ImageNet classes. Defaults to `1000`.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
-      Defaults to `"softmax"`.
      When loading pretrained weights, `classifier_activation` can only
-      be `None` or `"softmax"`.
+      be `None` or `"softmax"`. Defaults to `"softmax"`.

  Returns:
    A `keras.Model` instance.
diff --git a/keras/applications/efficientnet.py b/keras/applications/efficientnet.py
index 2f699f9d0bc1..775a19153ec7 100644
--- a/keras/applications/efficientnet.py
+++ b/keras/applications/efficientnet.py
@@ -192,7 +192,7 @@
  Args:
    include_top: Whether to include the fully-connected
-      layer at the top of the network. Defaults to True.
+      layer at the top of the network. Defaults to `True`.
    weights: One of `None` (random initialization),
      'imagenet' (pre-training on ImageNet),
      or the path to the weights file to be loaded. Defaults to 'imagenet'.
@@ -203,7 +203,7 @@
      if `include_top` is False.
      It should have exactly 3 inputs channels.
    pooling: Optional pooling mode for feature extraction
-      when `include_top` is `False`. Defaults to None.
+      when `include_top` is `False`. Defaults to `None`.
      - `None` means that the output of the model will be
          the 4D tensor output of the
          last convolutional layer.
@@ -215,8 +215,8 @@
          be applied.
    classes: Optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified. Defaults to 1000 (number of
-      ImageNet classes).
+      if no `weights` argument is specified. 1000 is the number of
+      ImageNet classes. Defaults to `1000`.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
diff --git a/keras/applications/efficientnet_v2.py b/keras/applications/efficientnet_v2.py
index 8a7ed0b7a7b3..82a7b3a6efd6 100644
--- a/keras/applications/efficientnet_v2.py
+++ b/keras/applications/efficientnet_v2.py
@@ -574,7 +574,7 @@
  Args:
    include_top: Boolean, whether to include the fully-connected
-      layer at the top of the network. Defaults to True.
+      layer at the top of the network. Defaults to `True`.
    weights: One of `None` (random initialization),
      `"imagenet"` (pre-training on ImageNet),
      or the path to the weights file to be loaded. Defaults to `"imagenet"`.
@@ -585,7 +585,7 @@
      if `include_top` is False.
      It should have exactly 3 inputs channels.
    pooling: Optional pooling mode for feature extraction
-      when `include_top` is `False`. Defaults to None.
+      when `include_top` is `False`.
      - `None` means that the output of the model will be
          the 4D tensor output of the
          last convolutional layer.
@@ -595,16 +595,17 @@
          the output of the model will be a 2D tensor.
      - `"max"` means that global max pooling will
          be applied.
+      Defaults to `None`.
    classes: Optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified. Defaults to 1000 (number of
-      ImageNet classes).
+      if no `weights` argument is specified. 1000 is the number of
+      ImageNet classes. Defaults to `1000`.
    classifier_activation: A string or callable. The activation function to
      use on the `"top"` layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
-      Defaults to `"softmax"`.
      When loading pretrained weights, `classifier_activation` can only be
      `None` or `"softmax"`.
+      Defaults to `"softmax"`.

  Returns:
    A `keras.Model` instance.
diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py
index cc58b47c7628..12b745c8229c 100644
--- a/keras/applications/imagenet_utils.py
+++ b/keras/applications/imagenet_utils.py
@@ -56,10 +56,10 @@
      The preprocessed data are written over the input data
      if the data types are compatible. To avoid this
      behaviour, `numpy.copy(x)` can be used.
-  data_format: Optional data format of the image tensor/array. Defaults to
-    None, in which case the global setting
-    `tf.keras.backend.image_data_format()` is used (unless you changed it,
-    it defaults to "channels_last").{mode}
+  data_format: Optional data format of the image tensor/array. `None` means
+    the global setting `tf.keras.backend.image_data_format()` is used
+    (unless you changed it, it defaults to "channels_last").{mode}
+    Defaults to `None`.

  Returns:
      Preprocessed `numpy.array` or a `tf.Tensor` with type `float32`.
@@ -70,7 +70,7 @@
  """

PREPROCESS_INPUT_MODE_DOC = """
-  mode: One of "caffe", "tf" or "torch". Defaults to "caffe".
+  mode: One of "caffe", "tf" or "torch".
    - caffe: will convert the images from RGB to BGR,
        then will zero-center each color channel with
        respect to the ImageNet dataset,
@@ -80,6 +80,7 @@
    - torch: will scale pixels between 0 and 1 and then
        will normalize each channel with respect to the
        ImageNet dataset.
+    Defaults to "caffe".
  """

PREPROCESS_INPUT_DEFAULT_ERROR_DOC = """
diff --git a/keras/applications/inception_v3.py b/keras/applications/inception_v3.py
index 4433325538d5..381192646fc0 100644
--- a/keras/applications/inception_v3.py
+++ b/keras/applications/inception_v3.py
@@ -82,13 +82,13 @@ def InceptionV3(
  Args:
    include_top: Boolean, whether to include the fully-connected
-      layer at the top, as the last layer of the network. Default to `True`.
+      layer at the top, as the last layer of the network. Defaults to `True`.
    weights: One of `None` (random initialization),
      `imagenet` (pre-training on ImageNet),
-      or the path to the weights file to be loaded. Default to `imagenet`.
+      or the path to the weights file to be loaded. Defaults to `imagenet`.
    input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`)
      to use as image input for the model.
      `input_tensor` is useful for
-      sharing inputs between multiple different networks. Default to None.
+      sharing inputs between multiple different networks. Defaults to `None`.
    input_shape: Optional shape tuple, only to be specified
      if `include_top` is False (otherwise the input shape
      has to be `(299, 299, 3)` (with `channels_last` data format)
@@ -108,7 +108,7 @@ def InceptionV3(
      - `max` means that global max pooling will be applied.
    classes: optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified. Default to 1000.
+      if no `weights` argument is specified. Defaults to `1000`.
    classifier_activation: A `str` or callable. The activation function to
      use on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
diff --git a/keras/applications/mobilenet.py b/keras/applications/mobilenet.py
index 84826c1d3d02..0232fd837ce8 100644
--- a/keras/applications/mobilenet.py
+++ b/keras/applications/mobilenet.py
@@ -124,25 +124,26 @@ def MobileNet(
      `channels_last` data format) or (3, 224, 224) (with `channels_first`
      data format). It should have exactly 3 inputs channels, and width and
      height should be no smaller than 32. E.g. `(200, 200, 3)` would be one
-      valid value. Default to `None`.
+      valid value. Defaults to `None`.
      `input_shape` will be ignored if the `input_tensor` is provided.
    alpha: Controls the width of the network. This is known as the width
      multiplier in the MobileNet paper.
      - If `alpha` < 1.0, proportionally decreases the number
          of filters in each layer.
      - If `alpha` > 1.0, proportionally increases the number
          of filters in each layer.
      - If `alpha` = 1, default number of filters from the paper are used at each
-      layer. Default to 1.0.
+      layer. Defaults to `1.0`.
    depth_multiplier: Depth multiplier for depthwise convolution. This is
-      called the resolution multiplier in the MobileNet paper. Default to 1.0.
-    dropout: Dropout rate. Default to 0.001.
+      called the resolution multiplier in the MobileNet paper.
+      Defaults to `1.0`.
+    dropout: Dropout rate. Defaults to `0.001`.
    include_top: Boolean, whether to include the fully-connected layer at the
-      top of the network. Default to `True`.
+      top of the network. Defaults to `True`.
    weights: One of `None` (random initialization), 'imagenet' (pre-training
-      on ImageNet), or the path to the weights file to be loaded. Default to
+      on ImageNet), or the path to the weights file to be loaded. Defaults to
      `imagenet`.
    input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to
      use as image input for the model. `input_tensor` is useful for sharing
-      inputs between multiple different networks. Default to None.
+      inputs between multiple different networks. Defaults to `None`.
    pooling: Optional pooling mode for feature extraction when `include_top`
      is `False`.
      - `None` (default) means that the output of the model will be
diff --git a/keras/applications/regnet.py b/keras/applications/regnet.py
index 97d9e3af428c..e05071533712 100644
--- a/keras/applications/regnet.py
+++ b/keras/applications/regnet.py
@@ -342,7 +342,7 @@
  Args:
    include_top: Whether to include the fully-connected
-      layer at the top of the network. Defaults to True.
+      layer at the top of the network. Defaults to `True`.
    weights: One of `None` (random initialization),
      `"imagenet"` (pre-training on ImageNet),
      or the path to the weights file to be loaded. Defaults to `"imagenet"`.
@@ -353,7 +353,7 @@
      if `include_top` is False.
      It should have exactly 3 inputs channels.
    pooling: Optional pooling mode for feature extraction
-      when `include_top` is `False`. Defaults to None.
+      when `include_top` is `False`.
      - `None` means that the output of the model will be
          the 4D tensor output of the
          last convolutional layer.
@@ -363,16 +363,16 @@
          the output of the model will be a 2D tensor.
      - `max` means that global max pooling will
          be applied.
+      Defaults to `None`.
    classes: Optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified. Defaults to 1000 (number of
-      ImageNet classes).
+      if no `weights` argument is specified. 1000 is the number of
+      ImageNet classes. Defaults to `1000`.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
-      Defaults to `"softmax"`.
      When loading pretrained weights, `classifier_activation` can only
-      be `None` or `"softmax"`.
+      be `None` or `"softmax"`. Defaults to `"softmax"`.

  Returns:
    A `keras.Model` instance.
diff --git a/keras/applications/resnet_rs.py b/keras/applications/resnet_rs.py
index 5830ff44852c..a7c29b7a61a3 100644
--- a/keras/applications/resnet_rs.py
+++ b/keras/applications/resnet_rs.py
@@ -196,9 +196,9 @@
      `classifier_activation=None` to return the logits of the "top"
      layer.
    include_preprocessing: Boolean, whether to include the preprocessing
-      layer (`Rescaling`) at the bottom of the network. Defaults to
-      `True`. Note: Input image is normalized by ImageNet mean and
-      standard deviation.
+      layer (`Rescaling`) at the bottom of the network. Note: Input image
+      is normalized by ImageNet mean and standard deviation.
+      Defaults to `True`.

  Returns:
      A `keras.Model` instance.
@@ -582,8 +582,9 @@ def ResNetRS(
      use on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
    include_preprocessing: Boolean, whether to include the preprocessing
-      layer (`Rescaling`) at the bottom of the network. Note- Input image
-      is normalized by ImageNet mean and standard deviation. Defaults to `True`
+      layer (`Rescaling`) at the bottom of the network. Note: Input image
+      is normalized by ImageNet mean and standard deviation.
+      Defaults to `True`.

  Returns:
diff --git a/keras/backend.py b/keras/backend.py
index 63e7bcd20bfe..fec66940c68f 100644
--- a/keras/backend.py
+++ b/keras/backend.py
@@ -6898,11 +6898,11 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
    Args:
        shape: A tuple of integers, the shape of tensor to create.
        mean: A float, the mean value of the normal distribution to draw
-            samples. Default to 0.0.
+            samples. Defaults to `0.0`.
        stddev: A float, the standard deviation of the normal distribution
-            to draw samples. Default to 1.0.
-        dtype: `tf.dtypes.DType`, dtype of returned tensor. Default to use Keras
-            backend dtype which is float32.
+            to draw samples. Defaults to `1.0`.
+        dtype: `tf.dtypes.DType`, dtype of returned tensor. `None` uses the
+            Keras backend dtype, which is `float32`. Defaults to `None`.
        seed: Integer, random seed. Will use a random numpy integer when not
          specified.
diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py
index 3d489d3d1c43..0dd660b5215c 100644
--- a/keras/datasets/imdb.py
+++ b/keras/datasets/imdb.py
@@ -68,7 +68,7 @@ def load_data(
          Defaults to None
        seed: int. Seed for reproducible data shuffling.
        start_char: int.
        The start of a sequence will be marked with this
-          character. 0 is usually the padding character. Defaults to 1
+          character. 0 is usually the padding character. Defaults to `1`.
        oov_char: int. The out-of-vocabulary character.
          Words that were cut out because of the `num_words` or
          `skip_top` limits will be replaced with this character.
diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py
index 82e1603f624f..ca7ca3a87d59 100644
--- a/keras/datasets/reuters.py
+++ b/keras/datasets/reuters.py
@@ -78,7 +78,7 @@ def load_data(
          test data. Defaults to 0.2
        seed: int. Seed for reproducible data shuffling.
        start_char: int. The start of a sequence will be marked with this
-          character. 0 is usually the padding character. Defaults to 1
+          character. 0 is usually the padding character. Defaults to `1`.
        oov_char: int. The out-of-vocabulary character. Words that were cut
          out because of the `num_words` or `skip_top` limits will
          be replaced with this character.
diff --git a/keras/engine/base_layer.py b/keras/engine/base_layer.py
index 02b1b1e15859..f03ff0605e99 100644
--- a/keras/engine/base_layer.py
+++ b/keras/engine/base_layer.py
@@ -458,7 +458,7 @@ def __init__(
        # Whether the layer will track any layers that is set as attribute on
        # itself as sub-layers, the weights from the sub-layers will be included
-        # in the parent layer's variables() as well. Default to True, which
+        # in the parent layer's variables() as well. Defaults to `True`, which
        # means auto tracking is turned on. Certain subclass might want to turn
        # it off, like Sequential model.
        self._auto_track_sub_layers = True
@@ -3830,9 +3830,9 @@ def __init__(
          force_generator: boolean, default to False, whether to force the
            RandomGenerator to use the code branch of tf.random.Generator.
          rng_type: string, the rng type that will be passed to backend
-            RandomGenerator. Default to `None`, which will allow RandomGenerator
+            RandomGenerator. `None` will allow the RandomGenerator
            to choose types by itself. Valid values are "stateful", "stateless",
-            "legacy_stateful".
+            "legacy_stateful". Defaults to `None`.
          **kwargs: other keyword arguments that will be passed to the parent
            *class
        """
diff --git a/keras/engine/base_layer_v1.py b/keras/engine/base_layer_v1.py
index 8baae6944549..abc72f3879fc 100644
--- a/keras/engine/base_layer_v1.py
+++ b/keras/engine/base_layer_v1.py
@@ -237,7 +237,7 @@ def __init__(
        # Whether the layer will track any layers that are set as attribute on
        # itself as sub-layers, the weights from the sub-layers will be included
-        # in the parent layer's variables() as well. Default to True, which
+        # in the parent layer's variables() as well. Defaults to `True`, which
        # means auto tracking is turned on. Certain subclass might want to turn
        # it off, like the Sequential model.
        self._auto_track_sub_layers = True
diff --git a/keras/engine/data_adapter.py b/keras/engine/data_adapter.py
index 3cc07242d9c2..9201bfe3be03 100644
--- a/keras/engine/data_adapter.py
+++ b/keras/engine/data_adapter.py
@@ -268,7 +268,7 @@ def __init__(
        _check_data_cardinality(inputs)

        # If batch_size is not passed but steps is, calculate from the input
-        # data. Default to 32 for backwards compat.
+        # data. Defaults to `32` for backwards compatibility.
        if not batch_size:
            batch_size = int(math.ceil(num_samples / steps)) if steps else 32
@@ -645,7 +645,7 @@ def __init__(
            dataset = dataset.shuffle(num_samples)

        # If batch_size is not passed but steps is, calculate from the input
-        # data.
Defaults to `32` for backwards compatibility. if not batch_size: batch_size = int(math.ceil(num_samples / steps)) if steps else 32 diff --git a/keras/engine/input_layer.py b/keras/engine/input_layer.py index 3310ef9d3635..41479ad89325 100644 --- a/keras/engine/input_layer.py +++ b/keras/engine/input_layer.py @@ -88,12 +88,12 @@ class InputLayer(base_layer.Layer): will use the `tf.TypeSpec` of this tensor rather than creating a new placeholder tensor. sparse: Boolean, whether the placeholder created is meant to be sparse. - Default to `False`. + Defaults to `False`. ragged: Boolean, whether the placeholder created is meant to be ragged. In this case, values of `None` in the `shape` argument represent ragged dimensions. For more information about `tf.RaggedTensor`, see [this guide](https://www.tensorflow.org/guide/ragged_tensor). - Default to `False`. + Defaults to `False`. type_spec: A `tf.TypeSpec` object to create Input from. This `tf.TypeSpec` represents the entire batch. When provided, all other args except name must be `None`. diff --git a/keras/engine/training.py b/keras/engine/training.py index 5da718f12b2c..562038de4f50 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -712,7 +712,7 @@ def compile( shards based on the number of workers. 0, meaning no visitation guarantee is provided. NOTE: Custom implementations of `Model.test_step` will be ignored when doing exact evaluation. - Defaults to 0 + Defaults to `0`. **kwargs: Arguments supported for backwards compatibility only. """ if jit_compile and not tf_utils.can_jit_compile(warn=True): @@ -2960,7 +2960,7 @@ def save(self, filepath, overwrite=True, save_format=None, **kwargs): SavedModel format arguments: include_optimizer: Only applied to SavedModel and legacy HDF5 formats. If False, do not save the optimizer state. - Defaults to True. + Defaults to `True`. signatures: Only applies to SavedModel format. Signatures to save with the SavedModel. See the `signatures` argument in `tf.saved_model.save` for details. @@ -3945,7 +3945,7 @@ def _get_compile_args(self, user_metrics=True): Args: user_metrics: Whether to return user-supplied metrics or `Metric` objects. If True, returns the user-supplied metrics. - Defaults to True. + Defaults to `True`. Returns: Dictionary of arguments that were used when compiling the model. @@ -4189,11 +4189,11 @@ def _get_verbosity(verbose, distribute_strategy): distribute_strategy._should_use_with_coordinator or not io_utils.is_interactive_logging_enabled() ): - # Default to epoch-level logging for PSStrategy or using absl + # Defaults to epoch-level logging for PSStrategy or using absl # logging. return 2 else: - return 1 # Default to batch-level logging otherwise. + return 1 # Defaults to batch-level logging otherwise. return verbose diff --git a/keras/layers/activation/leaky_relu.py b/keras/layers/activation/leaky_relu.py index 4e3217d5d5b7..bc82ed5edc45 100644 --- a/keras/layers/activation/leaky_relu.py +++ b/keras/layers/activation/leaky_relu.py @@ -54,7 +54,7 @@ class LeakyReLU(Layer): Same shape as the input. Args: - alpha: Float >= 0. Negative slope coefficient. Default to 0.3. + alpha: Float >= 0. Negative slope coefficient. Defaults to `0.3`. """ diff --git a/keras/layers/activation/relu.py b/keras/layers/activation/relu.py index a63e368cba5e..a9de5cce6b10 100644 --- a/keras/layers/activation/relu.py +++ b/keras/layers/activation/relu.py @@ -65,9 +65,9 @@ class ReLU(Layer): Same shape as the input. Args: - max_value: Float >= 0. Maximum activation value. 
Default to None, which - means unlimited. - negative_slope: Float >= 0. Negative slope coefficient. Default to 0. + max_value: Float >= 0. Maximum activation value. None + means unlimited. Defaults to `None`. + negative_slope: Float >= 0. Negative slope coefficient. Defaults to `0.`. threshold: Float >= 0. Threshold value for thresholded activation. Default to 0. """ diff --git a/keras/layers/activation/softmax.py b/keras/layers/activation/softmax.py index ee3d19e0662c..cc9e86e544a7 100644 --- a/keras/layers/activation/softmax.py +++ b/keras/layers/activation/softmax.py @@ -72,8 +72,8 @@ class Softmax(Layer): normalization is applied. Call arguments: inputs: The inputs, or logits to the softmax layer. - mask: A boolean mask of the same shape as `inputs`. The mask specifies 1 to keep and 0 to mask. - Defaults to `None`. + mask: A boolean mask of the same shape as `inputs`. The mask + specifies 1 to keep and 0 to mask. Defaults to `None`. Returns: diff --git a/keras/layers/attention/additive_attention.py b/keras/layers/attention/additive_attention.py index 4406d6c28ba9..15423688277e 100644 --- a/keras/layers/attention/additive_attention.py +++ b/keras/layers/attention/additive_attention.py @@ -49,7 +49,7 @@ class AdditiveAttention(BaseDenseAttention): use_scale: If `True`, will create a variable to scale the attention scores. dropout: Float between 0 and 1. Fraction of the units to drop for the - attention scores. Defaults to 0.0. + attention scores. Defaults to `0.0`. Call Args: @@ -73,7 +73,7 @@ class AdditiveAttention(BaseDenseAttention): use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds a mask such that position `i` cannot attend to positions `j > i`. This prevents the flow of information from the future towards the past. - Defaults to `False`.` + Defaults to `False`. Output: diff --git a/keras/layers/attention/attention.py b/keras/layers/attention/attention.py index d84eac9cb419..650090a8b521 100644 --- a/keras/layers/attention/attention.py +++ b/keras/layers/attention/attention.py @@ -47,7 +47,7 @@ class Attention(BaseDenseAttention): use_scale: If `True`, will create a scalar variable to scale the attention scores. dropout: Float between 0 and 1. Fraction of the units to drop for the - attention scores. Defaults to 0.0. + attention scores. Defaults to `0.0`. score_mode: Function to use to compute attention scores, one of `{"dot", "concat"}`. `"dot"` refers to the dot product between the query and key vectors. `"concat"` refers to the hyperbolic tangent of the diff --git a/keras/layers/normalization/group_normalization.py b/keras/layers/normalization/group_normalization.py index 8b71fbb4dd35..010d07aae96c 100644 --- a/keras/layers/normalization/group_normalization.py +++ b/keras/layers/normalization/group_normalization.py @@ -58,10 +58,11 @@ class GroupNormalization(Layer): epsilon: Small float added to variance to avoid dividing by zero. Defaults to 1e-3 center: If True, add offset of `beta` to normalized tensor. If False, - `beta` is ignored. Defaults to True. + `beta` is ignored. Defaults to `True`. scale: If True, multiply by `gamma`. If False, `gamma` is not used. - Defaults to True. When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling will be done by the next layer. + When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling will be done by the next layer. + Defaults to `True`. beta_initializer: Initializer for the beta weight. Defaults to zeros. 
gamma_initializer: Initializer for the gamma weight. Defaults to ones. beta_regularizer: Optional regularizer for the beta weight. None by diff --git a/keras/layers/normalization/layer_normalization.py b/keras/layers/normalization/layer_normalization.py index 9b080dc7eb43..4b550b7c3d87 100644 --- a/keras/layers/normalization/layer_normalization.py +++ b/keras/layers/normalization/layer_normalization.py @@ -126,11 +126,11 @@ class LayerNormalization(Layer): epsilon: Small float added to variance to avoid dividing by zero. Defaults to 1e-3 center: If True, add offset of `beta` to normalized tensor. If False, - `beta` is ignored. Defaults to True. + `beta` is ignored. Defaults to `True`. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling will be done by the next layer. - Defaults to True. + Defaults to `True`. beta_initializer: Initializer for the beta weight. Defaults to zeros. gamma_initializer: Initializer for the gamma weight. Defaults to ones. beta_regularizer: Optional regularizer for the beta weight. None by diff --git a/keras/layers/preprocessing/discretization.py b/keras/layers/preprocessing/discretization.py index eec86b12c3fc..72ae53c4e0ac 100644 --- a/keras/layers/preprocessing/discretization.py +++ b/keras/layers/preprocessing/discretization.py @@ -183,7 +183,7 @@ class Discretization(base_preprocessing_layer.PreprocessingLayer): Defaults to `"int"`. sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and `"count"` output modes. If True, returns a `SparseTensor` instead of - a dense `Tensor`. Defaults to False. + a dense `Tensor`. Defaults to `False`. Examples: diff --git a/keras/layers/preprocessing/hashed_crossing.py b/keras/layers/preprocessing/hashed_crossing.py index 06f7bc2190f2..86e0f58a5b53 100644 --- a/keras/layers/preprocessing/hashed_crossing.py +++ b/keras/layers/preprocessing/hashed_crossing.py @@ -51,16 +51,15 @@ class HashedCrossing(base_layer.Layer): Args: num_bins: Number of hash bins. - output_mode: Specification for the output of the layer. Values can be `"int"`, - or `"one_hot"` configuring the layer as - follows: + output_mode: Specification for the output of the layer. Values can be + `"int"`, or `"one_hot"` configuring the layer as follows: - `"int"`: Return the integer bin indices directly. - `"one_hot"`: Encodes each individual element in the input into an array the same size as `num_bins`, containing a 1 at the input's bin index. Defaults to `"int"`. sparse: Boolean. Only applicable to `"one_hot"` mode. If True, returns a - `SparseTensor` instead of a dense `Tensor`. Defaults to False. + `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. **kwargs: Keyword arguments to construct a layer. Examples: diff --git a/keras/layers/preprocessing/hashing.py b/keras/layers/preprocessing/hashing.py index 54815ec181b4..e64c0f34297b 100644 --- a/keras/layers/preprocessing/hashing.py +++ b/keras/layers/preprocessing/hashing.py @@ -110,7 +110,7 @@ class Hashing(base_layer.Layer): is set. mask_value: A value that represents masked inputs, which are mapped to index 0. None means no mask term will be added and the - hashing will start at index 0. Defaults to None. + hashing will start at index 0. Defaults to `None`. salt: A single unsigned integer or None. If passed, the hash function used will be SipHash64, with these values used as an additional input (known as a "salt" in cryptography). 
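For reference, the `mask_value` and `salt` semantics described above; a minimal sketch using the public `tf.keras.layers.Hashing` API (the bin indices in the comments follow the documented behaviour):

    import tensorflow as tf

    # With mask_value set, masked inputs map to bin 0 and hashing uses the
    # remaining num_bins - 1 bins for everything else.
    layer = tf.keras.layers.Hashing(num_bins=3, mask_value="")
    print(layer([["A"], [""], ["C"]]))  # "" -> 0, other tokens -> 1 or 2

    # Passing a salt switches the hash from FarmHash64 to SipHash64, so bin
    # assignments stay deterministic but differ from the unsalted ones.
    salted = tf.keras.layers.Hashing(num_bins=3, salt=133)
    print(salted([["A"], ["B"], ["C"]]))
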
@@ -136,7 +136,7 @@ class Hashing(base_layer.Layer): Defaults to `"int"`. sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and `"count"` output modes. If True, returns a `SparseTensor` instead of - a dense `Tensor`. Defaults to False. + a dense `Tensor`. Defaults to `False`. **kwargs: Keyword arguments to construct a layer. Input shape: diff --git a/keras/layers/preprocessing/image_preprocessing_test.py b/keras/layers/preprocessing/image_preprocessing_test.py index 8c07ab131f53..8385e6cdace2 100644 --- a/keras/layers/preprocessing/image_preprocessing_test.py +++ b/keras/layers/preprocessing/image_preprocessing_test.py @@ -2233,7 +2233,7 @@ def test_plain_call(self): layer = image_preprocessing.RandomWidth(0.5, seed=123) shape = (12, 12, 3) img = np.random.random((12,) + shape) - out = layer(img) # Default to training=True + out = layer(img) # Defaults to training=True self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) out = layer(img, training=True) @@ -2249,7 +2249,7 @@ def test_call_in_container(self): shape = (12, 12, 3) img = np.random.random((12,) + shape) - out = seq(img) # Default to training=True + out = seq(img) # Defaults to training=True self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) out = seq(img, training=True) diff --git a/keras/layers/preprocessing/index_lookup.py b/keras/layers/preprocessing/index_lookup.py index c57740b087d7..4747b7ac206e 100644 --- a/keras/layers/preprocessing/index_lookup.py +++ b/keras/layers/preprocessing/index_lookup.py @@ -134,10 +134,10 @@ class IndexLookup(base_preprocessing_layer.PreprocessingLayer): `"tf_idf"`, this argument must be supplied. invert: Only valid when `output_mode` is `"int"`. If True, this layer will map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. + indices. Defaults to `False`. output_mode: Specification for the output of the layer. Values can be - `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` configuring - the layer as follows: + `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` + configuring the layer as follows: - `"int"`: Return the raw integer indices of the input tokens. - `"one_hot"`: Encodes each individual element in the input into an array the same size as the vocabulary, containing a 1 at the element @@ -162,7 +162,7 @@ class IndexLookup(base_preprocessing_layer.PreprocessingLayer): False. sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, `"count"` and `"tf-idf"` output modes. If True, returns a `SparseTensor` instead - of a dense `Tensor`. Defaults to False. + of a dense `Tensor`. Defaults to `False`. """ def __init__( diff --git a/keras/layers/preprocessing/integer_lookup.py b/keras/layers/preprocessing/integer_lookup.py index 832ea1338542..62b660a48846 100644 --- a/keras/layers/preprocessing/integer_lookup.py +++ b/keras/layers/preprocessing/integer_lookup.py @@ -71,7 +71,7 @@ class IntegerLookup(index_lookup.IndexLookup): only be specified when adapting the vocabulary or when setting `pad_to_max_tokens=True`. If None, there is no cap on the size of the vocabulary. Note that this size includes the OOV and mask tokens. - Defaults to None. + Defaults to `None`. num_oov_indices: The number of out-of-vocabulary tokens to use. If this value is more than 1, OOV inputs are modulated to determine their OOV value. 
If this value is 0, OOV inputs will cause an error when calling @@ -80,7 +80,7 @@ class IntegerLookup(index_lookup.IndexLookup): `output_mode` is `"int"`, the token is included in vocabulary and mapped to index 0. In other output modes, the token will not appear in the vocabulary and instances of the mask token in the input will be dropped. - If set to None, no mask term will be added. Defaults to None. + If set to None, no mask term will be added. Defaults to `None`. oov_token: Only used when `invert` is True. The token to return for OOV indices. Defaults to `-1`. vocabulary: Optional. Either an array of integers or a string path to a @@ -98,7 +98,7 @@ class IntegerLookup(index_lookup.IndexLookup): `"tf_idf"`, this argument must be supplied. invert: Only valid when `output_mode` is `"int"`. If True, this layer will map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. + indices. Defaults to `False`. output_mode: Specification for the output of the layer. Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` configuring the layer as follows: @@ -128,7 +128,7 @@ class IntegerLookup(index_lookup.IndexLookup): False. sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a - dense `Tensor`. Defaults to False. + dense `Tensor`. Defaults to `False`. Examples: diff --git a/keras/layers/preprocessing/string_lookup.py b/keras/layers/preprocessing/string_lookup.py index d345fe89ff04..0b514c2d5cc6 100644 --- a/keras/layers/preprocessing/string_lookup.py +++ b/keras/layers/preprocessing/string_lookup.py @@ -68,7 +68,7 @@ class StringLookup(index_lookup.IndexLookup): only be specified when adapting the vocabulary or when setting `pad_to_max_tokens=True`. If None, there is no cap on the size of the vocabulary. Note that this size includes the OOV and mask tokens. - Defaults to None. + Defaults to `None`. num_oov_indices: The number of out-of-vocabulary tokens to use. If this value is more than 1, OOV inputs are hashed to determine their OOV value. If this value is 0, OOV inputs will cause an error when calling @@ -93,9 +93,10 @@ class StringLookup(index_lookup.IndexLookup): `"tf_idf"`, this argument must be supplied. invert: Only valid when `output_mode` is `"int"`. If True, this layer will map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. + indices. Defaults to `False`. output_mode: Specification for the output of the layer. Values can be - `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` configuring the layer as follows: + `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` + configuring the layer as follows: - `"int"`: Return the raw integer indices of the input tokens. - `"one_hot"`: Encodes each individual element in the input into an array the same size as the vocabulary, containing a 1 at the element @@ -122,7 +123,7 @@ class StringLookup(index_lookup.IndexLookup): False. sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a - dense `Tensor`. Defaults to False. + dense `Tensor`. Defaults to `False`. encoding: Optional. The text encoding to use to interpret the input strings. Defaults to `"utf-8"`. 
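The forward and inverted lookups described above, as a minimal sketch using the documented defaults (`num_oov_indices=1`, so index 0 is the OOV bucket and the OOV token is `"[UNK]"`):

    import tensorflow as tf

    vocab = ["a", "b", "c", "d"]
    data = [["a", "c", "d"], ["d", "z", "b"]]
    layer = tf.keras.layers.StringLookup(vocabulary=vocab)
    print(layer(data))  # [[1, 3, 4], [4, 0, 2]]: OOV "z" maps to index 0

    # invert=True maps indices back to tokens; index 0 becomes "[UNK]".
    inverse = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)
    print(inverse(layer(data)))
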
diff --git a/keras/layers/preprocessing/text_vectorization.py b/keras/layers/preprocessing/text_vectorization.py
index a50beb2789c3..b0d57265cb15 100644
--- a/keras/layers/preprocessing/text_vectorization.py
+++ b/keras/layers/preprocessing/text_vectorization.py
@@ -157,7 +157,7 @@ class TextVectorization(base_preprocessing_layer.PreprocessingLayer):
      modes. If True, the output will have its feature axis padded to
      `max_tokens` even if the number of unique tokens in the vocabulary is
      less than max_tokens, resulting in a tensor of shape `(batch_size,
-      max_tokens)` regardless of vocabulary size. Defaults to False.
+      max_tokens)` regardless of vocabulary size. Defaults to `False`.
    vocabulary: Optional. Either an array of strings or a string path to a
      text file. If passing an array, can pass a tuple, list, 1D numpy array,
      or 1D tensor containing the string vocabulary terms. If passing a file
@@ -171,10 +171,10 @@ class TextVectorization(base_preprocessing_layer.PreprocessingLayer):
      `"tf_idf"`, this argument must be supplied.
    ragged: Boolean. Only applicable to `"int"` output mode. If True, returns
      a `RaggedTensor` instead of a dense `Tensor`, where each sequence may
-      have a different length after string splitting. Defaults to False.
+      have a different length after string splitting. Defaults to `False`.
    sparse: Boolean. Only applicable to `"multi_hot"`, `"count"`, and
      `"tf_idf"` output modes. If True, returns a `SparseTensor` instead of a
-      dense `Tensor`. Defaults to False.
+      dense `Tensor`. Defaults to `False`.
    encoding: Optional. The text encoding to use to interpret the input
      strings. Defaults to `"utf-8"`.
diff --git a/keras/legacy_tf_layers/variable_scope_shim.py b/keras/legacy_tf_layers/variable_scope_shim.py
index dd4211e43f22..63c7fd6e81b7 100644
--- a/keras/legacy_tf_layers/variable_scope_shim.py
+++ b/keras/legacy_tf_layers/variable_scope_shim.py
@@ -247,7 +247,7 @@ def get_variable(
        instead an experimental ResourceVariable which has well-defined
        semantics. The default will later change to True. When eager
        execution is enabled this argument is always forced to True.
-        Defaults to False.
+        Defaults to `False`.
      custom_getter: Callable that takes as a first argument the true
        getter, and allows overwriting the internal get_variable method.
        The signature of `custom_getter` should match that of this method, but
diff --git a/keras/losses.py b/keras/losses.py
index 16f5307fa799..5a1df59b13fe 100644
--- a/keras/losses.py
+++ b/keras/losses.py
@@ -2137,7 +2137,8 @@ def _ragged_tensor_categorical_crossentropy(
        label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For
            example, if `0.1`, use `0.1 / num_classes` for non-target labels
            and `0.9 + 0.1 / num_classes` for target labels.
-        axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`.
+        axis: The axis along which to compute crossentropy (the features axis).
+            Defaults to `-1`.

    Returns:
        Categorical crossentropy loss value.
@@ -2202,7 +2203,8 @@ def categorical_focal_crossentropy(
        label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For
            example, if `0.1`, use `0.1 / num_classes` for non-target labels
            and `0.9 + 0.1 / num_classes` for target labels.
-        axis: The dimension along which the entropy is computed. Defaults to `-1`.
+        axis: The dimension along which the entropy is computed.
+            Defaults to `-1`.

    Returns:
        Categorical focal crossentropy loss value.
@@ -2281,7 +2283,8 @@ def _ragged_tensor_categorical_focal_crossentropy(
        label_smoothing: Float in [0, 1].
If > `0` then smooth the labels. For example, if `0.1`, use `0.1 / num_classes` for non-target labels and `0.9 + 0.1 / num_classes` for target labels. - axis: The dimension along which the entropy is computed. Defaults to `-1`. + axis: The dimension along which the entropy is computed. + Defaults to `-1`. Returns: Categorical focal crossentropy loss value. diff --git a/keras/mixed_precision/loss_scale_optimizer.py b/keras/mixed_precision/loss_scale_optimizer.py index 52d7f968a5ab..2f0bc20fbcda 100644 --- a/keras/mixed_precision/loss_scale_optimizer.py +++ b/keras/mixed_precision/loss_scale_optimizer.py @@ -409,11 +409,11 @@ class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass): dynamic: Bool indicating whether dynamic loss scaling is used. If True, the loss scale will be dynamically updated over time using an algorithm that keeps the loss scale at approximately its optimal value. If False, - a single fixed loss scale is used and `initial_scale` must be specified, - which is used as the loss scale. + a single fixed loss scale is used and `initial_scale` must be + specified, which is used as the loss scale. Recommended to keep as True, as choosing a fixed loss scale can be tricky. Currently, there is a small performance overhead to dynamic loss - scaling compared to fixed loss scaling. Defaults to True. + scaling compared to fixed loss scaling. Defaults to `True`. initial_scale: The initial loss scale. If `dynamic` is True, this defaults to `2 ** 15`. If `dynamic` is False, this must be specified and acts as the sole loss scale, as the loss scale does not change over time. When diff --git a/keras/optimizers/adadelta.py b/keras/optimizers/adadelta.py index 20f723f1881c..51954f6b9244 100644 --- a/keras/optimizers/adadelta.py +++ b/keras/optimizers/adadelta.py @@ -49,13 +49,14 @@ class Adadelta(optimizer.Optimizer): Args: learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` - instance. Defaults to 0.001. Note that `Adadelta` tends to benefit from + instance. Note that `Adadelta` tends to benefit from higher initial learning rate values compared to other optimizers. To match the exact form in the original paper, use 1.0. + Defaults to `0.001`. rho: A `Tensor` or a floating point value. The decay rate. Defaults to - 0.95. + `0.95`. epsilon: Small floating point value used to maintain numerical stability. - Defaults to 1e-7. + Defaults to `1e-7`. {{base_optimizer_keyword_args}} Reference: diff --git a/keras/optimizers/adafactor.py b/keras/optimizers/adafactor.py index 07e48ad31660..0c38183aaa06 100644 --- a/keras/optimizers/adafactor.py +++ b/keras/optimizers/adafactor.py @@ -45,19 +45,19 @@ class Adafactor(optimizer.Optimizer): learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`. - epsilon_1: float, defaults to 1e-30. A small offset to keep demoninator - away from 0. - epsilon_2: float, defaults to 1e-3. A small offset to avoid learning - rate becoming too small by time. - clip_threshold: float, defaults to 1.0. Clipping threshold. This is a part - of Adafactor algorithm, independent from `clipnorm`, `clipvalue` and - `global_clipnorm`. - relative_step: bool, defaults to True. If `learning_rate` is a + Defaults to `0.001`. + beta_2_decay: float. The decay rate of `beta_2`. Defaults to `-0.8`. 
+ epsilon_1: float. A small offset to keep the denominator + away from 0. Defaults to `1e-30`. + epsilon_2: float. A small offset to avoid learning + rate becoming too small by time. Defaults to `1e-3`. + clip_threshold: float. Clipping threshold. This is a part + of the Adafactor algorithm, independent of `clipnorm`, `clipvalue` and + `global_clipnorm`. Defaults to `1.0`. + relative_step: bool. If `learning_rate` is a constant and `relative_step=True`, learning rate will be adjusted based on current iterations. This is a default learning rate decay - in Adafactor. + in Adafactor. Defaults to `True`. {{base_optimizer_keyword_args}} Reference: diff --git a/keras/optimizers/adagrad.py b/keras/optimizers/adagrad.py index 0d288e834d9a..eb332f883cc3 100644 --- a/keras/optimizers/adagrad.py +++ b/keras/optimizers/adagrad.py @@ -43,10 +43,10 @@ class Adagrad(optimizer.Optimizer): learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. Note that `Adagrad` tends to benefit from higher initial learning rate values compared to other optimizers. To match the exact form in the original paper, use 1.0. + Defaults to `0.001`. initial_accumulator_value: Floating point value. Starting value for the accumulators (per-parameter momentum values). Must be non-negative. diff --git a/keras/optimizers/adam.py b/keras/optimizers/adam.py index 04585b5ee5fb..8fb236e71408 100644 --- a/keras/optimizers/adam.py +++ b/keras/optimizers/adam.py @@ -47,17 +47,18 @@ class Adam(optimizer.Optimizer): learning_rate: A `tf.Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. + exponential decay rate for the 1st moment estimates. Defaults to `0.9`. beta_2: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and beyond". Defaults to `False`. {{base_optimizer_keyword_args}} diff --git a/keras/optimizers/adamax.py b/keras/optimizers/adamax.py index 63aa208884fe..dd694dc866ac 100644 --- a/keras/optimizers/adamax.py +++ b/keras/optimizers/adamax.py @@ -60,7 +60,7 @@ class Adamax(optimizer.Optimizer): learning_rate: A `tf.Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. beta_1: A float value or a constant float tensor. The exponential decay rate for the 1st moment estimates. beta_2: A float value or a constant float tensor.
The exponential decay diff --git a/keras/optimizers/adamw.py b/keras/optimizers/adamw.py index cf7b4a05b9ce..c2258c5e2a85 100644 --- a/keras/optimizers/adamw.py +++ b/keras/optimizers/adamw.py @@ -51,19 +51,20 @@ class AdamW(optimizer.Optimizer): learning_rate: A `tf.Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. weight_decay: A `tf.Tensor`, floating point value. The weight decay. - Defaults to 0.004. + Defaults to `0.004`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. + exponential decay rate for the 1st moment estimates. Defaults to `0.9`. beta_2: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and beyond". Defaults to `False`. {{base_optimizer_keyword_args}} diff --git a/keras/optimizers/ftrl.py b/keras/optimizers/ftrl.py index 0499294610aa..8acc416e246e 100644 --- a/keras/optimizers/ftrl.py +++ b/keras/optimizers/ftrl.py @@ -77,16 +77,16 @@ class Ftrl(optimizer.Optimizer): learning_rate: A `Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The learning - rate. Defaults to 0.001. + rate. Defaults to `0.001`. learning_rate_power: A float value, must be less or equal to zero. Controls how the learning rate decreases during training. Use zero for a fixed learning rate. initial_accumulator_value: The starting value for accumulators. Only zero or positive values are allowed. l1_regularization_strength: A float value, must be greater than or equal - to zero. Defaults to 0.0. + to zero. Defaults to `0.0`. l2_regularization_strength: A float value, must be greater than or equal - to zero. Defaults to 0.0. + to zero. Defaults to `0.0`. l2_shrinkage_regularization_strength: A float value, must be greater than or equal to zero. This differs from L2 above in that the L2 above is a stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. diff --git a/keras/optimizers/legacy/adadelta.py b/keras/optimizers/legacy/adadelta.py index 4b8b1680e2f1..9310a9bfcfd5 100644 --- a/keras/optimizers/legacy/adadelta.py +++ b/keras/optimizers/legacy/adadelta.py @@ -48,10 +48,10 @@ class Adadelta(optimizer_v2.OptimizerV2): learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. Note that `Adadelta` tends to benefit from higher initial learning rate values compared to other optimizers. To match the exact form in the original paper, use 1.0. + Defaults to `0.001`. rho: A `Tensor` or a floating point value. The decay rate. 
epsilon: Small floating point value used to maintain numerical stability. name: Optional name prefix for the operations created when applying diff --git a/keras/optimizers/legacy/adagrad.py b/keras/optimizers/legacy/adagrad.py index c29280c8690a..4b130051416d 100644 --- a/keras/optimizers/legacy/adagrad.py +++ b/keras/optimizers/legacy/adagrad.py @@ -40,10 +40,10 @@ class Adagrad(optimizer_v2.OptimizerV2): learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. Note that `Adagrad` tends to benefit from higher initial learning rate values compared to other optimizers. To match the exact form in the original paper, use 1.0. + Defaults to `0.001`. initial_accumulator_value: Floating point value. Starting value for the accumulators (per-parameter momentum values). Must be non-negative. diff --git a/keras/optimizers/legacy/adam.py b/keras/optimizers/legacy/adam.py index a416d22f10bb..3678f316de85 100644 --- a/keras/optimizers/legacy/adam.py +++ b/keras/optimizers/legacy/adam.py @@ -44,17 +44,18 @@ class Adam(optimizer_v2.OptimizerV2): learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use, The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. + exponential decay rate for the 1st moment estimates. Defaults to `0.9`. beta_2: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use, The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and beyond". Defaults to `False`. name: Optional name for the operations created when applying gradients. @@ -364,19 +365,19 @@ def __init__( learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to - use, The learning rate. Defaults to 0.001. + use, The learning rate. Defaults to `0.001`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The exponential decay rate for the 1st moment estimates. Defaults to - 0.9. + `0.9`. beta_2: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use, The exponential decay rate for the 2nd moment estimates. Defaults to - 0.999. + `0.999`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults - to 1e-7. + to `1e-7`. amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and beyond". 
Defaults to `False`. diff --git a/keras/optimizers/legacy/ftrl.py b/keras/optimizers/legacy/ftrl.py index d41536ecaf18..0e592b268743 100644 --- a/keras/optimizers/legacy/ftrl.py +++ b/keras/optimizers/legacy/ftrl.py @@ -81,9 +81,9 @@ class Ftrl(optimizer_v2.OptimizerV2): initial_accumulator_value: The starting value for accumulators. Only zero or positive values are allowed. l1_regularization_strength: A float value, must be greater than or - equal to zero. Defaults to 0.0. + equal to zero. Defaults to `0.0`. l2_regularization_strength: A float value, must be greater than or - equal to zero. Defaults to 0.0. + equal to zero. Defaults to `0.0`. name: Optional name prefix for the operations created when applying gradients. Defaults to `"Ftrl"`. l2_shrinkage_regularization_strength: A float value, must be greater than @@ -91,7 +91,7 @@ class Ftrl(optimizer_v2.OptimizerV2): stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. When input is sparse shrinkage will only happen on the active weights. beta: A float value, representing the beta value from the paper. - Defaults to 0.0. + Defaults to `0.0`. **kwargs: keyword arguments. Allowed arguments are `clipvalue`, `clipnorm`, `global_clipnorm`. If `clipvalue` (float) is set, the gradient of each weight diff --git a/keras/optimizers/legacy/gradient_descent.py b/keras/optimizers/legacy/gradient_descent.py index 0bcb10fdfec8..8d305f705e6e 100644 --- a/keras/optimizers/legacy/gradient_descent.py +++ b/keras/optimizers/legacy/gradient_descent.py @@ -54,10 +54,10 @@ class SGD(optimizer_v2.OptimizerV2): learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.01. + learning rate. Defaults to `0.01`. momentum: float hyperparameter >= 0 that accelerates gradient descent in - the relevant direction and dampens oscillations. Defaults to 0, i.e., - vanilla gradient descent. + the relevant direction and dampens oscillations. A momentum of 0 + means vanilla gradient descent. Defaults to `0.`. nesterov: boolean. Whether to apply Nesterov momentum. Defaults to `False`. name: Optional name prefix for the operations created when applying diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py index 7deacfad20e4..ca56b07cfaa7 100644 --- a/keras/optimizers/legacy/optimizer_v2.py +++ b/keras/optimizers/legacy/optimizer_v2.py @@ -692,8 +692,8 @@ def apply_gradients( Args: grads_and_vars: List of (gradient, variable) pairs. - name: Optional name for the returned operation. Default to the name - passed to the `Optimizer` constructor. + name: Optional name for the returned operation. When None, uses the + name passed to the `Optimizer` constructor. Defaults to `None`. experimental_aggregate_gradients: Whether to sum gradients from different replicas in the presence of `tf.distribute.Strategy`. If False, it's user responsibility to aggregate the gradients. Default diff --git a/keras/optimizers/legacy/rmsprop.py b/keras/optimizers/legacy/rmsprop.py index 626c333398da..5537de9cc8ab 100644 --- a/keras/optimizers/legacy/rmsprop.py +++ b/keras/optimizers/legacy/rmsprop.py @@ -45,13 +45,14 @@ class RMSprop(optimizer_v2.OptimizerV2): learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use.
The - learning rate. Defaults to 0.001. - rho: Discounting factor for the history/coming gradient. Defaults to 0.9. - momentum: A scalar or a scalar `Tensor`. Defaults to 0.0. + learning rate. Defaults to `0.001`. + rho: Discounting factor for the history/coming gradient. Defaults to + `0.9`. + momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. centered: Boolean. If `True`, gradients are normalized by the estimated variance of the gradient; if False, by the uncentered second moment. Setting this to `True` may help with training, but is slightly more @@ -111,10 +112,10 @@ def __init__( learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to - use. The learning rate. Defaults to 0.001. + use. The learning rate. Defaults to `0.001`. rho: Discounting factor for the history/coming gradient. Defaults to - 0.9. - momentum: A scalar or a scalar `Tensor`. Defaults to 0.0. + `0.9`. + momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py index a75a43e03724..93bd9dabd1ac 100644 --- a/keras/optimizers/legacy_learning_rate_decay.py +++ b/keras/optimizers/legacy_learning_rate_decay.py @@ -79,7 +79,7 @@ def exponential_decay( The decay rate. staircase: Boolean. If `True` decay the learning rate at discrete intervals - name: String. Optional name of the operation. Defaults to + name: String. Optional name of the operation. Defaults to 'ExponentialDecay'. Returns: @@ -264,9 +264,10 @@ def polynomial_decay( end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. The minimal end learning rate. power: A scalar `float32` or `float64` `Tensor` or a Python number. The - power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. - name: String. Optional name of the operation. Defaults to + power of the polynomial. Linear is default. Defaults to `1.0`. + cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to + `False`. + name: String. Optional name of the operation. Defaults to 'PolynomialDecay'. Returns: diff --git a/keras/optimizers/lion.py b/keras/optimizers/lion.py index 4a0eff2492fc..ace52dc30099 100644 --- a/keras/optimizers/lion.py +++ b/keras/optimizers/lion.py @@ -43,7 +43,7 @@ class Lion(optimizer.Optimizer): learning_rate: A `tf.Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.0001. + learning rate. Defaults to `0.0001`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The rate to combine the current gradient and the 1st moment estimate. 
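As a usage sketch for the Lion defaults above (assuming a Keras release that ships `Lion`, e.g. Keras >= 2.12; the variable and loss are illustrative):

    import tensorflow as tf

    # Spelling out the documented defaults; equivalent to plain Lion().
    opt = tf.keras.optimizers.Lion(learning_rate=0.0001, beta_1=0.9)
    var = tf.Variable(2.0)
    opt.minimize(lambda: var ** 2, var_list=[var])  # one update step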
diff --git a/keras/optimizers/nadam.py b/keras/optimizers/nadam.py index e8084c343dde..955dc2be30fa 100644 --- a/keras/optimizers/nadam.py +++ b/keras/optimizers/nadam.py @@ -37,17 +37,18 @@ class Nadam(optimizer.Optimizer): learning_rate: A `tf.Tensor`, floating point value, a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. beta_1: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. + exponential decay rate for the 1st moment estimates. Defaults to `0.9`. beta_2: A float value or a constant float tensor, or a callable that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. {{base_optimizer_keyword_args}} Reference: diff --git a/keras/optimizers/optimizer.py b/keras/optimizers/optimizer.py index 717c78dea1ee..f47abe0c34be 100644 --- a/keras/optimizers/optimizer.py +++ b/keras/optimizers/optimizer.py @@ -448,8 +448,8 @@ def add_variable(self, shape, dtype=None, initializer="zeros", name=None): Args: shape: A list of integers, a tuple of integers, or a 1-D Tensor of type int32. Defaults to scalar if unspecified. - dtype: The DType of the optimizer variable to be created. Defaults to - `tf.keras.backend.floatx` if unspecified. + dtype: The DType of the optimizer variable to be created. None + means `tf.keras.backend.floatx`. Defaults to `None`. initializer: string or callable. Initializer instance. name: The name of the optimizer variable to be created. diff --git a/keras/optimizers/rmsprop.py b/keras/optimizers/rmsprop.py index 46332713bb77..b2e5f932ef5f 100644 --- a/keras/optimizers/rmsprop.py +++ b/keras/optimizers/rmsprop.py @@ -47,14 +47,14 @@ class RMSprop(optimizer.Optimizer): learning_rate: Initial value for the learning rate: either a floating point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - rho: float, defaults to 0.9. Discounting factor for the old gradients. + Defaults to `0.001`. + rho: float. Discounting factor for the old gradients. Defaults to `0.9`. momentum: float, defaults to 0.0. If not 0.0., the optimizer tracks the momentum value, with a decay rate equals to `1 - momentum`. epsilon: A small constant for numerical stability. This epsilon is "epsilon hat" in the Kingma and Ba paper (in the formula just before Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. + `1e-7`. centered: Boolean. If `True`, gradients are normalized by the estimated variance of the gradient; if False, by the uncentered second moment. 
Setting this to `True` may help with training, but is slightly more diff --git a/keras/optimizers/schedules/learning_rate_schedule.py b/keras/optimizers/schedules/learning_rate_schedule.py index ef773c9b1b9e..6146bf60ab38 100644 --- a/keras/optimizers/schedules/learning_rate_schedule.py +++ b/keras/optimizers/schedules/learning_rate_schedule.py @@ -405,8 +405,9 @@ def __init__( end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. The minimal end learning rate. power: A scalar `float32` or `float64` `Tensor` or a - Python number. The power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. + Python number. The power of the polynomial (1.0 is linear). + Defaults to `1.0`. + cycle: A boolean, whether it should cycle beyond decay_steps. name: String. Optional name of the operation. Defaults to 'PolynomialDecay'. """ diff --git a/keras/optimizers/sgd.py b/keras/optimizers/sgd.py index 39b79a0d99ac..e663bf0d8414 100644 --- a/keras/optimizers/sgd.py +++ b/keras/optimizers/sgd.py @@ -57,10 +57,10 @@ class SGD(optimizer.Optimizer): learning_rate: A `Tensor`, floating point value, or a schedule that is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. + learning rate. Defaults to `0.001`. momentum: float hyperparameter >= 0 that accelerates gradient descent in - the relevant direction and dampens oscillations. Defaults to 0, i.e., - vanilla gradient descent. + the relevant direction and dampens oscillations. A momentum of 0 + means vanilla gradient descent. Defaults to `0.`. nesterov: boolean. Whether to apply Nesterov momentum. Defaults to `False`. {{base_optimizer_keyword_args}} diff --git a/keras/preprocessing/image.py b/keras/preprocessing/image.py index e088fafb66e7..266f60e3e119 100644 --- a/keras/preprocessing/image.py +++ b/keras/preprocessing/image.py @@ -1225,9 +1225,9 @@ class ImageDataGenerator: `fill_mode = "constant"`. horizontal_flip: Boolean. Randomly flip inputs horizontally. vertical_flip: Boolean. Randomly flip inputs vertically. - rescale: rescaling factor. Defaults to None. If None or 0, no rescaling + rescale: rescaling factor. If None or 0, no rescaling is applied, otherwise we multiply the data by the value provided - (after applying all other transformations). + (after applying all other transformations). Defaults to `None`. preprocessing_function: function that will be applied on each input. The function will run after the image is resized and augmented. The function should take one argument: one image (Numpy tensor with diff --git a/keras/saving/legacy/save.py b/keras/saving/legacy/save.py index 4c6a3825308f..f6b1c0ece4c8 100644 --- a/keras/saving/legacy/save.py +++ b/keras/saving/legacy/save.py @@ -120,9 +120,9 @@ def save_model( save_traces: (only applies to SavedModel format) When enabled, the SavedModel will store the function traces for each layer. This can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. Disabling this will decrease serialization time - and reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. + Disabling this will decrease serialization time and file size, but + it requires that all custom layers/models implement a + `get_config()` method. Defaults to `True`. Raises: ImportError: If save format is hdf5, and h5py is not available.
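To make the `save_traces` trade-off above concrete, a small sketch (assuming TF 2.x; the output path is illustrative):

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    # Smaller, faster SavedModel export, but any custom layer/model must
    # implement get_config() so it can be reconstructed at load time.
    model.save("/tmp/model_no_traces", save_format="tf", save_traces=False)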
diff --git a/keras/saving/legacy/saved_model/save.py b/keras/saving/legacy/saved_model/save.py index 601f4c089ab4..7d99a15485b5 100644 --- a/keras/saving/legacy/saved_model/save.py +++ b/keras/saving/legacy/saved_model/save.py @@ -64,9 +64,9 @@ def save( save_traces: (only applies to SavedModel format) When enabled, the SavedModel will store the function traces for each layer. This can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. Disabling this will decrease serialization time - and reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. + Disabling this will decrease serialization time and file size, but + it requires that all custom layers/models implement a + `get_config()` method. Defaults to `True`. Raises: ValueError: if the model's inputs have not been defined. diff --git a/keras/saving/saving_api.py b/keras/saving/saving_api.py index e841716e09e9..4109d83b0bfb 100644 --- a/keras/saving/saving_api.py +++ b/keras/saving/saving_api.py @@ -52,7 +52,7 @@ def save_model(model, filepath, overwrite=True, save_format=None, **kwargs): SavedModel format arguments: include_optimizer: Only applied to SavedModel and legacy HDF5 formats. - If False, do not save the optimizer state. Defaults to True. + If False, do not save the optimizer state. Defaults to `True`. signatures: Only applies to SavedModel format. Signatures to save with the SavedModel. See the `signatures` argument in `tf.saved_model.save` for details. @@ -62,9 +62,9 @@ def save_model(model, filepath, overwrite=True, save_format=None, **kwargs): save_traces: Only applies to SavedModel format. When enabled, the SavedModel will store the function traces for each layer. This can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. Disabling this will decrease serialization time - and reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. + Disabling this will decrease serialization time and file size, but + it requires that all custom layers/models implement a + `get_config()` method. Defaults to `True`. Example: @@ -184,7 +184,7 @@ def load_model( safe_mode: Boolean, whether to disallow unsafe `lambda` deserialization. When `safe_mode=False`, loading an object has the potential to trigger arbitrary code execution. This argument is only - applicable to the Keras v3 model format. Defaults to True. + applicable to the Keras v3 model format. Defaults to `True`. SavedModel format arguments: options: Only applies to SavedModel format. diff --git a/keras/saving/serialization_lib.py b/keras/saving/serialization_lib.py index c9cbe0f6ccda..02e4965a6f5f 100644 --- a/keras/saving/serialization_lib.py +++ b/keras/saving/serialization_lib.py @@ -485,7 +485,7 @@ class ModifiedMeanSquaredError(keras.losses.MeanSquaredError): safe_mode: Boolean, whether to disallow unsafe `lambda` deserialization. When `safe_mode=False`, loading an object has the potential to trigger arbitrary code execution. This argument is only - applicable to the Keras v3 model format. Defaults to True. + applicable to the Keras v3 model format. Defaults to `True`. Returns: The object described by the `config` dictionary. diff --git a/keras/testing_infra/test_combinations.py b/keras/testing_infra/test_combinations.py index d10c558a02d0..2f29e1e3d5fa 100644 --- a/keras/testing_infra/test_combinations.py +++ b/keras/testing_infra/test_combinations.py @@ -112,7 +112,7 @@ def test_foo(self): test or class.
exclude_formats: A collection of Keras saved model formats to not run. (May also be a single format not wrapped in a collection). - Defaults to None. + Defaults to `None`. Returns: Returns a decorator that will run the decorated test method multiple @@ -258,7 +258,7 @@ def test_foo(self): test or class. exclude_models: A collection of Keras model types to not run. (May also be a single model type not wrapped in a collection). - Defaults to None. + Defaults to `None`. Returns: Returns a decorator that will run the decorated test method multiple @@ -497,12 +497,13 @@ def keras_mode_combinations(mode=None, run_eagerly=None): Args: mode: List of modes to run the tests. The valid options are 'graph' and - 'eager'. Default to ['graph', 'eager'] if not specified. If a empty list - is provide, then the test will run under the context based on tf's - version, eg graph for v1 and eager for v2. + 'eager'. If None, uses ['graph', 'eager']. If an empty + list is provided, then the test will run under the context based on + tensorflow's version, e.g., graph for v1 and eager for v2. Defaults to + `None`. run_eagerly: List of `run_eagerly` value to be run with the tests. - Default to [True, False] if not specified. Note that for `graph` mode, - run_eagerly value will only be False. + When None, uses [True, False]. Note that for `graph` mode, + run_eagerly value will only be False. Defaults to `None`. Returns: A list contains all the combinations to be used to generate test cases. diff --git a/keras/testing_infra/test_utils.py b/keras/testing_infra/test_utils.py index 0240f03c13a9..0c138c1aea80 100644 --- a/keras/testing_infra/test_utils.py +++ b/keras/testing_infra/test_utils.py @@ -880,10 +880,11 @@ def get_multi_io_model( shared_input_branch: An optional sequence of layers to apply to a single input, before applying both branches to that intermediate result. If set, the model will take only one input instead of two. Defaults to - None. + `None`. shared_output_branch: An optional sequence of layers to merge the intermediate results produced by branch a and branch b. If set, - the model will produce only one output instead of two. Defaults to None. + the model will produce only one output instead of two. + Defaults to `None`. Returns: A multi-io model of the type specified by `get_model_type`, specified diff --git a/keras/utils/audio_dataset.py b/keras/utils/audio_dataset.py index ec9f08478595..52afba42780d 100644 --- a/keras/utils/audio_dataset.py +++ b/keras/utils/audio_dataset.py @@ -103,7 +103,7 @@ def audio_dataset_from_directory( subset: Subset of the data to return. One of "training", "validation" or "both". Only used if `validation_split` is set. follow_links: Whether to visits subdirectories pointed to by symlinks. - Defaults to False. + Defaults to `False`. Returns: A `tf.data.Dataset` object. diff --git a/keras/utils/conv_utils.py b/keras/utils/conv_utils.py index e9946ccb2e24..930bbaf9fef9 100644 --- a/keras/utils/conv_utils.py +++ b/keras/utils/conv_utils.py @@ -63,8 +63,8 @@ def normalize_tuple(value, n, name, allow_zero=False): n: The size of the tuple to be returned. name: The name of the argument being validated, e.g. "strides" or "kernel_size". This is only used to format error messages. - allow_zero: Default to False. A ValueError will raised if zero is received - and this param is False. + allow_zero: A ValueError will be raised if zero is received + and this param is False. Defaults to `False`. Returns: A tuple of n integers. 
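The `allow_zero` behaviour above, sketched (note that `conv_utils` is an internal module, so the import path may differ between versions):

    from keras.utils import conv_utils

    print(conv_utils.normalize_tuple(3, 2, "strides"))  # (3, 3)
    # Zero is rejected with a ValueError unless allow_zero=True:
    print(conv_utils.normalize_tuple(0, 2, "strides", allow_zero=True))  # (0, 0)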
diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 0103cad42c37..444b25670ca8 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -130,10 +130,10 @@ def _convert_dataset_to_list( dataset_type_spec : the type of the dataset data_size_warning_flag (bool, optional): If set to True, a warning will be issued if the dataset takes longer than 10 seconds to iterate. - Defaults to True. + Defaults to `True`. ensure_shape_similarity (bool, optional): If set to True, the shape of the first sample will be used to validate the shape of rest of the - samples. Defaults to True. + samples. Defaults to `True`. Returns: List: A list of tuples/NumPy arrays. @@ -254,10 +254,10 @@ def _get_next_sample( dataset_iterator : An `iterator` object. ensure_shape_similarity (bool, optional): If set to True, the shape of the first sample will be used to validate the shape of rest of the - samples. Defaults to True. + samples. Defaults to `True`. data_size_warning_flag (bool, optional): If set to True, a warning will be issued if the dataset takes longer than 10 seconds to iterate. - Defaults to True. + Defaults to `True`. start_time (float): the start time of the dataset iteration. this is used only if `data_size_warning_flag` is set to true. diff --git a/keras/utils/feature_space.py b/keras/utils/feature_space.py index f3e0a0045434..e52e158dab05 100644 --- a/keras/utils/feature_space.py +++ b/keras/utils/feature_space.py @@ -105,12 +105,12 @@ class FeatureSpace(base_layer.Layer): "crossed" by hashing their combined value into a fixed-length vector. crossing_dim: Default vector size for hashing crossed features. - Defaults to 32. + Defaults to `32`. hashing_dim: Default vector size for hashing features of type - `"integer_hashed"` and `"string_hashed"`. Defaults to 32. + `"integer_hashed"` and `"string_hashed"`. Defaults to `32`. num_discretization_bins: Default number of bins to be used for discretizing features of type `"float_discretized"`. - Defaults to 32. + Defaults to `32`. **Available feature types:** diff --git a/keras/utils/image_dataset.py b/keras/utils/image_dataset.py index 449a8d4624d4..74d05b647a76 100644 --- a/keras/utils/image_dataset.py +++ b/keras/utils/image_dataset.py @@ -118,10 +118,10 @@ def image_dataset_from_directory( When `subset="both"`, the utility returns a tuple of two datasets (the training and validation datasets respectively). interpolation: String, the interpolation method used when resizing images. - Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`, - `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`. + Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, + `lanczos5`, `gaussian`, `mitchellcubic`. Defaults to `bilinear`. follow_links: Whether to visit subdirectories pointed to by symlinks. - Defaults to False. + Defaults to `False`. crop_to_aspect_ratio: If True, resize the images without aspect ratio distortion. When the original aspect ratio differs from the target aspect ratio, the output image will be cropped so as to return the diff --git a/keras/utils/image_utils.py b/keras/utils/image_utils.py index c5f13274a3e5..d3190f51aaf6 100644 --- a/keras/utils/image_utils.py +++ b/keras/utils/image_utils.py @@ -120,9 +120,9 @@ def smart_resize(x, size, interpolation="bilinear"): format `(height, width, channels)` or `(batch_size, height, width, channels)`. size: Tuple of `(height, width)` integer. Target size. - interpolation: String, interpolation to use for resizing. 
Defaults to - `'bilinear'`. Supports `bilinear`, `nearest`, `bicubic`, `area`, - `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`. + interpolation: String, interpolation to use for resizing. Supports + `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`, + `gaussian`, `mitchellcubic`. Defaults to `'bilinear'`. Returns: Array with shape `(size[0], size[1], channels)`. If the input image was a @@ -216,14 +216,14 @@ def array_to_img(x, data_format=None, scale=True, dtype=None): Args: x: Input data, in any form that can be converted to a Numpy array. data_format: Image data format, can be either `"channels_first"` or - `"channels_last"`. Defaults to `None`, in which case the global + `"channels_last"`. None means the global setting `tf.keras.backend.image_data_format()` is used (unless you - changed it, it defaults to `"channels_last"`). + changed it, it defaults to `"channels_last"`). Defaults to `None`. scale: Whether to rescale the image such that minimum and maximum values are 0 and 255 respectively. Defaults to `True`. - dtype: Dtype to use. Default to `None`, in which case the global setting - `tf.keras.backend.floatx()` is used (unless you changed it, it - defaults to `"float32"`) + dtype: Dtype to use. None means the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it + defaults to `"float32"`). Defaults to `None`. Returns: A PIL Image instance. @@ -298,12 +298,12 @@ def img_to_array(img, data_format=None, dtype=None): Args: img: Input PIL Image instance. data_format: Image data format, can be either `"channels_first"` or - `"channels_last"`. Defaults to `None`, in which case the global + `"channels_last"`. None means the global setting `tf.keras.backend.image_data_format()` is used (unless you - changed it, it defaults to `"channels_last"`). - dtype: Dtype to use. Default to `None`, in which case the global setting - `tf.keras.backend.floatx()` is used (unless you changed it, it - defaults to `"float32"`). + changed it, it defaults to `"channels_last"`). Defaults to `None`. + dtype: Dtype to use. None means the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it + defaults to `"float32"`). Defaults to `None`. Returns: A 3D Numpy array. diff --git a/keras/utils/layer_utils.py b/keras/utils/layer_utils.py index 071bbff62eae..31d42b781f4d 100644 --- a/keras/utils/layer_utils.py +++ b/keras/utils/layer_utils.py @@ -1042,9 +1042,9 @@ def warmstart_embedding_matrix( embedding matrix. new_embeddings_initializer: Initializer for embedding vectors for previously unseen terms to be added to the new embedding matrix (see - `keras.initializers`). Defaults to "uniform". new_embedding matrix + `keras.initializers`). The new embedding matrix needs to be specified with "constant" initializer. - matrix. Default value is None. + None means "uniform". Defaults to `None`. Returns: tf.tensor of remapped embedding layer matrix diff --git a/keras/utils/metrics_utils.py b/keras/utils/metrics_utils.py index 8664657c8bec..e7622b3cda54 100644 --- a/keras/utils/metrics_utils.py +++ b/keras/utils/metrics_utils.py @@ -979,7 +979,7 @@ def sparse_top_k_categorical_matches(y_true, y_pred, k=5): y_true: tensor of true targets. y_pred: tensor of predicted targets. k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. + Defaults to `5`. Returns: Match tensor: 1.0 for label-prediction match, 0.0 for mismatch.
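A round-trip sketch for the `data_format`/`dtype` defaults above (assuming TF 2.9+, where these helpers are exposed under `tf.keras.utils`, and that Pillow is installed):

    import numpy as np
    from tensorflow.keras.utils import array_to_img, img_to_array

    x = np.random.uniform(size=(32, 32, 3)).astype("float32")
    img = array_to_img(x)    # data_format=None -> global image_data_format()
    arr = img_to_array(img)  # dtype=None -> global floatx(), i.e. "float32"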
diff --git a/keras/utils/text_dataset.py b/keras/utils/text_dataset.py index d6c6d9ee5bf9..f05a6e5f9cbc 100644 --- a/keras/utils/text_dataset.py +++ b/keras/utils/text_dataset.py @@ -104,7 +104,7 @@ def text_dataset_from_directory( When `subset="both"`, the utility returns a tuple of two datasets (the training and validation datasets respectively). follow_links: Whether to visits subdirectories pointed to by symlinks. - Defaults to False. + Defaults to `False`. Returns: A `tf.data.Dataset` object. From 8b84cd81e520414be7df84194a3ab042286ad6f8 Mon Sep 17 00:00:00 2001 From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com> Date: Mon, 3 Apr 2023 23:08:35 -0400 Subject: [PATCH 4/4] [*.py] Move `Defaults to` to end of arg docstring and standardise values --- keras/applications/convnext.py | 4 +- keras/applications/efficientnet.py | 4 +- keras/applications/efficientnet_v2.py | 4 +- keras/applications/imagenet_utils.py | 2 +- keras/applications/inception_v3.py | 2 +- keras/applications/mobilenet.py | 2 +- keras/applications/mobilenet_v3.py | 4 +- keras/applications/regnet.py | 4 +- keras/applications/resnet_rs.py | 4 +- keras/backend.py | 5 +- keras/callbacks.py | 6 +- keras/datasets/imdb.py | 4 +- keras/datasets/reuters.py | 4 +- keras/engine/base_layer_utils.py | 4 +- keras/engine/functional.py | 4 +- keras/engine/training.py | 22 +-- keras/engine/training_v1.py | 4 +- keras/estimator/__init__.py | 4 +- keras/feature_column/dense_features.py | 6 +- .../feature_column/sequence_feature_column.py | 4 +- .../convolutional/base_depthwise_conv.py | 6 +- keras/layers/convolutional/conv2d.py | 8 +- .../layers/convolutional/conv2d_transpose.py | 6 +- keras/layers/convolutional/conv3d.py | 10 +- .../layers/convolutional/conv3d_transpose.py | 6 +- .../layers/convolutional/depthwise_conv1d.py | 6 +- .../layers/convolutional/depthwise_conv2d.py | 6 +- .../layers/convolutional/separable_conv2d.py | 7 +- keras/layers/kernelized.py | 5 +- .../locally_connected/locally_connected1d.py | 7 +- .../locally_connected/locally_connected2d.py | 6 +- .../normalization/group_normalization.py | 6 +- .../normalization/layer_normalization.py | 6 +- keras/layers/pooling/average_pooling2d.py | 7 +- keras/layers/pooling/average_pooling3d.py | 7 +- .../pooling/global_average_pooling2d.py | 6 +- .../pooling/global_average_pooling3d.py | 7 +- keras/layers/pooling/global_max_pooling2d.py | 7 +- keras/layers/pooling/global_max_pooling3d.py | 7 +- keras/layers/pooling/max_pooling2d.py | 7 +- keras/layers/pooling/max_pooling3d.py | 7 +- .../preprocessing/text_vectorization.py | 2 +- .../regularization/spatial_dropout2d.py | 6 +- .../regularization/spatial_dropout3d.py | 6 +- keras/layers/reshaping/cropping2d.py | 7 +- keras/layers/reshaping/cropping3d.py | 7 +- keras/layers/reshaping/flatten.py | 7 +- keras/layers/reshaping/up_sampling2d.py | 7 +- keras/layers/reshaping/up_sampling3d.py | 7 +- keras/layers/reshaping/zero_padding2d.py | 7 +- keras/layers/reshaping/zero_padding3d.py | 7 +- keras/layers/rnn/base_conv_lstm.py | 14 +- keras/layers/rnn/conv_lstm1d.py | 7 +- keras/layers/rnn/conv_lstm2d.py | 7 +- keras/layers/rnn/conv_lstm3d.py | 7 +- keras/layers/rnn/gru.py | 10 +- keras/layers/rnn/legacy_cells.py | 3 +- keras/layers/rnn/lstm.py | 10 +- keras/legacy_tf_layers/base.py | 4 +- keras/legacy_tf_layers/variable_scope_shim.py | 8 +- keras/losses.py | 186 +++++++++--------- keras/models/cloning.py | 3 +- keras/models/sharpness_aware_minimization.py | 8 +- keras/optimizers/optimizer.py | 33 ++-- 
keras/optimizers/optimizer_v1.py | 15 +- keras/optimizers/rmsprop.py | 3 +- keras/preprocessing/image.py | 12 +- keras/preprocessing/text.py | 4 +- keras/saving/legacy/save.py | 2 +- keras/saving/legacy/saved_model/json_utils.py | 4 +- keras/saving/object_registration.py | 2 +- keras/saving/serialization_lib.py | 2 +- keras/utils/data_utils.py | 14 +- keras/utils/dataset_utils.py | 4 +- keras/utils/generic_utils.py | 2 +- keras/utils/image_utils.py | 8 +- keras/utils/layer_utils.py | 7 +- keras/utils/losses_utils.py | 9 +- 78 files changed, 362 insertions(+), 326 deletions(-) diff --git a/keras/applications/convnext.py b/keras/applications/convnext.py index 5eb983d90a09..7e5e209bf200 100644 --- a/keras/applications/convnext.py +++ b/keras/applications/convnext.py @@ -756,8 +756,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/applications/efficientnet.py b/keras/applications/efficientnet.py index 775a19153ec7..cbadfad14d35 100644 --- a/keras/applications/efficientnet.py +++ b/keras/applications/efficientnet.py @@ -854,8 +854,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/applications/efficientnet_v2.py b/keras/applications/efficientnet_v2.py index 82a7b3a6efd6..715c8f5281ab 100644 --- a/keras/applications/efficientnet_v2.py +++ b/keras/applications/efficientnet_v2.py @@ -1345,8 +1345,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py index 12b745c8229c..3aafbad0a174 100644 --- a/keras/applications/imagenet_utils.py +++ b/keras/applications/imagenet_utils.py @@ -58,7 +58,7 @@ behaviour, `numpy.copy(x)` can be used. data_format: Optional data format of the image tensor/array. None, means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode} + (unless you changed it, it uses "channels_last").{mode} Defaults to `None`. Returns: diff --git a/keras/applications/inception_v3.py b/keras/applications/inception_v3.py index 381192646fc0..d3ab844e16a9 100644 --- a/keras/applications/inception_v3.py +++ b/keras/applications/inception_v3.py @@ -88,7 +88,7 @@ def InceptionV3( or the path to the weights file to be loaded. Defaults to `imagenet`. input_tensor: Optional Keras tensor (i.e. 
output of `layers.Input()`) to use as image input for the model. `input_tensor` is useful for - sharing inputs between multiple different networks. Defaults to None. + sharing inputs between multiple different networks. Defaults to `None`. input_shape: Optional shape tuple, only to be specified if `include_top` is False (otherwise the input shape has to be `(299, 299, 3)` (with `channels_last` data format) diff --git a/keras/applications/mobilenet.py b/keras/applications/mobilenet.py index 0232fd837ce8..e3a0cdd09e18 100644 --- a/keras/applications/mobilenet.py +++ b/keras/applications/mobilenet.py @@ -143,7 +143,7 @@ def MobileNet( `imagenet`. input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to use as image input for the model. `input_tensor` is useful for sharing - inputs between multiple different networks. Defaults to None. + inputs between multiple different networks. Defaults to `None`. pooling: Optional pooling mode for feature extraction when `include_top` is `False`. - `None` (default) means that the output of the model will be diff --git a/keras/applications/mobilenet_v3.py b/keras/applications/mobilenet_v3.py index a68a67385746..1c46f3fa20de 100644 --- a/keras/applications/mobilenet_v3.py +++ b/keras/applications/mobilenet_v3.py @@ -681,8 +681,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/applications/regnet.py b/keras/applications/regnet.py index e05071533712..f40c548a196a 100644 --- a/keras/applications/regnet.py +++ b/keras/applications/regnet.py @@ -1821,8 +1821,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/applications/resnet_rs.py b/keras/applications/resnet_rs.py index a7c29b7a61a3..8a72652c2370 100644 --- a/keras/applications/resnet_rs.py +++ b/keras/applications/resnet_rs.py @@ -961,8 +961,8 @@ def preprocess_input(x, data_format=None): x: A floating point `numpy.array` or a `tf.Tensor`. data_format: Optional data format of the image tensor/array. `None` means the global setting `tf.keras.backend.image_data_format()` is used - (unless you changed it, it defaults to "channels_last").{mode}. - Defaults to None + (unless you changed it, it uses "channels_last").{mode}. + Defaults to `None`. Returns: Unchanged `numpy.array` or `tf.Tensor`. diff --git a/keras/backend.py b/keras/backend.py index fec66940c68f..310361d950ac 100644 --- a/keras/backend.py +++ b/keras/backend.py @@ -1901,8 +1901,8 @@ class RandomGenerator(tf.__internal__.tracking.AutoTrackable): When `rng_type` is "legacy_stateful", the seed will be passed down to stateful random ops. rng_type: Type of RNG to use, one of "stateful", "stateless", - "legacy_stateful". 
It defaults to "stateful" if - `enable_tf_random_generator` has been activated, or to + "legacy_stateful". When `None` it uses "stateful" if + `enable_tf_random_generator` has been activated, or "legacy_stateful" otherwise. - When using "stateless", the random ops outputs are constant (the same inputs result in the same outputs). @@ -1913,6 +1913,7 @@ class RandomGenerator(tf.__internal__.tracking.AutoTrackable): - "legacy_stateful" is backed by TF1 stateful RNG ops (e.g. `tf.random.uniform`), while "stateful" is backed by TF2 APIs (e.g. `tf.random.Generator.uniform`). + Defaults to `None`. """ RNG_STATELESS = "stateless" diff --git a/keras/callbacks.py b/keras/callbacks.py index f0f47a4d90af..f46f1975a180 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -1015,7 +1015,7 @@ class ProgbarLogger(Callback): should *not* be averaged over an epoch. Metrics in this list will be logged as-is. All others will be averaged over time (e.g. loss, etc). - If not provided, defaults to the `Model`'s metrics. + When None, uses the `Model`'s metrics. Defaults to `None`. Raises: ValueError: In case of invalid `count_mode`. @@ -2276,8 +2276,8 @@ def keras_model_summary(name, data, step=None): be this name prefixed by any active name scopes. data: A Keras Model to write. step: Explicit `int64`-castable monotonic step value for this summary. If - omitted, this defaults to `tf.summary.experimental.get_step()`, which - must not be None. + None, this uses `tf.summary.experimental.get_step()`, which + must not be None. Defaults to `None`. Returns: True on success, or False if no summary was written because no default diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py index 0dd660b5215c..1e61771ad79b 100644 --- a/keras/datasets/imdb.py +++ b/keras/datasets/imdb.py @@ -58,14 +58,14 @@ def load_data( ranked by how often they occur (in the training set) and only the `num_words` most frequent words are kept. Any less frequent word will appear as `oov_char` value in the sequence data. If None, - all words are kept. Defaults to None + all words are kept. Defaults to `None`. skip_top: skip the top N most frequently occurring words (which may not be informative). These words will appear as `oov_char` value in the dataset. When 0, no words are skipped. Defaults to `0`. maxlen: int or None. Maximum sequence length. Any longer sequence will be truncated. None, means no truncation. - Defaults to None + Defaults to `None`. seed: int. Seed for reproducible data shuffling. start_char: int. The start of a sequence will be marked with this character. 0 is usually the padding character. Defaults to `1`. diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py index ca7ca3a87d59..19b27949d84e 100644 --- a/keras/datasets/reuters.py +++ b/keras/datasets/reuters.py @@ -65,14 +65,14 @@ def load_data( ranked by how often they occur (in the training set) and only the `num_words` most frequent words are kept. Any less frequent word will appear as `oov_char` value in the sequence data. If None, - all words are kept. Defaults to None + all words are kept. Defaults to `None`. skip_top: skip the top N most frequently occurring words (which may not be informative). These words will appear as `oov_char` value in the dataset. 0 means no words are skipped. Defaults to 0 maxlen: int or None. Maximum sequence length. Any longer sequence will be truncated. None means no truncation. - Defaults to None + Defaults to `None`. test_split: Float between 0 and 1. Fraction of the dataset to be used as test data. 
0.2 means that 20% of the dataset is used as test data. Defaults to 0.2 diff --git a/keras/engine/base_layer_utils.py b/keras/engine/base_layer_utils.py index 8c5062a59665..8e3de3d4df2e 100644 --- a/keras/engine/base_layer_utils.py +++ b/keras/engine/base_layer_utils.py @@ -98,8 +98,8 @@ def make_variable( or "non_trainable_variables" (e.g. BatchNorm mean, stddev). Note, if the current variable scope is marked as non-trainable then this parameter is ignored and any added variables are also - marked as non-trainable. `trainable` defaults to `True` unless - `synchronization` is set to `ON_READ`. + marked as non-trainable. `trainable` becomes `True` unless + `synchronization` is set to `ON_READ`. Defaults to `None`. caching_device: Passed to `tf.Variable`. validate_shape: Passed to `tf.Variable`. constraint: Constraint instance (callable). diff --git a/keras/engine/functional.py b/keras/engine/functional.py index 3bb31164d774..d17d429f3fd5 100644 --- a/keras/engine/functional.py +++ b/keras/engine/functional.py @@ -1647,8 +1647,8 @@ def __init__(self, module, method_name=None, **kwargs): Args: module: The `tf.Module` instance to be wrapped. method_name: (Optional) str. The name of the method to use as the - forward pass of the module. If not set, defaults to '__call__' if - defined, or 'call'. + forward pass of the module. If not set, becomes '__call__' if + defined, or 'call'. Defaults to `None`. **kwargs: Additional keywrod arguments. See `tf.keras.layers.Layer`. Raises: diff --git a/keras/engine/training.py b/keras/engine/training.py index 562038de4f50..71111202bec7 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -1459,11 +1459,11 @@ def fit( of index `epochs` is reached. verbose: 'auto', 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch. - 'auto' defaults to 1 for most cases, but 2 when used with + 'auto' becomes 1 for most cases, but 2 when used with `ParameterServerStrategy`. Note that the progress bar is not particularly useful when logged to a file, so verbose=2 is recommended when not running interactively (eg, in a production - environment). + environment). Defaults to 'auto'. callbacks: List of `keras.callbacks.Callback` instances. List of callbacks to apply during training. See `tf.keras.callbacks`. Note @@ -2061,11 +2061,11 @@ def evaluate( they generate batches). verbose: `"auto"`, 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = single line. - `"auto"` defaults to 1 for most cases, and to 2 when used with + `"auto"` becomes 1 for most cases, and to 2 when used with `ParameterServerStrategy`. Note that the progress bar is not particularly useful when logged to a file, so `verbose=2` is recommended when not running interactively (e.g. in a production - environment). + environment). Defaults to 'auto'. sample_weight: Optional Numpy array of weights for the test samples, used for weighting the loss function. You can either pass a flat (1D) Numpy array with the same length as the input samples @@ -2421,11 +2421,11 @@ def predict( (since they generate batches). verbose: `"auto"`, 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = single line. - `"auto"` defaults to 1 for most cases, and to 2 when used with + `"auto"` becomes 1 for most cases, and to 2 when used with `ParameterServerStrategy`. Note that the progress bar is not particularly useful when logged to a file, so `verbose=2` is recommended when not running interactively (e.g. in a production - environment). + environment). Defaults to 'auto'. 
steps: Total number of steps (batches of samples) before declaring the prediction round finished. Ignored with the default value of `None`. If x is a `tf.data` @@ -3053,7 +3053,7 @@ def save_weights( target location, or provide the user with a manual prompt. save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or '.keras' will default to HDF5 if `save_format` is `None`. - Otherwise `None` defaults to 'tf'. + Otherwise, `None` becomes 'tf'. Defaults to `None`. options: Optional `tf.train.CheckpointOptions` object that specifies options for saving weights. @@ -3368,17 +3368,17 @@ def summary( (e.g. set this to adapt the display to different terminal window sizes). positions: Relative or absolute positions of log elements - in each line. If not provided, - defaults to `[0.3, 0.6, 0.70, 1.]` + in each line. If not provided, becomes + `[0.3, 0.6, 0.70, 1.]`. Defaults to `None`. print_fn: Print function to use. By default, prints to `stdout`. If `stdout` doesn't work in your environment, change to `print`. It will be called on each line of the summary. You can set it to a custom function in order to capture the string summary. expand_nested: Whether to expand the nested models. - If not provided, defaults to `False`. + Defaults to `False`. show_trainable: Whether to show if a layer is trainable. - If not provided, defaults to `False`. + Defaults to `False`. layer_range: a list or tuple of 2 strings, which is the starting layer name and ending layer name (both inclusive) indicating the range of layers to be printed diff --git a/keras/engine/training_v1.py b/keras/engine/training_v1.py index 097663224096..a5ef55a4fc20 100644 --- a/keras/engine/training_v1.py +++ b/keras/engine/training_v1.py @@ -269,10 +269,10 @@ def compile( output names (strings) to scalar coefficients. sample_weight_mode: If you need to do timestep-wise sample weighting (2D weights), set this to `"temporal"`. - `None` defaults to sample-wise weights (1D). + `None` means sample-wise weights (1D). If the model has multiple outputs, you can use a different `sample_weight_mode` on each output by passing a - dictionary or a list of modes. + dictionary or a list of modes. Defaults to `None`. weighted_metrics: List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing. target_tensors: By default, Keras will create placeholders for the diff --git a/keras/estimator/__init__.py b/keras/estimator/__init__.py index dc8a6bc468bf..ff0d16822425 100644 --- a/keras/estimator/__init__.py +++ b/keras/estimator/__init__.py @@ -120,8 +120,8 @@ def input_fn(): checkpoint_format: Sets the format of the checkpoint saved by the estimator when training. May be `saver` or `checkpoint`, depending on whether to save checkpoints from `tf.train.Saver` or - `tf.train.Checkpoint`. This argument currently defaults to `saver`. When - 2.0 is released, the default will be `checkpoint`. Estimators use + `tf.train.Checkpoint`. Defaults to `saver`; when + 2.0 is released, the default will become `checkpoint`. Estimators use name-based `tf.train.Saver` checkpoints, while Keras models use object-based checkpoints from `tf.train.Checkpoint`. 
Currently, saving object-based checkpoints from `model_to_estimator` is only supported by diff --git a/keras/feature_column/dense_features.py b/keras/feature_column/dense_features.py index fb8c801e65c5..f5ae664581cc 100644 --- a/keras/feature_column/dense_features.py +++ b/keras/feature_column/dense_features.py @@ -90,7 +90,7 @@ def __init__( trainable: Boolean, whether the layer's variables will be updated via gradient descent during training. name: Name to give to the DenseFeatures. - partitioner: Partitioner for input layer. Defaults to None. + partitioner: Partitioner for input layer. Defaults to `None`. **kwargs: Keyword arguments to construct a layer. Raises: @@ -150,8 +150,8 @@ def call(self, features, cols_to_output_tensors=None, training=None): method of any `FeatureColumn` that takes a `training` argument. For example, if a `FeatureColumn` performed dropout, the column could expose a `training` argument to control whether the dropout should - be applied. If `None`, defaults to - `tf.keras.backend.learning_phase()`. + be applied. If `None`, becomes `tf.keras.backend.learning_phase()`. + Defaults to `None`. Returns: diff --git a/keras/feature_column/sequence_feature_column.py b/keras/feature_column/sequence_feature_column.py index 5fd05fdd6656..89e4f5cfdb76 100644 --- a/keras/feature_column/sequence_feature_column.py +++ b/keras/feature_column/sequence_feature_column.py @@ -122,8 +122,8 @@ def call(self, features, training=None): method of any `FeatureColumn` that takes a `training` argument. For example, if a `FeatureColumn` performed dropout, the column could expose a `training` argument to control whether the dropout should - be applied. If `None`, defaults to - `tf.keras.backend.learning_phase()`. + be applied. If `None`, becomes `tf.keras.backend.learning_phase()`. + Defaults to `None`. Returns: diff --git a/keras/layers/convolutional/base_depthwise_conv.py b/keras/layers/convolutional/base_depthwise_conv.py index 425586dc04bd..8dddbc5497d0 100644 --- a/keras/layers/convolutional/base_depthwise_conv.py +++ b/keras/layers/convolutional/base_depthwise_conv.py @@ -65,10 +65,10 @@ class DepthwiseConv(Conv): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with - shape `(batch_size, channels, height, width)`. It defaults to the + shape `(batch_size, channels, height, width)`. When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - 'channels_last'. + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of 2 integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` diff --git a/keras/layers/convolutional/conv2d.py b/keras/layers/convolutional/conv2d.py index 2c44cad555d1..8a961f23dc0f 100644 --- a/keras/layers/convolutional/conv2d.py +++ b/keras/layers/convolutional/conv2d.py @@ -101,11 +101,11 @@ class Conv2D(Conv): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with - shape `(batch_size, channels, height, width)`. It defaults to the + shape `(batch_size, channels, height, width)`. 
When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - `channels_last`. Note that the `channels_first` format is currently not - supported by TensorFlow on CPU. + `~/.keras/keras.json` (if exists) else 'channels_last'. + Note that the `channels_first` format is currently not + supported by TensorFlow on CPU. Defaults to 'channels_last'. dilation_rate: an integer or tuple/list of 2 integers, specifying the dilation rate to use for dilated convolution. Can be a single integer to specify the same value for all spatial dimensions. Currently, specifying diff --git a/keras/layers/convolutional/conv2d_transpose.py b/keras/layers/convolutional/conv2d_transpose.py index 5003cabbc08c..772b761e95d8 100644 --- a/keras/layers/convolutional/conv2d_transpose.py +++ b/keras/layers/convolutional/conv2d_transpose.py @@ -82,9 +82,9 @@ class Conv2DTranspose(Conv2D): `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses `image_data_format` value found in your Keras + config file at `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: an integer, specifying the dilation rate for all spatial dimensions for dilated convolution. Specifying different dilation rates for different dimensions is not supported. diff --git a/keras/layers/convolutional/conv3d.py b/keras/layers/convolutional/conv3d.py index bff96123d1fd..bec540cf39a3 100644 --- a/keras/layers/convolutional/conv3d.py +++ b/keras/layers/convolutional/conv3d.py @@ -83,11 +83,11 @@ class Conv3D(Conv): `channels_last` corresponds to inputs with shape `batch_shape + (spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `batch_shape + - (channels, spatial_dim1, spatial_dim2, spatial_dim3)`. It defaults to - the `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". Note that the `channels_first` format is currently not - supported by TensorFlow on CPU. + (channels, spatial_dim1, spatial_dim2, spatial_dim3)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. Note that the + `channels_first` format is currently not supported by TensorFlow on CPU. + Defaults to 'channels_last'. dilation_rate: an integer or tuple/list of 3 integers, specifying the dilation rate to use for dilated convolution. Can be a single integer to specify the same value for all spatial dimensions. Currently, specifying diff --git a/keras/layers/convolutional/conv3d_transpose.py b/keras/layers/convolutional/conv3d_transpose.py index d5778d2ea43e..dcb9b54a6665 100644 --- a/keras/layers/convolutional/conv3d_transpose.py +++ b/keras/layers/convolutional/conv3d_transpose.py @@ -82,9 +82,9 @@ class Conv3DTranspose(Conv3D): `(batch_size, depth, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, depth, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. 
+ When unspecified, uses `image_data_format` value found in your Keras + config file at `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: an integer or tuple/list of 3 integers, specifying the dilation rate to use for dilated convolution. Can be a single integer to specify the same value for diff --git a/keras/layers/convolutional/depthwise_conv1d.py b/keras/layers/convolutional/depthwise_conv1d.py index 49de8d3a426e..b1cca7a37353 100644 --- a/keras/layers/convolutional/depthwise_conv1d.py +++ b/keras/layers/convolutional/depthwise_conv1d.py @@ -67,10 +67,10 @@ class DepthwiseConv1D(DepthwiseConv): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with - shape `(batch_size, channels, height, width)`. It defaults to the + shape `(batch_size, channels, height, width)`. When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - 'channels_last'. + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: A single integer, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. diff --git a/keras/layers/convolutional/depthwise_conv2d.py b/keras/layers/convolutional/depthwise_conv2d.py index 4ff8de316ab5..24edea729669 100644 --- a/keras/layers/convolutional/depthwise_conv2d.py +++ b/keras/layers/convolutional/depthwise_conv2d.py @@ -68,10 +68,10 @@ class DepthwiseConv2D(DepthwiseConv): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with - shape `(batch_size, channels, height, width)`. It defaults to the + shape `(batch_size, channels, height, width)`. When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - 'channels_last'. + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of 2 integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` diff --git a/keras/layers/convolutional/separable_conv2d.py b/keras/layers/convolutional/separable_conv2d.py index f0d626331a5d..76d9038f0153 100644 --- a/keras/layers/convolutional/separable_conv2d.py +++ b/keras/layers/convolutional/separable_conv2d.py @@ -70,9 +70,10 @@ class SeparableConv2D(SeparableConv): `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of 2 integers, specifying the dilation rate to use for dilated convolution. 
depth_multiplier: The number of depthwise convolution output channels diff --git a/keras/layers/kernelized.py b/keras/layers/kernelized.py index 95e74fa931c1..f8114bbb7c74 100644 --- a/keras/layers/kernelized.py +++ b/keras/layers/kernelized.py @@ -126,8 +126,8 @@ class RandomFourierFeatures(base_layer.Layer): factor of the corresponding kernel approximated by the layer (see concrete definitions above). When provided, it should be a positive float. If None, a default value is used: if the kernel initializer is - set to "gaussian", `scale` defaults to `sqrt(input_dim / 2)`, otherwise, - it defaults to 1.0. Both the approximation error of the kernel and the + set to "gaussian", `scale` becomes `sqrt(input_dim / 2)`, otherwise, + it becomes 1.0. Both the approximation error of the kernel and the classification quality are sensitive to this parameter. If `trainable` is set to `True`, this parameter is learned end-to-end during training and the provided value serves as the initial value. @@ -135,6 +135,7 @@ class RandomFourierFeatures(base_layer.Layer): by making `scale` trainable, the resulting optimization problem is no longer convex (even if the loss function used by the linear model is convex). + Defaults to `None`. trainable: Whether the scaling parameter of the layer should be trainable. Defaults to `False`. name: String, name to use for this layer. diff --git a/keras/layers/locally_connected/locally_connected1d.py b/keras/layers/locally_connected/locally_connected1d.py index 3815bc2e8648..32fe80fee560 100644 --- a/keras/layers/locally_connected/locally_connected1d.py +++ b/keras/layers/locally_connected/locally_connected1d.py @@ -67,9 +67,10 @@ class LocallyConnected1D(Layer): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch, length, channels)` while `channels_first` corresponds to inputs with shape - `(batch, channels, length)`. It defaults to the `image_data_format` - value found in your Keras config file at `~/.keras/keras.json`. If you - never set it, then it will be "channels_last". + `(batch, channels, length)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. activation: Activation function to use. If you don't specify anything, no activation is applied (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. diff --git a/keras/layers/locally_connected/locally_connected2d.py b/keras/layers/locally_connected/locally_connected2d.py index 5886b7b449fa..fce8c32e2ce4 100644 --- a/keras/layers/locally_connected/locally_connected2d.py +++ b/keras/layers/locally_connected/locally_connected2d.py @@ -74,10 +74,10 @@ class LocallyConnected2D(Layer): `channels_last` corresponds to inputs with shape `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape - `(batch, channels, height, width)`. It defaults to the + `(batch, channels, height, width)`. When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. activation: Activation function to use. If you don't specify anything, no activation is applied (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. 
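[Editor's aside, not part of the patch: the `data_format` hunks above all standardise one resolution rule: an explicit `data_format` argument wins; otherwise the `image_data_format` entry in `~/.keras/keras.json` applies; absent that, 'channels_last'. A minimal sketch of the rule, using only public TF APIs (`tf.keras.backend.image_data_format()` and the `data_format` attribute Keras conv layers expose):]

```python
import tensorflow as tf

# Global default, read from ~/.keras/keras.json; "channels_last" unless
# that config file overrides it.
print(tf.keras.backend.image_data_format())

# A layer built without `data_format` picks up the global value ...
conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3)
print(conv.data_format)  # e.g. "channels_last"

# ... while an explicit argument always takes precedence. (As the
# docstrings note, `channels_first` convolutions are currently not
# supported by TensorFlow on CPU, so this layer should only be run on GPU.)
conv_cf = tf.keras.layers.Conv2D(
    filters=8, kernel_size=3, data_format="channels_first"
)
print(conv_cf.data_format)  # "channels_first"
```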
diff --git a/keras/layers/normalization/group_normalization.py b/keras/layers/normalization/group_normalization.py index 010d07aae96c..5d883b8fd260 100644 --- a/keras/layers/normalization/group_normalization.py +++ b/keras/layers/normalization/group_normalization.py @@ -52,9 +52,9 @@ class GroupNormalization(Layer): the range [1, N] where N is the input dimension. The input dimension must be divisible by the number of groups. Defaults to `32`. axis: Integer or List/Tuple. The axis or axes to normalize across. - Typically this is the features axis/axes. The left-out axes are - typically the batch axis/axes. This argument defaults to `-1`, the last - dimension in the input. + Typically, this is the features axis/axes. The left-out axes are + typically the batch axis/axes. `-1` is the last dimension in the + input. Defaults to `-1`. epsilon: Small float added to variance to avoid dividing by zero. Defaults to 1e-3 center: If True, add offset of `beta` to normalized tensor. If False, diff --git a/keras/layers/normalization/layer_normalization.py b/keras/layers/normalization/layer_normalization.py index 4b550b7c3d87..0227bdb27630 100644 --- a/keras/layers/normalization/layer_normalization.py +++ b/keras/layers/normalization/layer_normalization.py @@ -120,9 +120,9 @@ class LayerNormalization(Layer): Args: axis: Integer or List/Tuple. The axis or axes to normalize across. - Typically this is the features axis/axes. The left-out axes are - typically the batch axis/axes. This argument defaults to `-1`, the last - dimension in the input. + Typically, this is the features axis/axes. The left-out axes are + typically the batch axis/axes. `-1` is the last dimension in the + input. Defaults to `-1`. epsilon: Small float added to variance to avoid dividing by zero. Defaults to 1e-3 center: If True, add offset of `beta` to normalized tensor. If False, diff --git a/keras/layers/pooling/average_pooling2d.py b/keras/layers/pooling/average_pooling2d.py index b818ed7e3a87..662ec99016e6 100644 --- a/keras/layers/pooling/average_pooling2d.py +++ b/keras/layers/pooling/average_pooling2d.py @@ -108,9 +108,10 @@ class AveragePooling2D(Pooling2D): `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: - If `data_format='channels_last'`: diff --git a/keras/layers/pooling/average_pooling3d.py b/keras/layers/pooling/average_pooling3d.py index 41faa234aeb0..9d1177e6c68d 100644 --- a/keras/layers/pooling/average_pooling3d.py +++ b/keras/layers/pooling/average_pooling3d.py @@ -48,9 +48,10 @@ class AveragePooling3D(Pooling3D): `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. 
Input shape: - If `data_format='channels_last'`: diff --git a/keras/layers/pooling/global_average_pooling2d.py b/keras/layers/pooling/global_average_pooling2d.py index beb7038122c0..e219e2414081 100644 --- a/keras/layers/pooling/global_average_pooling2d.py +++ b/keras/layers/pooling/global_average_pooling2d.py @@ -44,9 +44,9 @@ class GlobalAveragePooling2D(GlobalPooling2D): `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses `image_data_format` value found + in your Keras config file at `~/.keras/keras.json` + (if exists) else 'channels_last'. Defaults to 'channels_last'. keepdims: A boolean, whether to keep the spatial dimensions or not. If `keepdims` is `False` (default), the rank of the tensor is reduced for spatial dimensions. diff --git a/keras/layers/pooling/global_average_pooling3d.py b/keras/layers/pooling/global_average_pooling3d.py index b2819c55164d..04b95667ed8e 100644 --- a/keras/layers/pooling/global_average_pooling3d.py +++ b/keras/layers/pooling/global_average_pooling3d.py @@ -36,9 +36,10 @@ class GlobalAveragePooling3D(GlobalPooling3D): `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. keepdims: A boolean, whether to keep the spatial dimensions or not. If `keepdims` is `False` (default), the rank of the tensor is reduced for spatial dimensions. diff --git a/keras/layers/pooling/global_max_pooling2d.py b/keras/layers/pooling/global_max_pooling2d.py index 3ef2ee74a544..77ef11b3abdd 100644 --- a/keras/layers/pooling/global_max_pooling2d.py +++ b/keras/layers/pooling/global_max_pooling2d.py @@ -42,9 +42,10 @@ class GlobalMaxPooling2D(GlobalPooling2D): `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. keepdims: A boolean, whether to keep the spatial dimensions or not. If `keepdims` is `False` (default), the rank of the tensor is reduced for spatial dimensions. diff --git a/keras/layers/pooling/global_max_pooling3d.py b/keras/layers/pooling/global_max_pooling3d.py index ee153d9c3cdd..f5385fc9b414 100644 --- a/keras/layers/pooling/global_max_pooling3d.py +++ b/keras/layers/pooling/global_max_pooling3d.py @@ -34,9 +34,10 @@ class GlobalMaxPooling3D(GlobalPooling3D): `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. 
- If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. keepdims: A boolean, whether to keep the spatial dimensions or not. If `keepdims` is `False` (default), the rank of the tensor is reduced for spatial dimensions. diff --git a/keras/layers/pooling/max_pooling2d.py b/keras/layers/pooling/max_pooling2d.py index 7378d3d91a90..f21ab07f2142 100644 --- a/keras/layers/pooling/max_pooling2d.py +++ b/keras/layers/pooling/max_pooling2d.py @@ -127,9 +127,10 @@ class MaxPooling2D(Pooling2D): `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: - If `data_format='channels_last'`: diff --git a/keras/layers/pooling/max_pooling3d.py b/keras/layers/pooling/max_pooling3d.py index b0455dbf4d4e..64b2575732eb 100644 --- a/keras/layers/pooling/max_pooling3d.py +++ b/keras/layers/pooling/max_pooling3d.py @@ -48,9 +48,10 @@ class MaxPooling3D(Pooling3D): `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: - If `data_format='channels_last'`: diff --git a/keras/layers/preprocessing/text_vectorization.py b/keras/layers/preprocessing/text_vectorization.py index b0d57265cb15..89f14bc55f2b 100644 --- a/keras/layers/preprocessing/text_vectorization.py +++ b/keras/layers/preprocessing/text_vectorization.py @@ -152,7 +152,7 @@ class TextVectorization(base_preprocessing_layer.PreprocessingLayer): have its time dimension padded or truncated to exactly `output_sequence_length` values, resulting in a tensor of shape `(batch_size, output_sequence_length)` regardless of how many tokens - resulted from the splitting step. Defaults to None. + resulted from the splitting step. Defaults to `None`. pad_to_max_tokens: Only valid in `"multi_hot"`, `"count"`, and `"tf_idf"` modes. If True, the output will have its feature axis padded to `max_tokens` even if the number of unique tokens in the vocabulary is diff --git a/keras/layers/regularization/spatial_dropout2d.py b/keras/layers/regularization/spatial_dropout2d.py index 4593d9220292..e913c132c682 100644 --- a/keras/layers/regularization/spatial_dropout2d.py +++ b/keras/layers/regularization/spatial_dropout2d.py @@ -41,10 +41,10 @@ class SpatialDropout2D(Dropout): rate: Float between 0 and 1. Fraction of the input units to drop. data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode, the channels dimension (the depth) is at index 1, in - 'channels_last' mode is it at index 3. It defaults to the + 'channels_last' mode is it at index 3. 
When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Call arguments: inputs: A 4D tensor. training: Python boolean indicating whether the layer should behave in diff --git a/keras/layers/regularization/spatial_dropout3d.py b/keras/layers/regularization/spatial_dropout3d.py index fb54f924c93b..d7dff8724e0b 100644 --- a/keras/layers/regularization/spatial_dropout3d.py +++ b/keras/layers/regularization/spatial_dropout3d.py @@ -41,10 +41,10 @@ class SpatialDropout3D(Dropout): rate: Float between 0 and 1. Fraction of the input units to drop. data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode, the channels dimension (the depth) is at index 1, in - 'channels_last' mode is it at index 4. It defaults to the + 'channels_last' mode it is at index 4. When unspecified, uses `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Call arguments: inputs: A 5D tensor. training: Python boolean indicating whether the layer should behave in diff --git a/keras/layers/reshaping/cropping2d.py b/keras/layers/reshaping/cropping2d.py index d09e5d16a7c2..118de07ee54e 100644 --- a/keras/layers/reshaping/cropping2d.py +++ b/keras/layers/reshaping/cropping2d.py @@ -57,9 +57,10 @@ class Cropping2D(Layer): `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: 4D tensor with shape: diff --git a/keras/layers/reshaping/cropping3d.py b/keras/layers/reshaping/cropping3d.py index 63e31ec7aaa3..a7d1a933e7ca 100644 --- a/keras/layers/reshaping/cropping3d.py +++ b/keras/layers/reshaping/cropping3d.py @@ -54,9 +54,10 @@ class Cropping3D(Layer): `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: 5D tensor with shape: diff --git a/keras/layers/reshaping/flatten.py b/keras/layers/reshaping/flatten.py index 5c66a6048163..51d3a4fe2a49 100644 --- a/keras/layers/reshaping/flatten.py +++ b/keras/layers/reshaping/flatten.py @@ -43,9 +43,10 @@ class Flatten(Layer): `channels_last` corresponds to inputs with shape `(batch, ..., channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, ...)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". 
+ When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Example: diff --git a/keras/layers/reshaping/up_sampling2d.py b/keras/layers/reshaping/up_sampling2d.py index 9a916567a56b..d6a6ff8c0c59 100644 --- a/keras/layers/reshaping/up_sampling2d.py +++ b/keras/layers/reshaping/up_sampling2d.py @@ -64,9 +64,10 @@ class UpSampling2D(Layer): `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. interpolation: A string, one of `"area"`, `"bicubic"`, `"bilinear"`, `"gaussian"`, `"lanczos3"`, `"lanczos5"`, `"mitchellcubic"`, `"nearest"`. diff --git a/keras/layers/reshaping/up_sampling3d.py b/keras/layers/reshaping/up_sampling3d.py index ae6740da00b8..9482ea1b530c 100644 --- a/keras/layers/reshaping/up_sampling3d.py +++ b/keras/layers/reshaping/up_sampling3d.py @@ -51,9 +51,10 @@ class UpSampling3D(Layer): `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: 5D tensor with shape: diff --git a/keras/layers/reshaping/zero_padding2d.py b/keras/layers/reshaping/zero_padding2d.py index 2615da40739a..a4e4c3e6fb57 100644 --- a/keras/layers/reshaping/zero_padding2d.py +++ b/keras/layers/reshaping/zero_padding2d.py @@ -74,9 +74,10 @@ class ZeroPadding2D(Layer): `(batch_size, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. Input shape: 4D tensor with shape: diff --git a/keras/layers/reshaping/zero_padding3d.py b/keras/layers/reshaping/zero_padding3d.py index c51668dcbb97..147118afd52e 100644 --- a/keras/layers/reshaping/zero_padding3d.py +++ b/keras/layers/reshaping/zero_padding3d.py @@ -57,9 +57,10 @@ class ZeroPadding3D(Layer): `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` while `channels_first` corresponds to inputs with shape `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. 
Input shape: 5D tensor with shape: diff --git a/keras/layers/rnn/base_conv_lstm.py b/keras/layers/rnn/base_conv_lstm.py index 49f52a71c801..b3280d5ac63b 100644 --- a/keras/layers/rnn/base_conv_lstm.py +++ b/keras/layers/rnn/base_conv_lstm.py @@ -45,9 +45,10 @@ class ConvLSTMCell(DropoutRNNCellMixin, base_layer.BaseRandomLayer): up/down of the input such that output has the same height/width dimension as the input. data_format: A string, one of `channels_last` (default) or - `channels_first`. It defaults to the `image_data_format` value found in - your Keras config file at `~/.keras/keras.json`. If you never set it, - then it will be "channels_last". + `channels_first`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` @@ -383,9 +384,10 @@ class ConvLSTM(ConvRNN): `(batch, time, ..., channels)` while `channels_first` corresponds to inputs with shape `(batch, time, channels, ...)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is diff --git a/keras/layers/rnn/conv_lstm1d.py b/keras/layers/rnn/conv_lstm1d.py index 5566b66808a8..96d3c2837416 100644 --- a/keras/layers/rnn/conv_lstm1d.py +++ b/keras/layers/rnn/conv_lstm1d.py @@ -44,9 +44,10 @@ class ConvLSTM1D(ConvLSTM): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch, time, ..., channels)` while `channels_first` corresponds to inputs with shape - `(batch, time, channels, ...)`. It defaults to the `image_data_format` - value found in your Keras config file at `~/.keras/keras.json`. If you - never set it, then it will be "channels_last". + `(batch, time, channels, ...)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` diff --git a/keras/layers/rnn/conv_lstm2d.py b/keras/layers/rnn/conv_lstm2d.py index d62e8828bc0b..668c9da5e4a9 100644 --- a/keras/layers/rnn/conv_lstm2d.py +++ b/keras/layers/rnn/conv_lstm2d.py @@ -44,9 +44,10 @@ class ConvLSTM2D(ConvLSTM): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch, time, ..., channels)` while `channels_first` corresponds to inputs with shape - `(batch, time, channels, ...)`. It defaults to the `image_data_format` - value found in your Keras config file at `~/.keras/keras.json`. If you - never set it, then it will be "channels_last". + `(batch, time, channels, ...)`. 
When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` diff --git a/keras/layers/rnn/conv_lstm3d.py b/keras/layers/rnn/conv_lstm3d.py index e8c37ec5ea76..1488faae72c5 100644 --- a/keras/layers/rnn/conv_lstm3d.py +++ b/keras/layers/rnn/conv_lstm3d.py @@ -44,9 +44,10 @@ class ConvLSTM3D(ConvLSTM): `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch, time, ..., channels)` while `channels_first` corresponds to inputs with shape - `(batch, time, channels, ...)`. It defaults to the `image_data_format` - value found in your Keras config file at `~/.keras/keras.json`. If you - never set it, then it will be "channels_last". + `(batch, time, channels, ...)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. dilation_rate: An integer or tuple/list of n integers, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` diff --git a/keras/layers/rnn/gru.py b/keras/layers/rnn/gru.py index b06f93051539..855b2561c29a 100644 --- a/keras/layers/rnn/gru.py +++ b/keras/layers/rnn/gru.py @@ -507,17 +507,17 @@ class GRU(DropoutRNNCellMixin, RNN, base_layer.BaseRandomLayer): Call arguments: inputs: A 3D tensor, with shape `[batch, timesteps, feature]`. mask: Binary tensor of shape `[samples, timesteps]` indicating whether - a given timestep should be masked (optional, defaults to `None`). + a given timestep should be masked (optional). An individual `True` entry indicates that the corresponding timestep should be utilized, while a `False` entry indicates that the - corresponding timestep should be ignored. + corresponding timestep should be ignored. Defaults to `None`. training: Python boolean indicating whether the layer should behave in training mode or in inference mode. This argument is passed to the cell when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used (optional, defaults to `None`). + `recurrent_dropout` is used (optional). Defaults to `None`. initial_state: List of initial state tensors to be passed to the first - call of the cell (optional, defaults to `None` which causes creation - of zero-filled initial state tensors). + call of the cell (optional, `None` causes creation + of zero-filled initial state tensors). Defaults to `None`. """ def __init__( diff --git a/keras/layers/rnn/legacy_cells.py b/keras/layers/rnn/legacy_cells.py index 1df5c47d73df..ca2431cb67a9 100644 --- a/keras/layers/rnn/legacy_cells.py +++ b/keras/layers/rnn/legacy_cells.py @@ -186,7 +186,8 @@ def __call__(self, inputs, state, scope=None): `2-D Tensor` with shape `[batch_size, self.state_size]`. Otherwise, if `self.state_size` is a tuple of integers, this should be a tuple with shapes `[batch_size, s] for s in self.state_size`. - scope: VariableScope for the created subgraph; defaults to class name. + scope: VariableScope for the created subgraph; None uses class name. + Defaults to `None`. 
Returns: A pair containing: diff --git a/keras/layers/rnn/lstm.py b/keras/layers/rnn/lstm.py index 93e3e7cc200c..47ae51f7e6a5 100644 --- a/keras/layers/rnn/lstm.py +++ b/keras/layers/rnn/lstm.py @@ -480,17 +480,17 @@ class LSTM(DropoutRNNCellMixin, RNN, base_layer.BaseRandomLayer): Call arguments: inputs: A 3D tensor with shape `[batch, timesteps, feature]`. mask: Binary tensor of shape `[batch, timesteps]` indicating whether - a given timestep should be masked (optional, defaults to `None`). + a given timestep should be masked (optional). An individual `True` entry indicates that the corresponding timestep should be utilized, while a `False` entry indicates that the - corresponding timestep should be ignored. + corresponding timestep should be ignored. Defaults to `None`. training: Python boolean indicating whether the layer should behave in training mode or in inference mode. This argument is passed to the cell when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used (optional, defaults to `None`). + `recurrent_dropout` is used (optional). Defaults to `None`. initial_state: List of initial state tensors to be passed to the first - call of the cell (optional, defaults to `None` which causes creation - of zero-filled initial state tensors). + call of the cell (optional, `None` causes creation + of zero-filled initial state tensors). Defaults to `None`. """ def __init__( diff --git a/keras/legacy_tf_layers/base.py b/keras/legacy_tf_layers/base.py index e2e925dba0e1..fa2beea2f2d1 100644 --- a/keras/legacy_tf_layers/base.py +++ b/keras/legacy_tf_layers/base.py @@ -365,8 +365,8 @@ def add_weight( or "non_trainable_variables" (e.g. BatchNorm mean, stddev). Note, if the current variable scope is marked as non-trainable then this parameter is ignored and any added variables are also - marked as non-trainable. `trainable` defaults to `True` unless - `synchronization` is set to `ON_READ`. + marked as non-trainable. `trainable` becomes `True` unless + `synchronization` is set to `ON_READ`. Defaults to `True`. constraint: constraint instance (callable). use_resource: Whether to use `ResourceVariable`. synchronization: Indicates when a distributed a variable will be diff --git a/keras/legacy_tf_layers/variable_scope_shim.py b/keras/legacy_tf_layers/variable_scope_shim.py index 63c7fd6e81b7..5eaf3f2fc49e 100644 --- a/keras/legacy_tf_layers/variable_scope_shim.py +++ b/keras/legacy_tf_layers/variable_scope_shim.py @@ -215,7 +215,7 @@ def get_variable( Args: name: The name of the new or existing variable. shape: Shape of the new or existing variable. - dtype: Type of the new or existing variable (defaults to `DT_FLOAT`). + dtype: Type of the new or existing variable. Defaults to `DT_FLOAT`. initializer: Initializer for the variable. regularizer: A (Tensor -> Tensor or None) function; the result of applying it on a newly created variable will be added to the @@ -226,8 +226,8 @@ def get_variable( always forced to be False. trainable: If `True` also add the variable to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable` - defaults to `True`, unless `synchronization` is set to `ON_READ`, in - which case it defaults to `False`. + becomes `True`, unless `synchronization` is set to `ON_READ`, in + which case it becomes `False`. Defaults to `True`. collections: List of graph collections keys to add the `Variable` to. Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`). 
caching_device: Optional device string or function describing where @@ -235,7 +235,7 @@ def get_variable( Variable's device. If not `None`, caches on another device. Typical use is to cache on the device where the Ops using the `Variable` reside, to deduplicate copying through `Switch` and other - conditional statements. Defaults to None. + conditional statements. Defaults to `None`. partitioner: Optional callable that accepts a fully defined `TensorShape` and dtype of the `Variable` to be created, and returns a list of partitions for each axis (currently only one axis can be diff --git a/keras/losses.py b/keras/losses.py index 5a1df59b13fe..e9c83bbeff20 100644 --- a/keras/losses.py +++ b/keras/losses.py @@ -69,15 +69,15 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None): """Initializes `Loss` class. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. Defaults to `AUTO`. name: Optional name for the instance. """ losses_utils.ReductionV2.validate(reduction) @@ -231,15 +231,15 @@ def __init__( Args: fn: The loss function to wrap, with signature `fn(y_true, y_pred, **kwargs)`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. **kwargs: The keyword arguments that are passed on to `fn`. """ @@ -343,15 +343,15 @@ def __init__( """Initializes `MeanSquaredError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. 
Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'mean_squared_error'. """ @@ -404,15 +404,15 @@ def __init__( """Initializes `MeanAbsoluteError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'mean_absolute_error'. """ @@ -471,15 +471,15 @@ def __init__( """Initializes `MeanAbsolutePercentageError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'mean_absolute_percentage_error'. """ @@ -535,15 +535,15 @@ def __init__( """Initializes `MeanSquaredLogarithmicError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'mean_squared_logarithmic_error'. """ @@ -638,15 +638,15 @@ def __init__( heavier smoothing. axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. 
For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Name for the op. Defaults to 'binary_crossentropy'. """ super().__init__( @@ -784,15 +784,15 @@ class BinaryFocalCrossentropy(LossFunctionWrapper): smoothing. axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Name for the op. Defaults to 'binary_focal_crossentropy'. """ @@ -900,15 +900,15 @@ def __init__( `0.9 + 0.1 / num_classes` for target labels. axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'categorical_crossentropy'. """ @@ -1013,15 +1013,15 @@ class CategoricalFocalCrossentropy(LossFunctionWrapper): `0.9 + 0.1 / num_classes` for target labels. axis: The axis along which to compute crossentropy (the features axis). Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. 
When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'categorical_focal_crossentropy'. """ @@ -1126,15 +1126,15 @@ def __init__( problems featuring a "void" class (commonly -1 or 255) in segmentation maps. By default (`ignore_class=None`), all classes are considered. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'sparse_categorical_crossentropy'. """ @@ -1192,15 +1192,15 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="hinge"): """Initializes `Hinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'hinge'. """ super().__init__(hinge, name=name, reduction=reduction) @@ -1253,15 +1253,15 @@ def __init__( """Initializes `SquaredHinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'squared_hinge'. 
""" super().__init__(squared_hinge, name=name, reduction=reduction) @@ -1312,15 +1312,15 @@ def __init__( """Initializes `CategoricalHinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'categorical_hinge'. """ super().__init__(categorical_hinge, name=name, reduction=reduction) @@ -1368,15 +1368,15 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="poisson"): """Initializes `Poisson` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'poisson'. """ super().__init__(poisson, name=name, reduction=reduction) @@ -1427,15 +1427,15 @@ def __init__( """Initializes `LogCosh` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'log_cosh'. """ super().__init__(log_cosh, name=name, reduction=reduction) @@ -1487,15 +1487,15 @@ def __init__( """Initializes `KLDivergence` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. 
When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'kl_divergence'. """ super().__init__(kl_divergence, name=name, reduction=reduction) @@ -1556,15 +1556,15 @@ def __init__( Args: delta: A float, the point where the Huber loss function changes from a quadratic to linear. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + `AUTO` indicates that the reduction option will + be determined by the usage context. For almost all cases this defaults + to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. Defaults to 'huber_loss'. """ super().__init__(huber, name=name, reduction=reduction, delta=delta) @@ -2792,14 +2792,14 @@ class CosineSimilarity(LossFunctionWrapper): axis: The axis along which the cosine similarity is computed (the features axis). Defaults to `-1`. reduction: Type of `tf.keras.losses.Reduction` to apply to loss. - Default value is `AUTO`. `AUTO` indicates that the reduction option will + `AUTO` indicates that the reduction option will be determined by the usage context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When used under a `tf.distribute.Strategy`, except via `Model.compile()` and `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this custom training [tutorial]( https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. + for more details. Defaults to `AUTO`. name: Optional name for the instance. """ diff --git a/keras/models/cloning.py b/keras/models/cloning.py index b490777fd81b..6c71fde32993 100644 --- a/keras/models/cloning.py +++ b/keras/models/cloning.py @@ -474,12 +474,13 @@ def clone_model(model, input_tensors=None, clone_function=None): model (except `InputLayer` instances). It takes as argument the layer instance to be cloned, and returns the corresponding layer instance to be used in the model copy. If unspecified, this callable - defaults to the following serialization/deserialization function: + becomes the following serialization/deserialization function: `lambda layer: layer.__class__.from_config(layer.get_config())`. By passing a custom callable, you can customize your copy of the model, e.g. by wrapping certain layers of interest (you might want to replace all `LSTM` instances with equivalent `Bidirectional(LSTM(...))` instances, for example). + Defaults to `None`. 
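(To make the `clone_function` contract concrete, here is a sketch of the `Bidirectional(LSTM(...))` substitution suggested above; the model and layer sizes are purely illustrative.)

```python
import tensorflow as tf

def wrap_lstms(layer):
    # Replace every LSTM with a Bidirectional wrapper around a fresh copy.
    if isinstance(layer, tf.keras.layers.LSTM):
        return tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM.from_config(layer.get_config())
        )
    # Otherwise mirror the documented default behaviour.
    return layer.__class__.from_config(layer.get_config())

model = tf.keras.Sequential([
    tf.keras.Input(shape=(10, 8)),
    tf.keras.layers.LSTM(16),
    tf.keras.layers.Dense(1),
])
clone = tf.keras.models.clone_model(model, clone_function=wrap_lstms)
```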
 Returns:
     An instance of `Model` reproducing the behavior
diff --git a/keras/models/sharpness_aware_minimization.py b/keras/models/sharpness_aware_minimization.py
index 33e01cd59e01..70543101cd99 100644
--- a/keras/models/sharpness_aware_minimization.py
+++ b/keras/models/sharpness_aware_minimization.py
@@ -41,11 +41,11 @@ class SharpnessAwareMinimization(Model):
     Args:
       model: `tf.keras.Model` instance. The inner model that does the
         forward-backward pass.
-      rho: float, defaults to 0.05. The gradients scaling factor.
-      num_batch_splits: int, defaults to None. The number of mini batches to
+      rho: float. The gradients scaling factor. Defaults to `0.05`.
+      num_batch_splits: int. The number of mini batches to
         split into from each data batch. If None, batches are not split into
-        sub-batches.
-      name: string, defaults to None. The name of the SAM model.
+        sub-batches. Defaults to `None`.
+      name: string. The name of the SAM model. Defaults to `None`.

     Reference:
       [Pierre Foret et al., 2020](https://arxiv.org/abs/2010.01412)
diff --git a/keras/optimizers/optimizer.py b/keras/optimizers/optimizer.py
index f47abe0c34be..c8f5ced6e32b 100644
--- a/keras/optimizers/optimizer.py
+++ b/keras/optimizers/optimizer.py
@@ -484,12 +484,12 @@ def add_variable_from_reference(
       variable_name: String. The name prefix of the optimizer variable to be
-        created. The create variables name will follow the pattern
-        `{variable_name}/{model_variable.name}`, e.g., `momemtum/dense_1`.
-      shape: List or Tuple, defaults to None. The shape of the optimizer
+        created. The created variable's name will follow the pattern
+        `{variable_name}/{model_variable.name}`, e.g., `momentum/dense_1`.
+      shape: List or Tuple. The shape of the optimizer
         variable to be created. If None, the created variable will have the
-        same shape as `model_variable`.
-      initial_value: A Tensor, or Python object convertible to a Tensor,
-        defaults to None. The initial value of the optimizer variable, if
-        None, the initial value will be default to 0.
+        same shape as `model_variable`. Defaults to `None`.
+      initial_value: A Tensor, or Python object convertible to a Tensor.
+        The initial value of the optimizer variable. If `None`, the
+        initial value defaults to `0`. Defaults to `None`.

     Returns:
       An optimizer variable.
@@ -614,8 +614,9 @@ def apply_gradients(self, grads_and_vars, name=None):
     Args:
       grads_and_vars: List of `(gradient, variable)` pairs.
-      name: string, defaults to None. The name of the namescope to
+      name: string. The name of the namescope to
        use when creating variables. If None, `self.name` will be used.
+        Defaults to `None`.

     Returns:
       A `tf.Variable`, representing the current iteration.
@@ -846,24 +847,24 @@ def load_own_variables(self, store):
 base_optimizer_keyword_args = """name: String. The name to use
       for momentum accumulator weights created by the optimizer.
-    weight_decay: Float, defaults to None. If set, weight decay is applied.
+    weight_decay: Float. If set, weight decay is applied. Defaults to `None`.
     clipnorm: Float. If set, the gradient of each weight is individually
       clipped so that its norm is no higher than this value.
     clipvalue: Float. If set, the gradient of each weight is clipped to be
      no higher than this value.
    global_clipnorm: Float. If set, the gradient of all weights is clipped
      so that their global norm is no higher than this value.
-    use_ema: Boolean, defaults to False. If True, exponential moving average
+    use_ema: Boolean. If True, exponential moving average
      (EMA) is applied.
EMA consists of computing an exponential moving
      average of the weights of the model (as the weight values change after
      each training batch), and periodically overwriting the weights with
-      their moving average.
-    ema_momentum: Float, defaults to 0.99. Only used if `use_ema=True`.
+      their moving average. Defaults to `False`.
+    ema_momentum: Float. Only used if `use_ema=True`.
      This is the momentum to use when computing the EMA of the model's
      weights: `new_average = ema_momentum * old_average + (1 - ema_momentum) *
-      current_variable_value`.
-    ema_overwrite_frequency: Int or None, defaults to None. Only used if
+      current_variable_value`. Defaults to `0.99`.
+    ema_overwrite_frequency: Int or None. Only used if
      `use_ema=True`. Every `ema_overwrite_frequency` steps of iterations,
      we overwrite the model variable by its moving average. If None, the optimizer
@@ -873,10 +874,11 @@ def load_own_variables(self, store):
      (which updates the model variables in-place). When using the built-in
      `fit()` training loop, this happens automatically after the last epoch,
-      and you don't need to do anything.
-    jit_compile: Boolean, defaults to True.
+      and you don't need to do anything. Defaults to `None`.
+    jit_compile: Boolean.
      If True, the optimizer will use XLA compilation. If no GPU device is
      found, this flag will be ignored.
+      Defaults to `True`.
    mesh: optional `tf.experimental.dtensor.Mesh` instance. When provided,
      the optimizer will be run in DTensor mode, e.g. state tracking variable
      will be a DVariable, and aggregation/reduction will
@@ -1202,8 +1204,9 @@ def apply_gradients(
     Args:
       grads_and_vars: List of `(gradient, variable)` pairs.
-      name: string, defaults to None. The name of the namescope to
+      name: string. The name of the namescope to
        use when creating variables. If None, `self.name` will be used.
+        Defaults to `None`.
       skip_gradients_aggregation: If true, gradients aggregation will not be
         performed inside optimizer. Usually this arg is set to True when you
         write custom code aggregating gradients outside the optimizer.
diff --git a/keras/optimizers/optimizer_v1.py b/keras/optimizers/optimizer_v1.py
index 5cb3544ecf9e..5c36400760c1 100644
--- a/keras/optimizers/optimizer_v1.py
+++ b/keras/optimizers/optimizer_v1.py
@@ -248,7 +248,7 @@ class RMSprop(Optimizer):
         lr: float >= 0. Learning rate.
         rho: float >= 0.
         epsilon: float >= 0. Fuzz factor.
-            If `None`, defaults to `backend.epsilon()`.
+            If `None`, becomes `backend.epsilon()`. Defaults to `None`.
         decay: float >= 0. Learning rate decay over each update.
     """
@@ -408,12 +408,14 @@ class Adadelta(Optimizer):
     at their default values.

     Arguments:
-        lr: float >= 0. Initial learning rate, defaults to 1.
+        lr: float >= 0. Initial learning rate.
             It is recommended to leave it at the default value.
+            Defaults to `1`.
         rho: float >= 0. Adadelta decay factor, corresponding to fraction of
             gradient to keep at each time step.
         epsilon: float >= 0. Fuzz factor.
-            If `None`, defaults to `backend.epsilon()`.
+            If `None`, becomes `backend.epsilon()`.
+            Defaults to `None`.
         decay: float >= 0. Initial learning rate decay.

     References:
@@ -505,7 +507,8 @@ class Adam(Optimizer):
         beta_1: float, 0 < beta < 1. Generally close to 1.
         beta_2: float, 0 < beta < 1. Generally close to 1.
         epsilon: float >= 0. Fuzz factor.
-            If `None`, defaults to `backend.epsilon()`.
+            If `None`, becomes `backend.epsilon()`.
+            Defaults to `None`.
         decay: float >= 0. Learning rate decay over each update.
         amsgrad: boolean. Whether to apply the AMSGrad variant of this
             algorithm from the paper "On the Convergence of Adam and Beyond".
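(For reference, the keyword arguments standardised above are accepted by the built-in optimizers. A sketch using `SGD`, spelling out the documented defaults alongside `use_ema=True`; the learning rate is illustrative.)

```python
import tensorflow as tf

optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    weight_decay=None,             # no weight decay unless set
    use_ema=True,                  # maintain an EMA of the model weights
    ema_momentum=0.99,             # new_average = 0.99 * old + 0.01 * current
    ema_overwrite_frequency=None,  # overwrite only at the end of fit()
    jit_compile=True,              # XLA compilation where available
)
```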
@@ -624,7 +627,8 @@ class Adamax(Optimizer):
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
         epsilon: float >= 0. Fuzz factor.
-            If `None`, defaults to `backend.epsilon()`.
+            If `None`, becomes `backend.epsilon()`.
+            Defaults to `None`.
         decay: float >= 0. Learning rate decay over each update.
     """
@@ -727,7 +731,8 @@ class Nadam(Optimizer):
         lr: float >= 0. Learning rate.
         beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
         epsilon: float >= 0. Fuzz factor.
-            If `None`, defaults to `backend.epsilon()`.
+            If `None`, becomes `backend.epsilon()`.
+            Defaults to `None`.
     """

 def __init__(
diff --git a/keras/optimizers/rmsprop.py b/keras/optimizers/rmsprop.py
index b2e5f932ef5f..faaff7c3f1c3 100644
--- a/keras/optimizers/rmsprop.py
+++ b/keras/optimizers/rmsprop.py
@@ -49,8 +49,9 @@ class RMSprop(optimizer.Optimizer):
       or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
       Defaults to `0.001`.
     rho: float. Discounting factor for the old gradients. Defaults to `0.9`.
-    momentum: float, defaults to 0.0. If not 0.0., the optimizer tracks the
-      momentum value, with a decay rate equals to `1 - momentum`.
+    momentum: float. If not `0.0`, the optimizer tracks the
+      momentum value, with a decay rate equal to `1 - momentum`.
+      Defaults to `0.0`.
     epsilon: A small constant for numerical stability. This epsilon is
       "epsilon hat" in the Kingma and Ba paper (in the formula just before
       Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
diff --git a/keras/preprocessing/image.py b/keras/preprocessing/image.py
index 266f60e3e119..686bff57c31f 100644
--- a/keras/preprocessing/image.py
+++ b/keras/preprocessing/image.py
@@ -1227,7 +1227,7 @@ class ImageDataGenerator:
     vertical_flip: Boolean. Randomly flip inputs vertically.
     rescale: rescaling factor. If None or 0, no rescaling is applied,
       otherwise we multiply the data by the value provided
-      (after applying all other transformations). Defaults to None.
+      (after applying all other transformations). Defaults to `None`.
     preprocessing_function: function that will be applied on each input.
       The function will run after the image is resized and augmented.
       The function should take one argument: one image (Numpy tensor with
@@ -1236,9 +1236,9 @@ class ImageDataGenerator:
      "channels_last". "channels_last" mode means that the images should
      have shape `(samples, height, width, channels)`, "channels_first" mode
      means that the images should have shape `(samples, channels, height,
-      width)`. It defaults to the `image_data_format` value found in your
-      Keras config file at `~/.keras/keras.json`. If you never set it, then
-      it will be "channels_last".
+      width)`. When unspecified, uses the `image_data_format` value found
+      in your Keras config file at `~/.keras/keras.json` (if it exists),
+      else "channels_last". Defaults to "channels_last".
     validation_split: Float. Fraction of images reserved for validation
       (strictly between 0 and 1).
     dtype: Dtype to use for the generated arrays.
@@ -1586,8 +1586,8 @@ def flow_from_directory(
         in the generator. See [this script](
         https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d)
         for more details.
-      target_size: Tuple of integers `(height, width)`, defaults to `(256,
-        256)`. The dimensions to which all images found will be resized.
+      target_size: Tuple of integers `(height, width)`. The dimensions to
+        which all images found will be resized. Defaults to `(256, 256)`.
       color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb".
         Whether the images will be converted to have 1, 3, or 4 channels.
classes: Optional list of class subdirectories (e.g. `['dogs',
diff --git a/keras/preprocessing/text.py b/keras/preprocessing/text.py
index f47d4068059f..7a5028c36387 100644
--- a/keras/preprocessing/text.py
+++ b/keras/preprocessing/text.py
@@ -154,11 +154,11 @@ def hashing_trick(
     Args:
         text: Input text (string).
         n: Dimension of the hashing space.
-        hash_function: defaults to python `hash` function, can be 'md5' or
-            any function that takes in input a string and returns a int.
+        hash_function: when `None`, uses the python `hash` function; can be 'md5'
+            or any function that takes a string as input and returns an int.
             Note that 'hash' is not a stable hashing function, so
             it is not consistent across different runs, while 'md5'
-            is a stable hashing function.
+            is a stable hashing function. Defaults to `None`.
         filters: list (or concatenation) of characters to filter out, such as
             punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
             includes basic punctuation, tabs, and newlines.
diff --git a/keras/saving/legacy/save.py b/keras/saving/legacy/save.py
index f6b1c0ece4c8..b41131d8f044 100644
--- a/keras/saving/legacy/save.py
+++ b/keras/saving/legacy/save.py
@@ -317,7 +317,7 @@ def save_weights(
         target location, or provide the user with a manual prompt.
       save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
         '.keras' will default to HDF5 if `save_format` is `None`.
-        Otherwise `None` defaults to 'tf'.
+        Otherwise, `None` becomes 'tf'. Defaults to `None`.
       options: Optional `tf.train.CheckpointOptions` object that specifies
         options for saving weights.
diff --git a/keras/saving/legacy/saved_model/json_utils.py b/keras/saving/legacy/saved_model/json_utils.py
index 6d133bb1c41f..05b0e285be75 100644
--- a/keras/saving/legacy/saved_model/json_utils.py
+++ b/keras/saving/legacy/saved_model/json_utils.py
@@ -95,8 +95,8 @@ def _decode_helper(
     Args:
       obj: A decoded dictionary that may represent an object.
-      deserialize: Boolean, defaults to False. When True, deserializes any Keras
-        objects found in `obj`.
+      deserialize: Boolean. When True, deserializes any Keras
+        objects found in `obj`. Defaults to `False`.
       module_objects: A dictionary of built-in objects to look the name up in.
         Generally, `module_objects` is provided by midlevel library
         implementers.
diff --git a/keras/saving/object_registration.py b/keras/saving/object_registration.py
index a64b21f3313f..1a97880b4af7 100644
--- a/keras/saving/object_registration.py
+++ b/keras/saving/object_registration.py
@@ -128,7 +128,7 @@ class MyDense(keras.layers.Dense):
     Args:
       package: The package that this class belongs to. This is used for the
-        `key` (which is `"package>name"`) to idenfify the class. Note that this
+        `key` (which is `"package>name"`) to identify the class. Note that this
         is the first argument passed into the decorator.
       name: The name to serialize this class under in this package. If not
         provided or `None`, the class' name will be used (note that this is the
diff --git a/keras/saving/serialization_lib.py b/keras/saving/serialization_lib.py
index 02e4965a6f5f..ee051d102113 100644
--- a/keras/saving/serialization_lib.py
+++ b/keras/saving/serialization_lib.py
@@ -412,7 +412,7 @@ def deserialize_keras_object(
     `keras.utils.register_keras_serializable(package, name)` API. The key has
     the format of '{package}>{name}', where `package` and `name` are the
     arguments passed to `register_keras_serializable()`. If `name` is not
-    provided, it defaults to the class name.
+    provided, it uses the class name. If `registered_name` successfully
If `registered_name` successfully resolves to a class (that was registered), the `class_name` and `config` values in the dict will not be used. `registered_name` is only used for non-built-in classes. diff --git a/keras/utils/data_utils.py b/keras/utils/data_utils.py index dc02c2854045..21f48cb8c237 100644 --- a/keras/utils/data_utils.py +++ b/keras/utils/data_utils.py @@ -247,7 +247,7 @@ def get_file( The default `'auto'` corresponds to `['tar', 'zip']`. None or an empty list will return no matches found. cache_dir: Location to store cached files, when None it - defaults to the default directory `~/.keras/`. + defaults to `~/.keras/`. Returns: Path to the downloaded file. @@ -1063,14 +1063,16 @@ def pad_sequences( maxlen: Optional Int, maximum length of all sequences. If not provided, sequences will be padded to the length of the longest individual sequence. - dtype: (Optional, defaults to `"int32"`). Type of the output sequences. + dtype: (Optional). Type of the output sequences. To pad sequences with variable length strings, you can use `object`. - padding: String, "pre" or "post" (optional, defaults to `"pre"`): - pad either before or after each sequence. - truncating: String, "pre" or "post" (optional, defaults to `"pre"`): + Defaults to `"int32"`. + padding: String, "pre" or "post" (optional): + pad either before or after each sequence. Defaults to `"pre"`. + truncating: String, "pre" or "post" (optional): remove values from sequences larger than `maxlen`, either at the beginning or at the end of the sequences. - value: Float or String, padding value. (Optional, defaults to 0.) + Defaults to `"pre"`. + value: Float or String, padding value. (Optional). Defaults to `0.`. Returns: Numpy array with shape `(len(sequences), maxlen)` diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 444b25670ca8..35d234d62556 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -41,11 +41,11 @@ def split_dataset( left_size: If float (in the range `[0, 1]`), it signifies the fraction of the data to pack in the left dataset. If integer, it signifies the number of samples to pack in the left dataset. If - `None`, it defaults to the complement to `right_size`. + `None`, it uses the complement to `right_size`. Defaults to `None`. right_size: If float (in the range `[0, 1]`), it signifies the fraction of the data to pack in the right dataset. If integer, it signifies the number of samples to pack in the right dataset. If - `None`, it defaults to the complement to `left_size`. + `None`, it uses the complement to `left_size`. Defaults to `None`. shuffle: Boolean, whether to shuffle the data before splitting it. seed: A random seed for shuffling. diff --git a/keras/utils/generic_utils.py b/keras/utils/generic_utils.py index 3d8316833019..ba58673eec43 100644 --- a/keras/utils/generic_utils.py +++ b/keras/utils/generic_utils.py @@ -187,7 +187,7 @@ def update(self, current, values=None, finalize=None): as-is. Else, an average of the metric over time will be displayed. finalize: Whether this is the last update for the progress bar. If - `None`, defaults to `current >= self.target`. + `None`, uses `current >= self.target`. Defaults to `None`. 
""" if finalize is None: if self.target is None: diff --git a/keras/utils/image_utils.py b/keras/utils/image_utils.py index d3190f51aaf6..94f4ebc2e631 100644 --- a/keras/utils/image_utils.py +++ b/keras/utils/image_utils.py @@ -218,12 +218,12 @@ def array_to_img(x, data_format=None, scale=True, dtype=None): data_format: Image data format, can be either `"channels_first"` or `"channels_last"`. None means the global setting `tf.keras.backend.image_data_format()` is used (unless you - changed it, it defaults to `"channels_last"`). Defaults to `None`. + changed it, it uses `"channels_last"`). Defaults to `None`. scale: Whether to rescale the image such that minimum and maximum values are 0 and 255 respectively. Defaults to `True`. dtype: Dtype to use. None makes the global setting `tf.keras.backend.floatx()` to be used (unless you changed it, it - defaults to `"float32"`). Defaults to `None`. + uses `"float32"`). Defaults to `None`. Returns: A PIL Image instance. @@ -300,10 +300,10 @@ def img_to_array(img, data_format=None, dtype=None): data_format: Image data format, can be either `"channels_first"` or `"channels_last"`. None means the global setting `tf.keras.backend.image_data_format()` is used (unless you - changed it, it defaults to `"channels_last"`). Defaults to `None`. + changed it, it uses `"channels_last"`). Defaults to `None`. dtype: Dtype to use. None makes the global setting `tf.keras.backend.floatx()` to be used (unless you changed it, it - defaults to `"float32"`). Defaults to `None`. + uses `"float32"`). Defaults to `None`. Returns: A 3D Numpy array. diff --git a/keras/utils/layer_utils.py b/keras/utils/layer_utils.py index 31d42b781f4d..c15434667043 100644 --- a/keras/utils/layer_utils.py +++ b/keras/utils/layer_utils.py @@ -335,11 +335,12 @@ def print_summary( It will be called on each line of the summary. You can set it to a custom function in order to capture the string summary. - It defaults to `print` (prints to stdout). + When `None`, uses `print` (prints to stdout). + Defaults to `None`. expand_nested: Whether to expand the nested models. - If not provided, defaults to `False`. + Defaults to `False`. show_trainable: Whether to show if a layer is trainable. - If not provided, defaults to `False`. + Defaults to `False`. layer_range: List or tuple containing two strings, the starting layer name and ending layer name (both inclusive), indicating the range of layers to be printed in the summary. The diff --git a/keras/utils/losses_utils.py b/keras/utils/losses_utils.py index 2630326bcf93..656922224f39 100644 --- a/keras/utils/losses_utils.py +++ b/keras/utils/losses_utils.py @@ -32,11 +32,10 @@ class ReductionV2: Contains the following values: * `AUTO`: Indicates that the reduction option will be determined by the - usage context. For almost all cases this defaults to - `SUM_OVER_BATCH_SIZE`. When used with `tf.distribute.Strategy`, outside of - built-in training loops such as `tf.keras` `compile` and `fit`, we expect - reduction value to be `SUM` or `NONE`. Using `AUTO` in that case will - raise an error. + usage context. For almost all cases this uses `SUM_OVER_BATCH_SIZE`. + When used with `tf.distribute.Strategy`, outside of built-in training + loops such as `tf.keras` `compile` and `fit`, we expect reduction + value to be `SUM` or `NONE`. Using `AUTO` in that case will raise an error. * `NONE`: No **additional** reduction is applied to the output of the wrapped loss function. 
When non-scalar losses are returned to Keras functions like `fit`/`evaluate`, the unreduced vector loss is passed to