
Commit 304bb3d

hertschuh authored and tensorflower-gardener committed
Fix for Keras Softmax layer gradient underflow.
tensorflow/tensorflow#60314

The `tf.keras.activations.softmax` function, the `tf.keras.backend.softmax` function, and the `tf.keras.layers.Softmax` layer now behave consistently and save the logits in `_keras_logits`. Previously, only the activation function had this behavior. Caching the logits prevents the computation of the gradient of the crossentropy from underflowing.

The same fix was applied to the `tf.keras.backend.sigmoid` function and the `tf.keras.layers.Sigmoid` layer.

One behavior change is that `tf.keras.backend.softmax` and `tf.keras.layers.Softmax` no longer accept inputs of rank 1.

PiperOrigin-RevId: 536456175
1 parent ddf134e commit 304bb3d
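
For background, here is a minimal sketch (not part of this commit) of the failure mode being fixed: when the softmax saturates, the probability of the true class rounds to zero in float32, so a crossentropy computed from the probabilities back-propagates a zero gradient, while a crossentropy computed from the logits keeps the correct gradient.

```python
# Minimal illustration (assumes TF 2.x; not code from this commit).
import tensorflow as tf

logits = tf.constant([[-100.0, 0.0, 100.0]])
labels = tf.constant([[1.0, 0.0, 0.0]])  # true class probability underflows to 0

with tf.GradientTape(persistent=True) as tape:
    tape.watch(logits)
    probs = tf.nn.softmax(logits)
    # Crossentropy from probabilities: log() sees a value that already
    # underflowed to 0, so the gradient through the softmax vanishes.
    from_probs = -tf.reduce_sum(labels * tf.math.log(probs + 1e-7), axis=-1)
    # Crossentropy from logits: numerically stable log-sum-exp form.
    from_logits = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

print(tape.gradient(from_probs, logits).numpy())   # [[0. 0. 0.]] -- gradient underflowed
print(tape.gradient(from_logits, logits).numpy())  # ~[[-1. 0. 1.]] -- correct (probs - labels)
```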

File tree

3 files changed: +26 −26 lines


keras/activations.py

Lines changed: 2 additions & 20 deletions

@@ -84,22 +84,7 @@ def softmax(x, axis=-1):
     >>> layer = tf.keras.layers.Dense(32,
     ... activation=tf.keras.activations.softmax)
     """
-    if x.shape.rank <= 1:
-        raise ValueError(
-            f"Cannot apply softmax to a tensor that is 1D. Received input: {x}"
-        )
-
-    if isinstance(axis, int):
-        output = tf.nn.softmax(x, axis=axis)
-    else:
-        # nn.softmax does not support tuple axis.
-        numerator = tf.exp(x - tf.reduce_max(x, axis=axis, keepdims=True))
-        denominator = tf.reduce_sum(numerator, axis=axis, keepdims=True)
-        output = numerator / denominator
-
-    # Cache the logits to use for crossentropy loss.
-    output._keras_logits = x
-    return output
+    return backend.softmax(x, axis)


 @keras_export("keras.activations.elu")

@@ -412,10 +397,7 @@ def sigmoid(x):
     Returns:
         Tensor with the sigmoid activation: `1 / (1 + exp(-x))`.
     """
-    output = tf.sigmoid(x)
-    # Cache the logits to use for crossentropy loss.
-    output._keras_logits = x
-    return output
+    return backend.sigmoid(x)


 @keras_export("keras.activations.exponential")
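
With the activation now delegating to the backend, both entry points take the same path and tag their output with the logits. A quick illustrative check, assuming a build with this change applied:

```python
# Illustrative only; assumes this commit is applied.
import tensorflow as tf

x = tf.constant([[1.0, 2.0, 3.0]])

out_act = tf.keras.activations.softmax(x)
out_backend = tf.keras.backend.softmax(x)

print(hasattr(out_act, "_keras_logits"))      # True (unchanged behavior)
print(hasattr(out_backend, "_keras_logits"))  # True (new with this commit)
```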

keras/backend.py

Lines changed: 20 additions & 2 deletions

@@ -5441,7 +5441,22 @@ def softmax(x, axis=-1):
     Returns:
         A tensor.
     """
-    return tf.nn.softmax(x, axis=axis)
+    if x.shape.rank <= 1:
+        raise ValueError(
+            f"Cannot apply softmax to a tensor that is 1D. Received input: {x}"
+        )
+
+    if isinstance(axis, int):
+        output = tf.nn.softmax(x, axis=axis)
+    else:
+        # nn.softmax does not support tuple axis.
+        numerator = tf.exp(x - tf.reduce_max(x, axis=axis, keepdims=True))
+        denominator = tf.reduce_sum(numerator, axis=axis, keepdims=True)
+        output = numerator / denominator
+
+    # Cache the logits to use for crossentropy loss.
+    output._keras_logits = x
+    return output


 @keras_export("keras.backend.softplus")

@@ -5899,7 +5914,10 @@ def sigmoid(x):
     Returns:
         A tensor.
     """
-    return tf.math.sigmoid(x)
+    output = tf.sigmoid(x)
+    # Cache the logits to use for crossentropy loss.
+    output._keras_logits = x
+    return output


 @keras_export("keras.backend.hard_sigmoid")
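
For context, the cached attribute is consumed on the loss side. The sketch below is a simplified, assumed rendering of how a crossentropy helper can pick up `_keras_logits` and switch to the stable logits path; it is not code from this diff.

```python
import tensorflow as tf


def categorical_crossentropy_sketch(target, output, from_logits=False, axis=-1):
    """Simplified sketch of a _keras_logits-aware crossentropy (assumed, not from this diff)."""
    if hasattr(output, "_keras_logits"):
        # Recover the pre-softmax logits cached by backend.softmax / activations.softmax.
        output = output._keras_logits
        from_logits = True

    if from_logits:
        return tf.nn.softmax_cross_entropy_with_logits(
            labels=target, logits=output, axis=axis
        )

    # Probability path: clip to avoid log(0), then compute -sum(target * log(output)).
    output = tf.clip_by_value(output, 1e-7, 1.0 - 1e-7)
    return -tf.reduce_sum(target * tf.math.log(output), axis=axis)
```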

keras/layers/activation/softmax.py

Lines changed: 4 additions & 4 deletions

@@ -51,13 +51,13 @@ class Softmax(Layer):

     Example without mask:

-    >>> inp = np.asarray([1., 2., 1.])
+    >>> inp = np.asarray([[1., 2., 1.]])
     >>> layer = tf.keras.layers.Softmax()
     >>> layer(inp).numpy()
-    array([0.21194157, 0.5761169 , 0.21194157], dtype=float32)
-    >>> mask = np.asarray([True, False, True], dtype=bool)
+    array([[0.21194157, 0.5761169 , 0.21194157]], dtype=float32)
+    >>> mask = np.asarray([[True, False, True]], dtype=bool)
     >>> layer(inp, mask).numpy()
-    array([0.5, 0. , 0.5], dtype=float32)
+    array([[0.5, 0. , 0.5]], dtype=float32)

     Input shape:
         Arbitrary. Use the keyword argument `input_shape`
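
The updated docstring example reflects the documented behavior change: the layer now expects inputs of rank 2 or higher. An illustrative check, assuming a build with this change:

```python
# Illustrative only; assumes this commit is applied.
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.Softmax()
print(layer(np.asarray([[1.0, 2.0, 1.0]])).numpy())  # rank-2 input: works as documented

try:
    layer(np.asarray([1.0, 2.0, 1.0]))  # rank-1 input is no longer accepted
except ValueError as err:
    print(err)  # e.g. "Cannot apply softmax to a tensor that is 1D. ..."
```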

0 commit comments
