
Commit adad2c0

* Add a `robust_masking` argument to the `Softmax` layer to enable better numerical handling of the mask (currently, if the mask violates any of the layer's assumptions, it silently produces numerically nonsensical results).
* Plumb an argument through the official Keras `MultiHeadAttention` layer and the model garden `TransformerEncoderBlock` layer that opts into the new softmax masking behavior.

PiperOrigin-RevId: 831896060
1 parent 924b2d0 commit adad2c0
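
For orientation, a minimal sketch of the two opt-in flags this commit introduces; both default to False, so existing behavior is unchanged (assumes a standalone tf_keras install and the import alias shown):

import tf_keras as keras

# Softmax gains robust_masking, which thresholds the mask and applies it
# with tf.where instead of adding a large negative value to the logits.
softmax = keras.layers.Softmax(robust_masking=True)

# MultiHeadAttention gains softmax_robust_masking, which is forwarded to
# the Softmax layer it builds internally in _build_attention.
mha = keras.layers.MultiHeadAttention(
    num_heads=4, key_dim=32, softmax_robust_masking=True
)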

File tree

7 files changed: 46 additions & 16 deletions

tf_keras/api/golden/v1/tensorflow.keras.layers.-multi-head-attention.pbtxt

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_heads\', \'key_dim\', \'value_dim\', \'dropout\', \'use_bias\', \'output_shape\', \'attention_axes\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0.0\', \'True\', \'None\', \'None\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_heads\', \'key_dim\', \'value_dim\', \'dropout\', \'use_bias\', \'output_shape\', \'attention_axes\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'softmax_robust_masking\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0.0\', \'True\', \'None\', \'None\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"

tf_keras/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
+    argspec: "args=[\'self\', \'axis\', \'robust_masking\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'False\'], "
   }
   member_method {
     name: "add_loss"

tf_keras/api/golden/v2/tensorflow.keras.layers.-multi-head-attention.pbtxt

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_heads\', \'key_dim\', \'value_dim\', \'dropout\', \'use_bias\', \'output_shape\', \'attention_axes\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0.0\', \'True\', \'None\', \'None\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_heads\', \'key_dim\', \'value_dim\', \'dropout\', \'use_bias\', \'output_shape\', \'attention_axes\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'softmax_robust_masking\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0.0\', \'True\', \'None\', \'None\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"

tf_keras/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
+    argspec: "args=[\'self\', \'axis\', \'robust_masking\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'False\'], "
   }
   member_method {
     name: "add_loss"

tf_keras/layers/activation/softmax.py

Lines changed: 26 additions & 11 deletions
@@ -70,6 +70,8 @@ class Softmax(Layer):
     Args:
         axis: Integer, or list of Integers, axis along which the softmax
             normalization is applied.
+        robust_masking: Bool, if true will use a more robust implementation when
+            dealing with masks.
     Call arguments:
         inputs: The inputs, or logits to the softmax layer.
         mask: A boolean mask of the same shape as `inputs`. The mask
@@ -80,23 +82,34 @@ class Softmax(Layer):
         Softmaxed output with the same shape as `inputs`.
     """
 
-    def __init__(self, axis=-1, **kwargs):
+    def __init__(self, axis=-1, robust_masking=False, **kwargs):
         super().__init__(**kwargs)
         self.supports_masking = True
+        self.robust_masking = robust_masking
         self.axis = axis
 
     def call(self, inputs, mask=None):
         if mask is not None:
-            # Since mask is 1.0 for positions we want to keep and 0.0 for masked
-            # positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -1e.9 for masked positions.
-            adder = (1.0 - tf.cast(mask, inputs.dtype)) * (
-                _large_compatible_negative(inputs.dtype)
-            )
-
-            # Since we are adding it to the raw scores before the softmax, this
-            # is effectively the same as removing these entirely.
-            inputs += adder
+            if self.robust_masking:
+                # We keep the positions where the mask is True or > 0.5, and set
+                # the other (masked) positions to -1e.9.
+                if mask.dtype is not tf.bool:
+                    mask = tf.greater(mask, tf.constant(0.5, dtype=mask.dtype))
+                inputs = tf.where(
+                    mask, inputs, _large_compatible_negative(inputs.dtype)
+                )
+            else:
+                # Since mask is 1.0 for positions we want to keep and 0.0 for
+                # masked positions, this operation will create a tensor which is
+                # 0.0 for positions we want to attend and -1e.9 for masked
+                # positions.
+                adder = (1.0 - tf.cast(mask, inputs.dtype)) * (
+                    _large_compatible_negative(inputs.dtype)
+                )
+
+                # Since we are adding it to the raw scores before the softmax, this
+                # is effectively the same as removing these entirely.
+                inputs += adder
         if isinstance(self.axis, (tuple, list)):
             if len(self.axis) > 1:
                 return tf.exp(
@@ -109,6 +122,8 @@ def call(self, inputs, mask=None):
 
     def get_config(self):
         config = {"axis": self.axis}
+        if self.robust_masking:
+            config["robust_masking"] = True
         base_config = super().get_config()
         return dict(list(base_config.items()) + list(config.items()))
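
To illustrate the difference the new flag makes, here is a minimal sketch with a float mask that violates the 0/1 assumption (the 1.5 value is a deliberately malformed "keep" entry; the tf_keras import alias and eager execution are assumptions):

import tensorflow as tf
import tf_keras as keras

logits = tf.constant([[1.0, 2.0, 3.0]])
# A mask that violates the 0/1 assumption: 1.5 at a position we want to keep.
mask = tf.constant([[1.0, 1.5, 0.0]])

# Legacy path: adder = (1 - 1.5) * -1e9 = +5e8 is silently *added* to the
# second logit, so that position swallows essentially all probability mass.
legacy = keras.layers.Softmax()(logits, mask=mask)

# Robust path: the mask is thresholded at 0.5 and applied with tf.where, so
# 1.5 simply means "keep" and the surviving logits are left untouched.
robust = keras.layers.Softmax(robust_masking=True)(logits, mask=mask)

print(legacy.numpy())  # roughly [[0., 1., 0.]]
print(robust.numpy())  # roughly [[0.27, 0.73, 0.]]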

tf_keras/layers/activation/softmax_test.py

Lines changed: 8 additions & 0 deletions
@@ -31,6 +31,14 @@ def test_softmax(self):
             supports_masking=True,
         )
 
+    def test_softmax_robust_masking(self):
+        test_utils.layer_test(
+            keras.layers.Softmax,
+            kwargs={"axis": 1, "robust_masking": True},
+            input_shape=(2, 3, 4),
+            supports_masking=True,
+        )
+
 
 if __name__ == "__main__":
     tf.test.main()

tf_keras/layers/attention/multi_head_attention.py

Lines changed: 8 additions & 1 deletion
@@ -198,6 +198,8 @@ class MultiHeadAttention(Layer):
         activity_regularizer: Regularizer for dense layer activity.
         kernel_constraint: Constraint for dense layer kernels.
         bias_constraint: Constraint for dense layer kernels.
+        softmax_robust_masking: If true will use a more numerically robust
+            masking impl.
 
     Call arguments:
         query: Query `Tensor` of shape `(B, T, dim)`.
@@ -247,6 +249,7 @@ def __init__(
         activity_regularizer=None,
         kernel_constraint=None,
         bias_constraint=None,
+        softmax_robust_masking=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -264,6 +267,7 @@ def __init__(
         self._activity_regularizer = regularizers.get(activity_regularizer)
         self._kernel_constraint = constraints.get(kernel_constraint)
         self._bias_constraint = constraints.get(bias_constraint)
+        self._softmax_robust_masking = softmax_robust_masking
         if attention_axes is not None and not isinstance(
             attention_axes, collections.abc.Sized
         ):
@@ -298,6 +302,7 @@ def get_config(self):
             "query_shape": self._query_shape,
             "key_shape": self._key_shape,
             "value_shape": self._value_shape,
+            "softmax_robust_masking": self._softmax_robust_masking,
         }
         base_config = super().get_config()
         return dict(list(base_config.items()) + list(config.items()))
@@ -476,7 +481,9 @@ def _build_attention(self, rank):
             )
         )
         self._softmax = activation.Softmax(
-            axis=norm_axes, dtype=self._dtype_policy
+            axis=norm_axes,
+            robust_masking=self._softmax_robust_masking,
+            dtype=self._dtype_policy,
         )
         self._dropout_layer = regularization.Dropout(
             rate=self._dropout, dtype=self._dtype_policy
