from typing import Any, Optional

import keras
from keras import ops

from keras_rs.src import types
from keras_rs.src.api_export import keras_rs_export
from keras_rs.src.utils.keras_utils import check_shapes_compatible

SUPPORTED_COMBINERS = ("mean", "sum", "sqrtn")


@keras_rs_export("keras_rs.layers.EmbedReduce")
class EmbedReduce(keras.layers.Embedding):
    """An embedding layer that reduces with a combiner.

    This layer embeds inputs and then applies a reduction to combine a set of
    embeddings into a single embedding. This is typically used to embed a
    sequence of items as a single embedding.

    If the inputs passed to `__call__` are 1D, no reduction is applied. If the
    inputs are 2D, dimension 1 is reduced using the combiner so that the
    result has shape `(batch_size, output_dim)`. Inputs of rank 3 and higher
    are not allowed. Weights can optionally be passed to the `__call__` method
    to apply weights to different samples before reduction.

    This layer supports sparse inputs and ragged inputs with backends that
    support them. The output after reduction is dense. For ragged inputs, the
    ragged dimension must be 1 as it is the dimension that is reduced.
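
    Example (an illustrative sketch; the vocabulary size, embedding dimension
    and ids below are hypothetical):

    ```python
    import keras
    import keras_rs

    layer = keras_rs.layers.EmbedReduce(input_dim=100, output_dim=8)
    # Each row of ids is embedded, then reduced over dimension 1 with the
    # combiner ("mean" by default) into a single embedding per row.
    outputs = layer(keras.ops.convert_to_tensor([[1, 2, 3], [4, 5, 6]]))
    # `outputs` is a dense tensor of shape (2, 8).
    ```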

    Args:
        input_dim: Integer. Size of the vocabulary, maximum integer index + 1.
        output_dim: Integer. Dimension of the dense embedding.
        embeddings_initializer: Initializer for the `embeddings` matrix (see
            `keras.initializers`).
        embeddings_regularizer: Regularizer function applied to the
            `embeddings` matrix (see `keras.regularizers`).
        embeddings_constraint: Constraint function applied to the `embeddings`
            matrix (see `keras.constraints`).
        mask_zero: Boolean, whether or not the input value 0 is a special
            "padding" value that should be masked out. This is useful when
            using recurrent layers which may take variable length input. If
            this is `True`, then all subsequent layers in the model need to
            support masking or an exception will be raised. If `mask_zero` is
            set to `True`, as a consequence, index 0 cannot be used in the
            vocabulary (`input_dim` should equal size of vocabulary + 1).
        weights: Optional floating-point matrix of size
            `(input_dim, output_dim)`. The initial embeddings values to use.
        combiner: Specifies how to reduce if there are multiple entries in a
            single row. Currently `mean`, `sqrtn` and `sum` are supported.
            `mean` is the default. `sqrtn` often achieves good accuracy, in
            particular with bag-of-words columns.
        **kwargs: Additional keyword arguments passed to `Embedding`.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        embeddings_initializer: types.InitializerLike = "uniform",
        embeddings_regularizer: Optional[types.RegularizerLike] = None,
        embeddings_constraint: Optional[types.ConstraintLike] = None,
        mask_zero: bool = False,
        weights: Optional[types.Tensor] = None,
        combiner: str = "mean",
        **kwargs: Any,
    ) -> None:
        super().__init__(
            input_dim,
            output_dim,
            embeddings_initializer=embeddings_initializer,
            embeddings_regularizer=embeddings_regularizer,
            embeddings_constraint=embeddings_constraint,
            mask_zero=mask_zero,
            weights=weights,
            **kwargs,
        )
        if combiner not in SUPPORTED_COMBINERS:
            raise ValueError(
                f"Invalid `combiner`: '{combiner}', "
                f"use one of {', '.join(SUPPORTED_COMBINERS)}."
            )
        self.combiner = combiner

    def call(
        self,
        inputs: types.Tensor,
        weights: Optional[types.Tensor] = None,
    ) -> types.Tensor:
        """Apply embedding and reduction.

        Args:
            inputs: 1D tensor to embed or 2D tensor to embed and reduce.
            weights: Optional tensor of weights to apply before reduction,
                which can be 1D or 2D and must match the first dimension of
                `inputs` (1D case) or the full shape of `inputs` (2D case).

        Returns:
            A dense 2D tensor of shape `(batch_size, output_dim)`.
        """
        x = super().call(inputs)
        unreduced_rank = len(x.shape)

        # Check that `weights` has a compatible shape.
        if weights is not None:
            weights_rank = len(weights.shape)
            if weights_rank > unreduced_rank or not check_shapes_compatible(
                x.shape[0:weights_rank], weights.shape
            ):
                raise ValueError(
                    f"The shape of `weights`: {weights.shape} is not compatible"
                    f" with the shape of `inputs` after embedding: {x.shape}."
                )
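        # Illustrative (hypothetical shapes): with 2D `inputs` of shape
        # (batch_size, seq_len), `x` has shape (batch_size, seq_len,
        # output_dim); `weights` of shape (batch_size,) or
        # (batch_size, seq_len) is accepted, while (batch_size, seq_len + 1)
        # raises.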

        dtype = (
            x.dtype
            if weights is None
            else keras.backend.result_type(x.dtype, weights.dtype)
        )
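        # Illustrative (assumed promotion rule): float32 embeddings combined
        # with float64 weights promote the computation to float64.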

        # When `weights` is `None`:
        # - For ragged inputs, after embedding, we get a ragged result that
        #   has a ragged dimension of 1, but when we do the "mean" or "sqrtn"
        #   reduction, we need to divide by the number of items in each row.
        #   However, there is no explicit cross-backend API to get the row
        #   lengths. `ones_like` gives us a ragged tensor that is ragged in
        #   the same way as the inputs. When we do
        #   `ops.sum(weights, axis=-2)`, it gives us the number of items per
        #   row.
        # - For sparse inputs, after embedding, we get a dense tensor, not a
        #   sparse tensor, in which missing values were looked up as index 0.
        #   These are bogus embeddings and should be ignored. `ones_like`
        #   gives us a sparse tensor with the exact same missing values.
        #   Later, we do `x = ops.multiply(x, weights)`, which masks the bogus
        #   values (note that `weights` has been densified beforehand).
        #   Additionally, `ops.sum(weights, axis=-2)` gives us the number of
        #   items per row.
        #
        # When `unreduced_rank <= 2`, the inputs were 1D and dense, there is
        # only one embedding per row, and no real reduction is going on:
        # - For "mean": `result = weights * x / weights = x`, so we don't need
        #   `weights`.
        # - For "sqrtn": `result = weights * x / sqrt(square(weights)) = x`,
        #   so we don't need `weights`.
        # - For "sum", however, `result = weights * x`, so we do need
        #   `weights`.
        # So for "mean" and "sqrtn" we don't need the weights and use ones
        # instead. This avoids divisions by zero and improves precision.
        if weights is None or (unreduced_rank <= 2 and self.combiner != "sum"):
            # Discard the weights if there were some, and create a mask for
            # ragged and sparse tensors to mask the result correctly (sparse
            # only) and to apply the reduction correctly (ragged and sparse).
            weights = ops.ones_like(inputs, dtype=dtype)
        else:
            weights = ops.cast(weights, dtype)
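        # Illustrative (hypothetical values): for ragged `inputs`
        # [[3, 1, 4], [5]] with no weights, `ones_like` yields
        # [[1, 1, 1], [1]], and the later `ops.sum(weights, axis=-2)`
        # recovers the per-row counts (3 and 1) used as the "mean" divisor.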

        # When looking up with sparse indices, the result is dense but
        # contains values that should be ignored, as all missing values use
        # index 0. We use `weights` as a mask, but it needs to be densified
        # because `expand_dims` and broadcasting on a sparse tensor do not
        # produce the expected result.
        weights = ops.convert_to_tensor(weights, sparse=False)

        # Make weights and the unreduced embeddings have the same rank.
        weights_rank = len(weights.shape)
        if weights_rank < unreduced_rank:
            weights = ops.expand_dims(
                weights, axis=tuple(range(weights_rank, unreduced_rank))
            )
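        # Illustrative (hypothetical shapes): 2D `weights` of shape
        # (batch_size, seq_len) become (batch_size, seq_len, 1) and broadcast
        # against `x` of shape (batch_size, seq_len, output_dim).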

        # Note that `x` and `weights` are:
        # - ragged if `inputs` was ragged and `weights` was ragged or None,
        # - dense otherwise (even if `inputs` and `weights` were sparse).
        x = ops.multiply(x, weights)

        if unreduced_rank <= 2:
            # No reduction is applied.
            return x

        # After this reduction, `x` is always dense, as we reduce the ragged
        # dimension in the ragged case.
        x = ops.sum(x, axis=-2)

        # Apply the right divisor for the combiner. Where `weights` appears
        # in the divisor, we use `ops.sum(weights, axis=-2)`, which is always
        # dense because we reduce the ragged dimension in the ragged case.
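        # Illustrative (hypothetical values): for a row with 4 valid entries
        # and all-ones weights, "sum" returns the sum of the 4 embeddings,
        # "mean" divides that sum by 4, and "sqrtn" divides it by
        # sqrt(4) = 2.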
        if self.combiner == "mean":
            return ops.divide_no_nan(x, ops.sum(weights, axis=-2))
        elif self.combiner == "sum":
            return x
        elif self.combiner == "sqrtn":
            return ops.divide_no_nan(
                x, ops.sqrt(ops.sum(ops.square(weights), axis=-2))
            )

    def get_config(self) -> dict[str, Any]:
        config: dict[str, Any] = super().get_config()

        config.update(
            {
                "combiner": self.combiner,
            }
        )

        return config