
Commit b7a48ad

Merge pull request #121 from zerolovesea/autoint
feat(ranking model): Implementation of AutoInt model
2 parents 6d41ce6 + e15a44b commit b7a48ad

File tree

5 files changed: +184 -2 lines changed

examples/ranking/run_criteo.py

Lines changed: 4 additions & 1 deletion
@@ -7,7 +7,7 @@
 from tqdm import tqdm

 from torch_rechub.basic.features import DenseFeature, SparseFeature
-from torch_rechub.models.ranking import DCN, EDCN, DCNv2, DeepFFM, DeepFM, FatDeepFFM, FiBiNet, WideDeep
+from torch_rechub.models.ranking import DCN, EDCN, AutoInt, DCNv2, DeepFFM, DeepFM, FatDeepFFM, FiBiNet, WideDeep
 from torch_rechub.trainers import CTRTrainer
 from torch_rechub.utils.data import DataGenerator

@@ -72,6 +72,8 @@ def main(dataset_path, model_name, epoch, learning_rate, batch_size, weight_deca
         model = FiBiNet(features=dense_feas + sparse_feas, reduction_ratio=3, mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"})
     elif model_name == "edcn":
         model = EDCN(features=dense_feas + sparse_feas, n_cross_layers=3, mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"})
+    elif model_name == "autoint":
+        model = AutoInt(dense_features=dense_feas, sparse_features=sparse_feas, num_layers=3, num_heads=2, dropout=0.2, mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"})
     elif model_name == "deepffm":
         model = DeepFFM(linear_features=ffm_linear_feas, cross_features=ffm_cross_feas, embed_dim=10, mlp_params={"dims": [1600, 1600], "dropout": 0.5, "activation": "relu"})
     elif model_name == "fat_deepffm":

@@ -104,6 +106,7 @@ def main(dataset_path, model_name, epoch, learning_rate, batch_size, weight_deca
     python run_criteo.py --model_name dcn
     python run_criteo.py --model_name dcn_v2
     python run_criteo.py --model_name edcn
+    python run_criteo.py --model_name autoint
     python run_criteo.py --model_name deepffm
     python run_criteo.py --model_name fat_deepffm
     """

tests/test_e2e_ranking.py

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ def test_ranking_e2e(model_class, ranking_data):
         params = {"features": features, "attention_dim": 16, "mlp_params": {"dims": [32]}}
     elif model_name == 'FiBiNet':
         params = {"features": features, "reduction_ratio": 3, "mlp_params": {"dims": [32]}}
+    elif model_name == 'AutoInt':
+        params = {"sparse_features": sparse_feats, "dense_features": dense_feats, "num_layers": 3, "num_heads": 2, "dropout": 0.0, "mlp_params": {"dims": [32]}}
     elif model_name in ["DeepFFM", "FatDeepFFM"]:
         # DeepFFM needs special features
         ffm_feats = [SparseFeature(f.name, f.vocab_size, 16) for f in sparse_feats]

torch_rechub/basic/layers.py

Lines changed: 74 additions & 0 deletions
@@ -719,3 +719,77 @@ def forward(self, em):
         # [batch_size, num_field_crosses, embed_dim]
         aem = s.unsqueeze(-1) * em
         return aem.flatten(start_dim=1)
+
+
+class InteractingLayer(nn.Module):
+    """Multi-head self-attention based interacting layer, used in the AutoInt model.
+
+    Args:
+        embed_dim (int): the embedding dimension.
+        num_heads (int): the number of attention heads (default=2).
+        dropout (float): the dropout rate (default=0.0).
+        residual (bool): whether to use a residual connection (default=True).
+
+    Shape:
+        - Input: `(batch_size, num_fields, embed_dim)`
+        - Output: `(batch_size, num_fields, embed_dim)`
+    """
+
+    def __init__(self, embed_dim, num_heads=2, dropout=0.0, residual=True):
+        super().__init__()
+        if embed_dim % num_heads != 0:
+            raise ValueError("embed_dim must be divisible by num_heads")
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.scale = self.head_dim**-0.5
+        self.residual = residual
+
+        self.W_Q = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.W_K = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.W_V = nn.Linear(embed_dim, embed_dim, bias=False)
+
+        # Projection for the residual connection
+        self.W_Res = nn.Linear(embed_dim, embed_dim, bias=False) if residual else None
+        self.dropout = nn.Dropout(dropout) if dropout > 0 else None
+
+    def forward(self, x):
+        """
+        Args:
+            x: input tensor with shape (batch_size, num_fields, embed_dim)
+        """
+        batch_size, num_fields, embed_dim = x.shape
+
+        # Linear projections
+        Q = self.W_Q(x)  # (batch_size, num_fields, embed_dim)
+        K = self.W_K(x)  # (batch_size, num_fields, embed_dim)
+        V = self.W_V(x)  # (batch_size, num_fields, embed_dim)
+
+        # Reshape for multi-head attention
+        # (batch_size, num_heads, num_fields, head_dim)
+        Q = Q.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(1, 2)
+        K = K.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(1, 2)
+        V = V.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(1, 2)
+
+        # Scaled dot-product attention
+        # (batch_size, num_heads, num_fields, num_fields)
+        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
+        attn_weights = F.softmax(attn_scores, dim=-1)
+
+        if self.dropout is not None:
+            attn_weights = self.dropout(attn_weights)
+
+        # Apply attention to values
+        # (batch_size, num_heads, num_fields, head_dim)
+        attn_output = torch.matmul(attn_weights, V)
+
+        # Concatenate heads
+        # (batch_size, num_fields, embed_dim)
+        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, num_fields, embed_dim)
+
+        # Residual connection
+        if self.residual and self.W_Res is not None:
+            attn_output = attn_output + self.W_Res(x)
+
+        return F.relu(attn_output)
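
A quick standalone shape check for the layer added above, assuming it is importable from torch_rechub.basic.layers once this commit lands. With embed_dim=16 and num_heads=2, each head attends over the fields with head_dim = 16 // 2 = 8, and the heads are concatenated back to embed_dim before the residual projection and ReLU:

    import torch
    from torch_rechub.basic.layers import InteractingLayer

    layer = InteractingLayer(embed_dim=16, num_heads=2, dropout=0.1)
    x = torch.randn(4, 10, 16)           # (batch_size=4, num_fields=10, embed_dim=16)
    out = layer(x)
    assert out.shape == (4, 10, 16)      # attention preserves the (batch, fields, embed) shape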

torch_rechub/models/ranking/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
-__all__ = ['WideDeep', 'DeepFM', 'DCN', 'DCNv2', 'EDCN', 'AFM', 'FiBiNet', 'DeepFFM', 'BST', 'DIN', 'DIEN', 'FatDeepFFM']
+__all__ = ['WideDeep', 'DeepFM', 'DCN', 'DCNv2', 'EDCN', 'AFM', 'FiBiNet', 'DeepFFM', 'BST', 'DIN', 'DIEN', 'FatDeepFFM', 'AutoInt']

 from .afm import AFM
+from .autoint import AutoInt
 from .bst import BST
 from .dcn import DCN
 from .dcn_v2 import DCNv2

torch_rechub/models/ranking/autoint.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+"""
+Date: create on 14/11/2025
+References:
+    paper: (CIKM'2019) AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks
+    url: https://arxiv.org/abs/1810.11921
+Authors: Yang Zhou, [email protected]
+"""
+
+import torch
+import torch.nn as nn
+
+from ...basic.layers import LR, MLP, EmbeddingLayer, InteractingLayer
+
+
+class AutoInt(torch.nn.Module):
+    """AutoInt Model
+
+    Args:
+        sparse_features (list): the list of `SparseFeature` Class
+        dense_features (list): the list of `DenseFeature` Class
+        num_layers (int): number of interacting layers
+        num_heads (int): number of attention heads
+        dropout (float): dropout rate for attention
+        mlp_params (dict): parameters for MLP, keys: {"dims": list, "activation": str, "dropout": float, "output_layer": bool}
+    """
+
+    def __init__(self, sparse_features, dense_features, num_layers=3, num_heads=2, dropout=0.0, mlp_params=None):
+        super(AutoInt, self).__init__()
+        self.sparse_features = sparse_features
+        self.dense_features = dense_features if dense_features is not None else []
+        if len(self.sparse_features) == 0:
+            raise ValueError("AutoInt requires at least one sparse feature to determine embed_dim.")
+        # all sparse features are assumed to share the same embedding dimension
+        self.embed_dim = self.sparse_features[0].embed_dim
+
+        # field nums = sparse + dense
+        self.num_sparse = len(self.sparse_features)
+        self.num_dense = len(self.dense_features)
+        self.num_fields = self.num_sparse + self.num_dense
+
+        # total dims = num_fields * embed_dim
+        self.dims = self.num_fields * self.embed_dim
+        self.num_layers = num_layers
+
+        self.sparse_embedding = EmbeddingLayer(self.sparse_features)
+
+        # dense feature embedding: project each scalar dense value to embed_dim
+        self.dense_embeddings = nn.ModuleDict()
+        for fea in self.dense_features:
+            self.dense_embeddings[fea.name] = nn.Linear(1, self.embed_dim, bias=False)
+
+        self.interacting_layers = torch.nn.ModuleList([InteractingLayer(self.embed_dim, num_heads=num_heads, dropout=dropout, residual=True) for _ in range(num_layers)])
+
+        self.linear = LR(self.dims)
+
+        self.attn_linear = nn.Linear(self.dims, 1)
+
+        if mlp_params is not None:
+            self.use_mlp = True
+            self.mlp = MLP(self.dims, **mlp_params)
+        else:
+            self.use_mlp = False
+
+    def forward(self, x):
+        # sparse feature embedding: [B, num_sparse, embed_dim]
+        sparse_emb = self.sparse_embedding(x, self.sparse_features, squeeze_dim=False)
+
+        dense_emb_list = []
+        for fea in self.dense_features:
+            v = x[fea.name].float().view(-1, 1, 1)
+            dense_emb = self.dense_embeddings[fea.name](v)  # [B, 1, embed_dim]
+            dense_emb_list.append(dense_emb)
+
+        if len(dense_emb_list) > 0:
+            dense_emb = torch.cat(dense_emb_list, dim=1)  # [B, num_dense, d]
+            embed_x = torch.cat([sparse_emb, dense_emb], dim=1)  # [B, num_fields, d]
+        else:
+            embed_x = sparse_emb  # [B, num_sparse, d]
+
+        embed_x_flatten = embed_x.flatten(start_dim=1)  # [B, num_fields * embed_dim]
+
+        # Multi-head self-attention layers
+        attn_out = embed_x
+        for layer in self.interacting_layers:
+            attn_out = layer(attn_out)  # [B, num_fields, embed_dim]
+
+        # Attention output projection
+        attn_out_flatten = attn_out.flatten(start_dim=1)  # [B, num_fields * embed_dim]
+        y_attn = self.attn_linear(attn_out_flatten)  # [B, 1]
+
+        # Linear part
+        y_linear = self.linear(embed_x_flatten)  # [B, 1]
+
+        y = y_attn + y_linear
+        # Deep MLP part
+        if self.use_mlp:
+            y_deep = self.mlp(embed_x_flatten)  # [B, 1]
+            y = y + y_deep
+
+        return torch.sigmoid(y.squeeze(1))
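
The forward pass takes the same dict-of-tensors batch that the other torch-rechub ranking models use: one tensor per feature name, which is how x[fea.name] is indexed above. A minimal forward sketch with hypothetical feature names and shapes (not part of this diff):

    import torch
    from torch_rechub.basic.features import DenseFeature, SparseFeature
    from torch_rechub.models.ranking import AutoInt

    dense_feas = [DenseFeature("price")]                 # hypothetical dense field
    sparse_feas = [SparseFeature("user_id", 1000, 16),   # (name, vocab_size, embed_dim)
                   SparseFeature("item_id", 500, 16)]
    model = AutoInt(sparse_features=sparse_feas, dense_features=dense_feas,
                    num_layers=2, num_heads=2, dropout=0.0, mlp_params={"dims": [32]})

    batch = {
        "price": torch.rand(8),                          # dense values, shape (B,)
        "user_id": torch.randint(0, 1000, (8,)),         # sparse ids, shape (B,)
        "item_id": torch.randint(0, 500, (8,)),
    }
    y = model(batch)                                     # shape (8,), sigmoid scores in (0, 1)

Since embed_dim is taken from the first sparse feature, all sparse features should share one embedding dimension; dense features are projected to that same dimension by their per-field nn.Linear(1, embed_dim) before entering the interacting layers.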
