
Commit c7356e7

Implement L2NormHook
Signed-off-by: Daniel Korzekwa <[email protected]>
1 parent 0e69d34 commit c7356e7

File tree

3 files changed, +174 −64 lines changed

modelopt/torch/nas/plugins/hooks.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Forward hooks for activation-based importance estimation in NAS plugins."""
+
+from abc import ABC, abstractmethod
+
+import torch
+from megatron.core.tensor_parallel import gather_from_tensor_model_parallel_region
+from torch import nn
+
+
+class ForwardHook(ABC):
+    """Base class for PyTorch forward hooks.
+
+    This follows the PyTorch forward hook API where the second
+    parameter is 'args' (a tuple of positional arguments passed to forward()).
+
+    Usage:
+        hook = MyHook()
+        module.register_forward_hook(hook)
+    """
+
+    @abstractmethod
+    def __call__(
+        self, module: nn.Module, args: tuple[torch.Tensor, ...], output: torch.Tensor
+    ) -> None:
+        """Forward hook that is called after the module's forward pass.
+
+        Args:
+            module: The module this hook is registered on
+            args: Tuple of positional arguments passed to module.forward()
+            output: The output from module.forward()
+
+        Returns:
+            None (does not modify the output)
+        """
+        ...
+
+
+class L2NormHook(ForwardHook):
+    """Hook for accumulating activation statistics for importance estimation.
+
+    Activations are computed as mean over seq_len and then squared and summed over batch_size.
+    In the accumulate() method we take the square root of the sum to get the L2 norm.
+
+    Args:
+        max_size: Optional maximum expected size to validate against (skips if mismatch).
+            Useful for skipping non-max subnets during profiling.
+    """
+
+    def __init__(self, max_size: int | None = None):
+        """Initialize the L2NormHook."""
+        self.max_size = max_size
+        self._activations: torch.Tensor | None = None
+
+    def __call__(
+        self, module: nn.Module, args: tuple[torch.Tensor, ...], output: torch.Tensor
+    ) -> None:
+        """Accumulate activation statistics from the forward pass."""
+        # Gather input [seq_len, batch_size, hidden_size] over all TP regions
+        # NOTE: This is not used at the moment since we restrict to TP=1
+        input_tensor = gather_from_tensor_model_parallel_region(args[0]).detach()
+
+        # Dont aggregate activations from non-max subnets (e.g. from profiling)
+        if self.max_size is not None and input_tensor.shape[-1] != self.max_size:
+            return
+
+        input_tensor = input_tensor.to(torch.float32)  # use full precision to avoid overflow
+        activations = input_tensor.abs().mean(dim=0)  # [batch_size, hidden_size]
+        activations = activations.pow(2).sum(dim=0)  # [hidden_size]
+
+        if self._activations is None:
+            self._activations = activations
+        else:
+            self._activations += activations
+
+    def accumulate(self) -> torch.Tensor:
+        """Return the accumulated L2 norm of activations.
+
+        Returns:
+            Tensor of accumulated scores, one per channel
+
+        Raises:
+            AssertionError: If no activations have been collected yet
+        """
+        assert self._activations is not None, "No activations collected for importance estimation."
+        # Convert squared sum to L2 norm
+        return self._activations.pow(0.5)
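For context, a minimal usage sketch of the new hook (not part of this commit). Because __call__ routes the input through gather_from_tensor_model_parallel_region, it assumes Megatron's tensor-parallel state is already initialized (the code currently targets TP=1); model, calib_batches, and the layer path are placeholders, and the import path is inferred from the relative import in megatron.py below.

    from modelopt.torch.nas.plugins.hooks import L2NormHook  # path inferred, not confirmed by this diff

    # Score the input channels of linear_fc2, i.e. one score per ffn_hidden_size channel.
    hook = L2NormHook(max_size=model.config.ffn_hidden_size)
    handle = model.decoder.layers[0].mlp.linear_fc2.register_forward_hook(hook)

    for batch in calib_batches:  # placeholder: a few calibration forward passes
        model(**batch)

    scores = hook.accumulate()  # per-channel L2 norm, shape [ffn_hidden_size]
    handle.remove()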

modelopt/torch/nas/plugins/megatron.py

Lines changed: 28 additions & 64 deletions
@@ -75,6 +75,7 @@
 from ..registry import DMRegistry
 from ..search_space import SampleFunc
 from ..traced_hp import TracedHp
+from .hooks import L2NormHook

 SUPPORTED_MODELS = {GPTModel: "megatron.core.models.gpt.GPTModel"}

@@ -211,37 +212,17 @@ def _setup(self):
         # can be discarded.
         # This limitation might be fixed in OMNIML-180 (Flexible Importance Estimator)
         # where we separate the importance estimation from the dynamic module.
-        self._register_temp_attribute("_activations", None)
-        self.hook_handle = self.linear_fc2.register_forward_hook(self._linear_fc2_forward_hook)
+        max_ffn_size = self.get_hparam("ffn_hidden_size").max
+        assert isinstance(max_ffn_size, int), "ffn_hidden_size.max must be an int"
+        activation_hook = L2NormHook(max_size=max_ffn_size)
+        self._register_temp_attribute("_activation_hook", activation_hook)
+        # TODO: confusion: why hook_handle is removed manually in export() and not using _register_temp_attribute?
+        self.hook_handle = self.linear_fc2.register_forward_hook(activation_hook)
         ffn_hidden_size.register_importance(self._estimate_importance)

-    def _linear_fc2_forward_hook(self, module, input, output):
-        """Hook to collect activations for importance estimation.
-
-        Activations are computed as mean over seq_len and then squared and summed over batch_size.
-        Later we take the square root of the sum to get the L2 norm.
-        """
-        # Gather input [seq_len, batch_size, ffn_hidden_size] over all TP regions
-        # NOTE: This is not used at the moment since we restrict to TP=1
-        input = gather_from_tensor_model_parallel_region(input[0]).detach()
-
-        # Dont aggregate activations from non-max subnets (e.g. from profiling)
-        if input.shape[-1] != self.get_hparam("ffn_hidden_size").max:
-            return
-
-        input = input.to(torch.float32)  # use full precision to avoid overflow
-        activations = input.abs().mean(dim=0)  # [batch_size, ffn_hidden_size]
-        activations = activations.pow(2).sum(dim=0)  # [ffn_hidden_size]
-        if self._activations is None:
-            self._activations = activations
-        else:
-            self._activations += activations
-
     def _estimate_importance(self) -> TracedHp.Importance:
         """Return the activation magnitude-based importance of the ffn_hidden_size."""
-        assert self._activations is not None, "No activations collected for importance estimation."
-        # Convert squared sum to L2 norm
-        return self._activations.pow(0.5)
+        return self._activation_hook.accumulate()

     def export(self) -> torch.nn.Module:
         """Export the dynamic module to a torch.nn.Module."""
@@ -545,46 +526,21 @@ def _setup(self):
         )

         # register importance estimator for linear_qkv.output_size and linear_proj.input_size
-        self._register_temp_attribute("_activations", None)
-        self.hook_handle = self.linear_proj.register_forward_hook(self._linear_proj_forward_hook)
+        num_heads_per_group_max = self.get_hparam("num_heads_per_group").max
+        num_query_groups_max = self.get_hparam("num_query_groups").max
+        max_size = num_heads_per_group_max * num_query_groups_max * self.config.kv_channels
+        activation_hook = L2NormHook(max_size=max_size)
+        self._register_temp_attribute("_activation_hook", activation_hook)
+        self.hook_handle = self.linear_proj.register_forward_hook(activation_hook)
         # NOTE: num_heads_per_group's slice_order will be of length num_attention_heads to be able to sort heads,
         # otherwise we would only have aggregated importance of heads per group.
         # While enforcing order during `sort_parameters`, we dont check the shape of the slice_order
         num_heads_per_group.register_importance(self._estimate_all_head_importance)
         num_query_groups.register_importance(self._estimate_query_group_importance)

-    def _linear_proj_forward_hook(self, module, input, output):
-        """Hook to collect activations for importance estimation.
-
-        Activations are computed as mean over seq_len and then squared and summed over batch_size.
-        Later we take the square root of the sum to get the L2 norm.
-        """
-        # Gather input [seq_len, batch_size, query_projection_size] over all TP regions
-        # NOTE: This is not used at the moment since we restrict to TP=1
-        input = gather_from_tensor_model_parallel_region(input[0]).detach()
-
-        # Dont aggregate activations from non-max subnets (e.g. from profiling)
-        if (
-            input.shape[-1]
-            != self.get_hparam("num_heads_per_group").max
-            * self.get_hparam("num_query_groups").max
-            * self.config.kv_channels
-        ):
-            return
-
-        input = input.to(torch.float32)  # use full precision to avoid overflow
-        activations = input.abs().mean(dim=0)
-        activations = activations.pow(2).sum(dim=0)  # [query_projection_size]
-        if self._activations is None:
-            self._activations = activations
-        else:
-            self._activations += activations
-
     def _estimate_all_head_importance(self) -> TracedHp.Importance:
         """Return the importance for num_attention_heads (num_heads_per_group * num_query_groups)."""
-        assert self._activations is not None, "No activations collected for importance estimation."
-        # Convert squared sum to L2 norm
-        scores = self._activations.pow(0.5)
+        scores = self._activation_hook.accumulate()
         attn_head_importance = torch.linalg.vector_norm(
             scores.view(
                 self.get_hparam("num_heads_per_group").max
@@ -598,9 +554,7 @@ def _estimate_all_head_importance(self) -> TracedHp.Importance:

     def _estimate_query_group_importance(self) -> TracedHp.Importance:
         """Return the importance of the ``num_query_groups`` hparam."""
-        assert self._activations is not None, "No activations collected for importance estimation."
-        # Convert squared sum to L2 norm
-        scores = self._activations.pow(0.5)
+        scores = self._activation_hook.accumulate()
         group_importance = torch.linalg.vector_norm(
             scores.view(
                 self.get_hparam("num_heads_per_group").max,
@@ -1353,7 +1307,12 @@ def get_activations_and_layer_scores(
         """Get the per-rank activations and layer scores from the module."""
         local_activations = {}
         for n, m in self.named_modules():
-            if hasattr(m, "_activations"):
+            # New pattern: activations stored in hook
+            if hasattr(m, "_activation_hook") and m._activation_hook._activations is not None:
+                local_activations[n] = m._activation_hook._activations
+            # Legacy pattern: activations stored directly on module.
+            # TODO: remove this once we switch to the new pattern.
+            elif hasattr(m, "_activations") and m._activations is not None:
                 local_activations[n] = m._activations
         activations_per_rank = dist.allgather(
             local_activations, group=get_pipeline_model_parallel_group()
@@ -1385,7 +1344,12 @@ def set_activations_and_layer_scores(
         for layer in self.decoder.layers:
             layer._scores = layer_scores[layer.layer_number]
         for n, m in self.named_modules():
-            if hasattr(m, "_activations"):
+            # New pattern: activations stored in hook
+            if hasattr(m, "_activation_hook"):
+                m._activation_hook._activations = activations_per_rank[rank][n]
+            # Legacy pattern: activations stored directly on module.
+            # TODO: remove this once we switch to the new pattern.
+            elif hasattr(m, "_activations"):
                 m._activations = activations_per_rank[rank][n]

tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py

Lines changed: 46 additions & 0 deletions
@@ -132,6 +132,52 @@ def forward_loop(m):
     assert pruning_scores["layer_scores"]
     assert pruning_scores["activations_per_rank"]

+    # TODO: Simplify it: this unit test is too long,
+    # hard to read (the same set of assertions across different test cases with if-else).
+
+    assert len(pruning_scores["activations_per_rank"]) == 1
+    rank_0_activations = pruning_scores["activations_per_rank"][0]
+
+    # Test case 1: MHA - pruned ffn/4 (num_attention_heads=8, num_query_groups=8, ffn_div=4)
+    if pruned_ffn_div == 4:
+        # Layer scores (these use cosine similarity, independent of FFN activation hook)
+        assert pruning_scores["layer_scores"][1] == pytest.approx(2.1437832713127136, abs=1e-5)
+        assert pruning_scores["layer_scores"][2] == pytest.approx(1.792158305644989, abs=1e-5)
+
+        # Validate decoder.layers.0.mlp activations
+        mlp_0_acts = rank_0_activations["decoder.layers.0.mlp"]
+        assert mlp_0_acts.min().item() == pytest.approx(0.0011843212, abs=1e-5)
+        assert mlp_0_acts.max().item() == pytest.approx(1.0846971273, abs=1e-5)
+        assert mlp_0_acts.mean().item() == pytest.approx(0.0535472594, abs=1e-5)
+
+        # Validate decoder.layers.1.mlp activations
+        mlp_1_acts = rank_0_activations["decoder.layers.1.mlp"]
+        assert mlp_1_acts.min().item() == pytest.approx(0.0002450741, abs=1e-5)
+        assert mlp_1_acts.max().item() == pytest.approx(1.1014972925, abs=1e-5)
+        assert mlp_1_acts.mean().item() == pytest.approx(0.0904172808, abs=1e-5)
+
+    # Test case 2: GQA - pruned attention/2 (num_attention_heads=8, num_query_groups=4, attention_div=2)
+    elif pruned_num_attention_heads_div == 2 and pruned_ffn_div == 1:
+        # Layer scores
+        assert pruning_scores["layer_scores"][1] == pytest.approx(2.1119985580444336, abs=1e-5)
+        assert pruning_scores["layer_scores"][2] == pytest.approx(1.7729830741882324, abs=1e-5)
+
+        # Validate decoder.layers.0.self_attention activations
+        assert "decoder.layers.0.self_attention" in rank_0_activations
+        attn_0_acts = rank_0_activations["decoder.layers.0.self_attention"]
+        assert attn_0_acts.shape == torch.Size([256])
+        assert attn_0_acts.min().item() == pytest.approx(0.03729403391480446, abs=1e-5)
+        assert attn_0_acts.max().item() == pytest.approx(0.3653244972229004, abs=1e-5)
+        assert attn_0_acts.mean().item() == pytest.approx(0.15008458495140076, abs=1e-5)
+
+        # Validate decoder.layers.1.self_attention activations
+        assert "decoder.layers.1.self_attention" in rank_0_activations
+        attn_1_acts = rank_0_activations["decoder.layers.1.self_attention"]
+        assert attn_1_acts.shape == torch.Size([256])
+        assert attn_1_acts.min().item() == pytest.approx(0.140824556350708, abs=1e-5)
+        assert attn_1_acts.max().item() == pytest.approx(1.0845409631729126, abs=1e-5)
+        assert attn_1_acts.mean().item() == pytest.approx(0.4730667173862457, abs=1e-5)
+
     # Assert weights are pruned correctly
     for layer in model.decoder.layers:
         assert layer.mlp.linear_fc1.weight.shape == (
