
Commit afd4fb6

Speed up GroupNorm, improve normalization testing (#637)
Due to an API incompatibility between Torch GroupNorm and TensorRT GroupNorm, the implementation uses InstanceNorm as a workaround (the same WAR used by the ONNX parser and the Torch -> ONNX converter). The latest ONNX opset supports scale and bias with shape `(num_channels,)`, so the TRT API will likely support this eventually, at which point we can switch to the most direct implementation. Running 1k iterations in a loop shows the new implementation is roughly 17% faster on average. The nsys trace shows that the new implementation is significantly better fused (only two computation kernels: one for the InstanceNorm and one for the affine transform), and a single iteration of the module is up to 40% faster (30µs vs. 50µs).
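For reference (not part of this commit), a minimal standalone PyTorch sketch of the equivalence the workaround relies on: GroupNorm over C channels with G groups matches InstanceNorm over G "channels" once each group is folded into a single channel. The names, shapes, and eps below are illustrative assumptions.

# Standalone PyTorch sketch (not from this commit): GroupNorm(G, C) on an
# [N, C, H, W] input matches InstanceNorm over G "channels" after folding
# each group of C // G channels (plus spatial dims) into one channel.
import torch

N, C, G, H, W = 2, 6, 3, 4, 4
x = torch.randn(N, C, H, W)

gn = torch.nn.GroupNorm(G, C, eps=1e-5, affine=False)
inorm = torch.nn.InstanceNorm1d(G, eps=1e-5, affine=False)

# Fold each group into one "instance channel": [N, G, (C // G) * H * W]
grouped = x.reshape(N, G, (C // G) * H * W)
via_instance_norm = inorm(grouped).reshape(N, C, H, W)

assert torch.allclose(gn(x), via_instance_norm, atol=1e-5)

The new forward() does the same thing with nvtripy ops: split/stack to group the channels, InstanceNorm with unit scale and zero bias, flatten back, then the per-channel affine transform.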
1 parent e241e45 commit afd4fb6

File tree: 9 files changed (+280, -110 lines)

tripy/nvtripy/frontend/module/groupnorm.py

Lines changed: 38 additions & 22 deletions
@@ -24,6 +24,8 @@
 from nvtripy.frontend.module.parameter import DefaultParameter
 from nvtripy.frontend.tensor import Tensor
 
+from nvtripy.frontend.module.instancenorm import InstanceNorm
+
 
 @export.public_api(document_under="operations/modules")
 @dataclass
@@ -35,6 +37,8 @@ class GroupNorm(Module):
     :math:`\text{GroupNorm}(x) = \Large \frac{x - \bar{x}}{ \sqrt{\sigma^2 + \epsilon}} \normalsize * \gamma + \beta`
 
     where :math:`\bar{x}` is the mean and :math:`\sigma^2` is the variance.
+
+    The input should have shape :math:`[N, C, D1, ...]` where :math:`N` is the batch size, :math:`C` is the number of channels, and :math:`D1, ...` are the feature dimensions.
     """
 
     num_groups: int
@@ -68,30 +72,31 @@ def __init__(
         .. code-block:: python
            :linenos:
 
-            group_norm = tp.GroupNorm(2, 2)
+            group_norm = tp.GroupNorm(2, 4)
 
-            group_norm.weight = tp.iota(group_norm.weight.shape)
-            group_norm.bias = tp.iota(group_norm.bias.shape)
+            group_norm.weight = tp.ones(group_norm.weight.shape)
+            group_norm.bias = tp.zeros(group_norm.bias.shape)
 
-            input = tp.iota((1, 2, 2, 2), dim=1)
+            input = tp.iota((1, 4, 1, 1), dim=1)
             output = group_norm(input)
 
             np_out = cp.from_dlpack(output).get()  # doc: omit
-            assert np_out.shape == (1, 2, 2, 2)
+            assert np_out.shape == (1, 4, 1, 1)
 
             torch_tensor = torch.from_dlpack(input)  # doc: omit
             torch_gn = torch.nn.GroupNorm(2, 2).to(torch.device("cuda"))  # doc: omit
             torch_gn.weight.data = torch.from_dlpack(group_norm.weight)  # doc: omit
             torch_gn.bias.data = torch.from_dlpack(group_norm.bias)  # doc: omit
             torch_out = cp.from_dlpack(torch_gn(torch_tensor).detach()).get()  # doc: omit
-            assert np_out.shape == torch_out.shape  # doc: omit
-            assert np.allclose(np_out, torch_out)  # doc: omit
+            assert np_out.shape == torch_out.shape
+            assert np.allclose(np_out, torch_out)
        """
+
        super().__init__()
 
        if num_channels % num_groups:
            raise_error(
-                "Number of groups must divide number of channels evenly.",
+                "The number of groups must divide number of channels evenly.",
                details=[f"Got {num_groups} groups but {num_channels} channels."],
            )
 
@@ -112,19 +117,30 @@ def forward(self, x: "nvtripy.Tensor") -> "nvtripy.Tensor":
        Returns:
            A tensor of the same shape as the input.
        """
-        from nvtripy.frontend.ops.reduce.mean import mean
-        from nvtripy.frontend.ops.reduce.var import var
-        from nvtripy.frontend.ops.reshape import reshape
-        from nvtripy.frontend.ops.unary.rsqrt import rsqrt
-
-        input_shape = x.shape
 
-        x = reshape(x, (x.shape[0], self.num_groups, -1))
-        mean_val = mean(x, dim=-1, keepdim=True)
-        var_val = var(x, dim=-1, keepdim=True, correction=0) + self.eps
-        x = (x - mean_val) * rsqrt(var_val)
-        x = reshape(x, input_shape)
-
-        shape_to_broadcast = (1, self.num_channels) + (1,) * (x.rank - 2)
+        if x.rank < 3:
+            raise_error(
+                f"Input must have a rank of at least 3, but got input of rank: {x.rank}",
+                details=[
+                    "The input should have shape [N, C, D1, ...] where N is the batch size, C is the number of channels, and D1, ... are the feature dimensions."
+                ],
+            )
 
-        return reshape(self.weight, shape_to_broadcast) * x + reshape(self.bias, shape_to_broadcast)
+        from nvtripy.frontend.ops.reshape import reshape
+        from nvtripy.frontend.ops.split import split
+        from nvtripy.frontend.ops.stack import stack
+        from nvtripy.frontend.ops.flatten import flatten
+        from nvtripy.frontend.module.instancenorm import InstanceNorm
+        from nvtripy.frontend.ops.ones import ones
+        from nvtripy.frontend.ops.zeros import zeros
+
+        instance_norm = InstanceNorm(self.num_groups, dtype=self.dtype, eps=self.eps)
+        instance_norm.weight = ones((self.num_groups,), dtype=self.dtype)
+        instance_norm.bias = zeros((self.num_groups,), dtype=self.dtype)
+
+        # Use InstanceNorm as a WAR due to lack of TRT API compatibility for scale/bias with shape (num_channels, )
+        input_reshaped = stack(split(x, self.num_groups, 1), 1)
+        x = instance_norm(input_reshaped)
+        x = flatten(x, start_dim=1, end_dim=2)
+        broadcast_shape = (1, self.num_channels) + (1,) * (x.rank - 2)
+        return x * reshape(self.weight, broadcast_shape) + reshape(self.bias, broadcast_shape)
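As a side note, here is a hedged PyTorch analogue of the reshaping in the new forward() (torch.chunk stands in for nvtripy's split; the shapes are illustrative, not taken from the commit):

# Hedged PyTorch analogue of the grouping/flattening done in forward().
import torch

N, C, G, H, W = 1, 6, 3, 2, 2
x = torch.randn(N, C, H, W)

# stack(split(x, G, 1), 1): group the channels -> [N, G, C // G, H, W]
grouped = torch.stack(torch.chunk(x, G, dim=1), dim=1)
assert grouped.shape == (N, G, C // G, H, W)

# InstanceNorm over the G "channels" normalizes each group across (C // G, H, W).
normed = torch.nn.functional.instance_norm(grouped.reshape(N, G, -1)).reshape(grouped.shape)

# flatten(start_dim=1, end_dim=2): restore [N, C, H, W] before the per-channel affine transform.
restored = normed.flatten(start_dim=1, end_dim=2)
assert restored.shape == (N, C, H, W)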

tripy/nvtripy/frontend/module/instancenorm.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def instancenorm(
 
    if input_rank < 3:
        raise_error(
-            f"InstanceNorm input must have a rank of at least 3, but got input of rank: {input.rank}",
+            f"Input must have a rank of at least 3, but got input of rank: {input.rank}",
            details=[
                "Input is expected to have shape (N, C, D1, ...) where N is the batch size, C is the number of channels, and D1, ... are the spatial dimensions"
            ],

tripy/nvtripy/frontend/module/layernorm.py

Lines changed: 11 additions & 3 deletions
@@ -20,6 +20,7 @@
 
 from nvtripy import export, utils
 from nvtripy.common import datatype
+from nvtripy.common.exception import raise_error
 from nvtripy.frontend.module.module import Module
 from nvtripy.frontend.module.parameter import DefaultParameter
 from nvtripy.frontend.tensor import Tensor
@@ -44,10 +45,17 @@ def layernorm(
    D = len(normalized_shape)
    input_rank = input.rank
 
-    # Reshape weight and bias to match input rank for TensorRT normalization (expects [1, ...] + normalized_shape)
-    if input_rank > D:
-        from nvtripy.frontend.ops.reshape import reshape
+    if input_rank < 2:
+        raise_error(
+            f"Input must have a rank of at least 2, but got input of rank: {input.rank}",
+            details=[
+                "Input is expected to have shape (N, *) where N is the batch size, and * represents any number of channel dimension + spatial dimensions"
+            ],
+        )
+
+    from nvtripy.frontend.ops.reshape import reshape
 
+    if input_rank > D:
        broadcast_shape = (1,) * (input_rank - D) + normalized_shape
        weight = reshape(weight, broadcast_shape)
        bias = reshape(bias, broadcast_shape)

tripy/nvtripy/trace/ops/layernorm.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 @dataclass(repr=False)
 class LayerNorm(TraceOp):
    normalized_shape: Sequence[int]
-    eps: float = 1e-5
+    eps: float
 
    infer_rank = op_utils.InferRankPolicies.same_as_input()

tripy/tests/frontend/module/test_instancenorm.py

Lines changed: 4 additions & 6 deletions
@@ -24,28 +24,26 @@ def test_instancenorm_improper_rank(self):
        tp_instancenorm = tp.InstanceNorm(
            num_channels=3,
        )
-        tp_instancenorm.weight = tp.ones((3,))
-        tp_instancenorm.bias = tp.ones((3,))
+        tp_instancenorm.initialize_dummy_parameters()
 
        x = tp.ones((2, 3))
        with helper.raises(
            tp.TripyException,
-            match=f"InstanceNorm input must have a rank of at least 3, but got input of rank: {x.rank}",
+            match=f"Input must have a rank of at least 3, but got input of rank: {x.rank}",
        ):
            tp_instancenorm(x).eval()
 
    def test_instancenorm_improper_channels(self):
        tp_instancenorm = tp.InstanceNorm(
            num_channels=3,
        )
-        tp_instancenorm.weight = tp.ones((3,))
-        tp_instancenorm.bias = tp.ones((3,))
+        tp_instancenorm.initialize_dummy_parameters()
 
        # dynamic shape
        x = tp.ones((2, 6, 4, 4))
        with helper.raises(
            tp.TripyException,
-            match="MTRTException: failed to run pass pipeline",
+            match=r"'tensorrt.slice' op inferred type\(s\) 'tensor\<2x6x4x4xf32\>' are incompatible with return type\(s\) of operation 'tensor\<\?x3x\?x\?xf32\>'",
        ):
            tp_instancenorm(x).eval()

tripy/tests/frontend/module/test_layernorm.py

Lines changed: 14 additions & 2 deletions
@@ -23,11 +23,23 @@ def test_layernorm_improper_dimensions(self):
        tp_layernorm = tp.LayerNorm(
            normalized_shape=[2, 2],
        )
-        tp_layernorm.weight = tp.ones((2, 2))
-        tp_layernorm.bias = tp.ones((2, 2))
+        tp_layernorm.initialize_dummy_parameters()
 
        x = tp.ones((5, 5, 5))
        with helper.raises(
            tp.TripyException, match="The normalization scale is not broadcast-compatible with the input at dimension 1"
        ):
            tp_layernorm(x).eval()
+
+    def test_layernorm_improper_rank(self):
+        tp_layernorm = tp.LayerNorm(
+            normalized_shape=[2],
+        )
+        tp_layernorm.initialize_dummy_parameters()
+
+        x = tp.ones((2,))
+        with helper.raises(
+            tp.TripyException,
+            match=f"Input must have a rank of at least 2, but got input of rank: {x.rank}",
+        ):
+            tp_layernorm(x).eval()

tripy/tests/integration/test_groupnorm.py

Lines changed: 76 additions & 26 deletions
@@ -15,45 +15,95 @@
 # limitations under the License.
 #
 
+import nvtripy as tp
 import pytest
 import torch
 
-import nvtripy as tp
+from tests.helper import TORCH_DTYPES
+
+DTYPES = [tp.float16, tp.float32]
+
+dtype_params = pytest.mark.parametrize("dtype", DTYPES)
+input_shape_params = pytest.mark.parametrize("input_shape", [(1, 6, 2, 2)])
+num_groups_params = pytest.mark.parametrize("num_groups", [2, 3])
+num_channels_params = pytest.mark.parametrize("num_channels", [6])
+
 
-DTYPES = [(torch.float16, tp.float16), (torch.float32, tp.float32)]
+@pytest.fixture
+def setup(dtype, input_shape, num_groups, num_channels):
+    eps = 0.0
+    torch_dtype = TORCH_DTYPES[dtype]
+    groupnorm = torch.nn.GroupNorm(
+        num_groups=num_groups,
+        num_channels=num_channels,
+        eps=eps,
+        dtype=torch_dtype,
+        device="cuda",
+    )
+    tp_groupnorm = tp.GroupNorm(
+        num_groups=num_groups,
+        num_channels=num_channels,
+        eps=eps,
+        dtype=dtype,
+    )
+
+    input = torch.empty(*input_shape, dtype=torch_dtype, device="cuda").uniform_(0, 10)
+    tp_input = tp.Tensor(input, dtype=dtype)
+    yield groupnorm, tp_groupnorm, tp_input
 
 
 class TestGroupNorm:
 
-    @pytest.mark.parametrize("torch_dtype, tp_dtype", DTYPES)
-    @pytest.mark.parametrize("input_shape", [(1, 10, 2)])
-    @pytest.mark.parametrize("num_groups", [2, 5])
-    @pytest.mark.parametrize("num_channels", [10])
-    def test_groupnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, num_groups, num_channels, eager_or_compiled):
-        eps = 1e-5
-        groupnorm = torch.nn.GroupNorm(
-            num_groups=num_groups,
-            num_channels=num_channels,
-            eps=eps,
-            dtype=torch_dtype,
-            device="cuda",
-        )
-        tp_groupnorm = tp.GroupNorm(
-            num_groups=num_groups,
-            num_channels=num_channels,
-            eps=eps,
-            dtype=tp_dtype,
-        )
+    @dtype_params
+    @input_shape_params
+    @num_groups_params
+    @num_channels_params
+    def test_groupnorm_normalization(self, input_shape, num_groups, setup, eager_or_compiled):
+        """Test that normalized output has approximately mean=0, std=1"""
+        _, tp_groupnorm, tp_input = setup
+        dtype = tp_groupnorm.weight.dtype
+
+        tp_groupnorm.weight = tp.ones(tp_groupnorm.weight.shape, dtype=dtype)
+        tp_groupnorm.bias = tp.zeros(tp_groupnorm.bias.shape, dtype=dtype)
+
+        output = eager_or_compiled(tp_groupnorm, tp_input)
+        output_torch = torch.from_dlpack(output)
+
+        N, C = input_shape[0], input_shape[1]
+        spatial_size = torch.prod(torch.tensor(input_shape[2:]))
+        reshaped = output_torch.view(N, num_groups, C // num_groups, spatial_size)
+
+        means = reshaped.mean(dim=(2, 3))
+        vars = reshaped.var(dim=(2, 3), unbiased=False)
+
+        mean_abs = means.abs().mean().item()
+        var_diff = (vars - 1).abs().mean().item()
+
+        assert mean_abs < 2e-4, f"Group mean should be close to 0, got {mean_abs}"
+        assert var_diff < 1e-3, f"Group variance should be close to 1, got {var_diff}"
+
+    @dtype_params
+    @input_shape_params
+    @num_groups_params
+    @num_channels_params
+    def test_groupnorm_affine_transformation(self, setup, eager_or_compiled):
+        """Test the GroupNorm with affine transformation included"""
+        groupnorm, tp_groupnorm, tp_input = setup
+        dtype = tp_groupnorm.weight.dtype
+        input = torch.from_dlpack(tp_input)
+
+        torch.nn.init.uniform_(groupnorm.weight, 0.2, 2)
+        torch.nn.init.uniform_(groupnorm.bias, 0.2, 2)
 
        tp_groupnorm.weight = tp.Tensor(groupnorm.weight.to("cpu").detach())
        tp_groupnorm.bias = tp.Tensor(groupnorm.bias.to("cpu").detach())
 
-        input = torch.arange(torch.prod(torch.Tensor(input_shape))).reshape(input_shape).to(torch_dtype).to("cuda")
-        tp_input = tp.Tensor(input, dtype=tp_dtype)
-
        output = eager_or_compiled(tp_groupnorm, tp_input)
        with torch.no_grad():
            expected = groupnorm(input)
 
-        rtol_ = 2e-6 if tp_dtype == tp.float32 else 1e-3
-        assert torch.allclose(torch.from_dlpack(output), expected, rtol=rtol_)
+        atol_ = 1e-6 if dtype == tp.float32 else 5e-3
+
+        torch_output = torch.from_dlpack(output)
+        assert torch_output.shape == expected.shape
+        assert torch.allclose(torch_output, expected, atol=atol_)
