Removes unnecessary casting around normalization layers

pranavm-nvidia · pranavm-nvidia · commit 6c3162a0a62b · 2025-06-26T16:31:15.000-07:00
diff --git a/tripy/examples/segment-anything-model-v2/sam2/build_sam.py b/tripy/examples/segment-anything-model-v2/sam2/build_sam.py
@@ -84,7 +84,7 @@ def get_component_configs(model, cfg):
                 # TODO (#594): Remove this hack once we are able to pass in DimensionSizes directly:
                 tp.InputInfo(((4, 16, 64),), tp.int32),
             ],
-            "skip_dtype_convert": ["ln", "norm"],
+            "skip_dtype_convert": [],
         },
         "sam_mask_decoder_false": {
             "enabled": True,
@@ -118,7 +118,7 @@ def get_component_configs(model, cfg):
                     dtype=getattr(tp, model_precision),
                 ),  # high_res_features_2
             ],
-            "skip_dtype_convert": ["ln", "norm", "output_upscaling.1"],
+            "skip_dtype_convert": [],
         },
         "sam_mask_decoder_true": {
             "enabled": True,
@@ -152,7 +152,7 @@ def get_component_configs(model, cfg):
                     dtype=getattr(tp, model_precision),
                 ),  # high_res_features_2
             ],
-            "skip_dtype_convert": ["ln", "norm", "output_upscaling.1"],
+            "skip_dtype_convert": [],
             "skip_load_state_dict": True,
         },
         "sam_mask_decoder.conv_s0": {
@@ -190,8 +190,7 @@ def get_component_configs(model, cfg):
                 tp.InputInfo((batch, num_obj, 1024, 1024), getattr(tp, model_precision)),
                 True,
             ],
-            "skip_dtype_convert": ["ln", "norm"]
-            + [f"encoder.{i}.{param}" for i in range(1, 40, 3) for param in ("weight", "bias")],
+            "skip_dtype_convert": [],
         },
         "sam_prompt_encoder": {
             "enabled": True,
@@ -230,7 +229,7 @@ def get_component_configs(model, cfg):
                     dtype=getattr(tp, model_precision),
                 ),
             ],
-            "skip_dtype_convert": ["norm"],
+            "skip_dtype_convert": [],
             "special_key_loading": lambda key: (
                 # If it's a neck.convs key that contains 'conv.'
                 # neck.convs.0.conv.weight -> neck.convs.0.weight
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/backbones/hieradet.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/backbones/hieradet.py
@@ -113,7 +113,7 @@ def __init__(
         super().__init__()
 
         if isinstance(norm_layer, str):
-            norm_layer = partial(getattr(tp, norm_layer), eps=1e-6)
+            norm_layer = partial(getattr(tp, norm_layer), eps=1e-6, dtype=dtype)
 
         self.dim = dim
         self.dim_out = dim_out
@@ -149,15 +149,8 @@ def __init__(
             self.proj = tp.Linear(dim, dim_out, dtype=dtype)
 
     def forward(self, x):
-
-        def call_norm(x, norm):
-            x_dtype = x.dtype
-            x = tp.cast(x, tp.float32)
-            x = norm(x)
-            return tp.cast(x, x_dtype)
-
         shortcut = x  # B, H, W, C
-        x = call_norm(x, self.norm1)
+        x = self.norm1(x)
 
         # Skip connection
         if self.dim != self.dim_out:
@@ -189,7 +182,7 @@ def mod_int(x, y):
 
         x = shortcut + x
         # MLP
-        t = call_norm(x, self.norm2)
+        t = self.norm2(x)
         x = x + self.mlp(t)
         return x
 
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/memory_attention.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/memory_attention.py
@@ -54,9 +54,9 @@ def __init__(
         self.linear1 = tp.Linear(d_model, dim_feedforward, dtype=self.dtype)
         self.linear2 = tp.Linear(dim_feedforward, d_model, dtype=self.dtype)
 
-        self.norm1 = tp.LayerNorm(d_model)
-        self.norm2 = tp.LayerNorm(d_model)
-        self.norm3 = tp.LayerNorm(d_model)
+        self.norm1 = tp.LayerNorm(d_model, dtype=self.dtype)
+        self.norm2 = tp.LayerNorm(d_model, dtype=self.dtype)
+        self.norm3 = tp.LayerNorm(d_model, dtype=self.dtype)
 
         self.activation_str = activation
         self.activation = get_activation_fn(activation)
@@ -68,15 +68,15 @@ def __init__(
 
     def _forward_sa(self, tgt, query_pos):
         # Self-Attention
-        tgt2 = tp.cast(self.norm1(tp.cast(tgt, self.norm1.dtype)), self.dtype)
+        tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
         tgt2 = self.self_attn(q, k, v=tgt2, num_k_exclude_rope=0)
         tgt = tgt + tgt2
         return tgt
 
     def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
         # Cross-Attention
-        tgt2 = tp.cast(self.norm2(tp.cast(tgt, self.norm2.dtype)), self.dtype)
+        tgt2 = self.norm2(tgt)
 
         tgt2 = self.cross_attn_image(
             q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
@@ -100,7 +100,7 @@ def forward(
         tgt = self._forward_sa(tgt, query_pos)
         tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
         # MLP
-        tgt2 = tp.cast(self.norm3(tp.cast(tgt, self.norm3.dtype)), self.dtype)
+        tgt2 = self.norm3(tgt)
 
         tgt2 = self.linear2(self.activation(self.linear1(tgt2)))
         tgt = tgt + tgt2
@@ -137,12 +137,13 @@ def __init__(
         dtype="float32",
     ):
         super().__init__()
+        self.dtype = getattr(tp, dtype)
+
         self.d_model = d_model
         self.num_layers = num_layers
-        self.norm = tp.LayerNorm(d_model)
+        self.norm = tp.LayerNorm(d_model, self.dtype)
         self.pos_enc_at_input = pos_enc_at_input
         self.batch_first = batch_first
-        self.dtype = getattr(tp, dtype)
         self.layers = []
         for _ in range(num_layers):
             self_attn = RoPEAttention(
@@ -215,7 +216,7 @@ def forward(
                 **kwds,
             )
 
-        normed_output = tp.cast(self.norm(tp.cast(output, self.norm.dtype)), self.dtype)
+        normed_output = self.norm(output)
 
         if self.batch_first:
             # Convert back to seq first
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/memory_encoder.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/memory_encoder.py
@@ -71,7 +71,7 @@ def __init__(
                     dtype=self.dtype,
                 )
             )
-            self.encoder.append(LayerNorm2d(mask_out_chans))
+            self.encoder.append(LayerNorm2d(mask_out_chans, dtype=self.dtype))
             self.encoder.append(activation)
             mask_in_chans = mask_out_chans
 
@@ -108,7 +108,7 @@ def __init__(
             groups=dim if use_dwconv else 1,
             dtype=self.dtype,
         )  # depthwise conv
-        self.norm = LayerNorm2d(dim, eps=1e-6)
+        self.norm = LayerNorm2d(dim, eps=1e-6, dtype=self.dtype)
         self.pwconv1 = tp.Linear(dim, 4 * dim, dtype=self.dtype)  # pointwise/1x1 convs, implemented with linear layers
         self.act = tp.gelu
         self.pwconv2 = tp.Linear(4 * dim, dim, dtype=self.dtype)
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/sam/mask_decoder.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/sam/mask_decoder.py
@@ -99,7 +99,7 @@ def __init__(
                 stride=(2, 2),
                 dtype=dtype,
             ),
-            LayerNorm2d(transformer_dim // 4),
+            LayerNorm2d(transformer_dim // 4, dtype=dtype),
             Dummy(),  # Accounts for Dropout layer, needed for weight loading
             tp.ConvTranspose(
                 transformer_dim // 4,
@@ -289,13 +289,13 @@ def predict_masks(
 
         if not self.use_high_res_features:
             dc1, ln1, _, dc2, _ = self.output_upscaling
-            post_ln1 = tp.cast(ln1(tp.cast(dc1(src), tp.float32)), src.dtype)
+            post_ln1 = ln1(dc1(src))
             upscaled_embedding = act2(dc2(act1(post_ln1)))
             # upscaled_embedding = act2(dc2(act1(ln1(dc1(src)))))
         else:
             dc1, ln1, _, dc2, _ = self.output_upscaling
             feat_s0, feat_s1 = high_res_features_1, high_res_features_2
-            post_ln1 = tp.cast(ln1(tp.cast(dc1(src) + feat_s1, tp.float32)), src.dtype)
+            post_ln1 = ln1(dc1(src) + feat_s1)
             upscaled_embedding = act1(post_ln1)
             # upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
             upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/sam/transformer.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/sam/transformer.py
@@ -81,7 +81,7 @@ def __init__(
             downsample_rate=attention_downsample_rate,
             dtype=dtype,
         )
-        self.norm_final_attn = tp.LayerNorm(embedding_dim)
+        self.norm_final_attn = tp.LayerNorm(embedding_dim, dtype=dtype)
 
     def forward(
         self,
@@ -134,10 +134,7 @@ def forward_impl(
         k = keys + image_pe
         attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
         queries = queries + attn_out
-        queries = tp.cast(
-            self.norm_final_attn(tp.cast(queries, self.norm_final_attn.dtype)),
-            queries.dtype,
-        )
+        queries = self.norm_final_attn(queries)
         # queries = self.norm_final_attn(queries)
 
         return queries, keys
@@ -170,15 +167,15 @@ def __init__(
         """
         super().__init__()
         self.self_attn = Attention(embedding_dim, num_heads, dtype=dtype)
-        self.norm1 = tp.LayerNorm(embedding_dim)
+        self.norm1 = tp.LayerNorm(embedding_dim, dtype=dtype)
 
         self.cross_attn_token_to_image = Attention(
             embedding_dim,
             num_heads,
             downsample_rate=attention_downsample_rate,
             dtype=dtype,
         )
-        self.norm2 = tp.LayerNorm(embedding_dim)
+        self.norm2 = tp.LayerNorm(embedding_dim, dtype=dtype)
 
         self.mlp = MLP(
             embedding_dim,
@@ -188,9 +185,9 @@ def __init__(
             activation=activation,
             dtype=dtype,
         )
-        self.norm3 = tp.LayerNorm(embedding_dim)
+        self.norm3 = tp.LayerNorm(embedding_dim, dtype=dtype)
 
-        self.norm4 = tp.LayerNorm(embedding_dim)
+        self.norm4 = tp.LayerNorm(embedding_dim, dtype=dtype)
         self.cross_attn_image_to_token = Attention(
             embedding_dim,
             num_heads,
@@ -212,29 +209,29 @@ def forward_impl(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe:
             attn_out = self.self_attn(q=q, k=q, v=queries)
             queries = queries + attn_out
 
-        queries = tp.cast(self.norm1(tp.cast(queries, self.norm1.dtype)), queries.dtype)
+        queries = self.norm1(queries)
 
         # Cross attention block, tokens attending to image embedding
         q = queries + query_pe
         k = keys + key_pe
         attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
         queries = queries + attn_out
 
-        queries = tp.cast(self.norm2(tp.cast(queries, self.norm2.dtype)), queries.dtype)
+        queries = self.norm2(queries)
         # queries = self.norm2(queries)
 
         # MLP block
         mlp_out = self.mlp(queries)
         queries = queries + mlp_out
-        queries = tp.cast(self.norm3(tp.cast(queries, self.norm3.dtype)), queries.dtype)
+        queries = self.norm3(queries)
         # queries = self.norm3(queries)
 
         # Cross attention block, image embedding attending to tokens
         q = queries + query_pe
         k = keys + key_pe
         attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
         keys = keys + attn_out
-        keys = tp.cast(self.norm4(tp.cast(keys, self.norm4.dtype)), keys.dtype)
+        keys = self.norm4(keys)
         # keys = self.norm4(keys)
 
         return queries, keys
diff --git a/tripy/examples/segment-anything-model-v2/sam2/modeling/sam2_utils.py b/tripy/examples/segment-anything-model-v2/sam2/modeling/sam2_utils.py
@@ -181,16 +181,12 @@ def forward(self, x):
 
 
 class LayerNorm2d(tp.LayerNorm):
-    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
-        super().__init__(num_channels, dtype=tp.float32, eps=eps)
+    def __init__(self, num_channels: int, eps: float = 1e-6, dtype: tp.dtype = tp.float32) -> None:
+        super().__init__(num_channels, dtype=dtype, eps=eps)
 
     def forward(self, x: tp.Tensor) -> tp.Tensor:
         x = tp.permute(x, (0, 2, 3, 1))
-        # LayerNorm is always done in float32:
-        original_dtype = x.dtype
-        x = tp.cast(x, tp.float32)
         x = super().forward(x)
-        x = tp.cast(x, original_dtype)
         return tp.permute(x, (0, 3, 1, 2))