Commit 24a2deb

Adds support for FP16 in memory encoder
1 parent dd14966 commit 24a2deb

5 files changed: +19 -12 lines


tripy/examples/segment-anything-model-v2/configs/sam2_hiera_l.yaml

Lines changed: 3 additions & 0 deletions
@@ -62,6 +62,7 @@ model:
   memory_encoder:
       _target_: sam2.modeling.memory_encoder.MemoryEncoder
       out_dim: 64
+      dtype: float16
       position_encoding:
         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 64
@@ -73,6 +74,7 @@ model:
         kernel_size: 3
         stride: 2
         padding: 1
+        dtype: float16
       fuser:
         _target_: sam2.modeling.memory_encoder.Fuser
         layer:
@@ -82,6 +84,7 @@ model:
           padding: 3
           layer_scale_init_value: 1e-6
           use_dwconv: True  # depth-wise convs
+          dtype: float16
         num_layers: 2
 
   num_maskmem: 7

tripy/examples/segment-anything-model-v2/configs/sam2_hiera_s.yaml

Lines changed: 3 additions & 0 deletions
@@ -63,6 +63,7 @@ model:
   memory_encoder:
       _target_: sam2.modeling.memory_encoder.MemoryEncoder
       out_dim: 64
+      dtype: float16
       position_encoding:
         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 64
@@ -74,6 +75,7 @@ model:
         kernel_size: 3
         stride: 2
         padding: 1
+        dtype: float16
       fuser:
         _target_: sam2.modeling.memory_encoder.Fuser
         layer:
@@ -83,6 +85,7 @@ model:
           padding: 3
           layer_scale_init_value: 1e-6
           use_dwconv: True  # depth-wise convs
+          dtype: float16
         num_layers: 2
 
   num_maskmem: 7

tripy/examples/segment-anything-model-v2/configs/sam2_hiera_t.yaml

Lines changed: 3 additions & 0 deletions
@@ -63,6 +63,7 @@ model:
   memory_encoder:
       _target_: sam2.modeling.memory_encoder.MemoryEncoder
       out_dim: 64
+      dtype: float16
       position_encoding:
         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 64
@@ -74,6 +75,7 @@ model:
         kernel_size: 3
         stride: 2
         padding: 1
+        dtype: float16
       fuser:
         _target_: sam2.modeling.memory_encoder.Fuser
         layer:
@@ -83,6 +85,7 @@ model:
           padding: 3
           layer_scale_init_value: 1e-6
           use_dwconv: True  # depth-wise convs
+          dtype: float16
         num_layers: 2
 
   num_maskmem: 7
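
All three Hiera configs gain the same `dtype: float16` field. Under Hydra, every key alongside `_target_` is forwarded to the target's constructor by `instantiate`, which is presumably how the new field reaches `MemoryEncoder`, `MaskDownSampler`, and `CXBlock`. A minimal sketch of that mechanism with a hypothetical stand-in class (the real constructors take more arguments than shown here):

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

class MemoryEncoder:  # hypothetical stand-in, not the real sam2 class
    def __init__(self, out_dim: int, dtype: str = "float32"):
        self.out_dim = out_dim
        self.dtype = dtype

cfg = OmegaConf.create({
    "_target_": "__main__.MemoryEncoder",
    "out_dim": 64,
    "dtype": "float16",
})

# Hydra imports the _target_ class and passes the remaining keys as kwargs.
encoder = instantiate(cfg)
print(encoder.dtype)  # float16
```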

tripy/examples/segment-anything-model-v2/sam2/build_sam.py

Lines changed: 5 additions & 10 deletions
@@ -23,8 +23,6 @@
 # limitations under the License.
 
 
-import logging
-
 import torch
 from hydra import compose
 from hydra.utils import instantiate
@@ -184,10 +182,10 @@ def get_component_configs(model, cfg):
         "memory_encoder": {
             "enabled": True,
             "model": model.memory_encoder,
-            "dtype": "float32",  # TODO add fp16 to yaml
+            "dtype": model_precision,
             "compile_args": [
-                tp.InputInfo((batchsize, 256, 64, 64), tp.float32),
-                tp.InputInfo((batchsize, num_obj, 1024, 1024), tp.float32),
+                tp.InputInfo((batchsize, 256, 64, 64), getattr(tp, model_precision)),
+                tp.InputInfo((batchsize, num_obj, 1024, 1024), getattr(tp, model_precision)),
                 True,
             ],
             "skip_dtype_convert": ["ln", "norm"]
@@ -227,10 +225,7 @@ def get_component_configs(model, cfg):
             "compile_args": [
                 tp.InputInfo(
                     (batchsize, 3, 1024, 1024),
-                    dtype=getattr(
-                        tp,
-                        model_precision,
-                    ),
+                    dtype=getattr(tp, model_precision),
                 ),
             ],
             "skip_dtype_convert": ["norm"],
@@ -285,7 +280,7 @@ def get_or_compile_component(self, comp_name: str, comp_info: Dict[str, Any]) ->
         else:
             print(f"Compiling {comp_name}...")
             start = time.time()
-            compiled_model = tp.compile(comp_info["model"], args=comp_info["compile_args"])
+            compiled_model = tp.compile(comp_info["model"], optimization_level=5, args=comp_info["compile_args"])
             print(f"Compilation took {time.time() - start:.2f}s")
             compiled_model.save(executable_file)
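
The compile call now pins `optimization_level=5`. A hedged sketch of the same call on a toy function, assuming `tp.compile` accepts plain functions as it does modules (the function and file name below are placeholders, not from the commit):

```python
import time
import tripy as tp

def double(x):
    return x * 2.0  # toy stand-in for a model component

start = time.time()
compiled = tp.compile(
    double,
    optimization_level=5,  # level pinned by this commit
    args=[tp.InputInfo((1, 256, 64, 64), tp.float16)],
)
print(f"Compilation took {time.time() - start:.2f}s")
compiled.save("double.tpymodel")  # hypothetical cache file, mirroring executable_file
```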

tripy/examples/segment-anything-model-v2/sam2/modeling/sam2_base.py

Lines changed: 5 additions & 2 deletions
@@ -710,7 +710,7 @@ def _encode_new_memory(
         # scale the raw mask logits with a temperature before applying sigmoid
         binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
         if binarize and not self.training:
-            mask_for_mem = (pred_masks_high_res > 0).float()
+            mask_for_mem = pred_masks_high_res > 0
         else:
             # apply sigmoid on the raw mask logits to turn them into range (0, 1)
             mask_for_mem = torch.sigmoid(pred_masks_high_res)
@@ -720,8 +720,11 @@ def _encode_new_memory(
         if self.sigmoid_bias_for_mem_enc != 0.0:
             mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
 
+        if self.memory_encoder.input_infos["masks"].dtype == tp.float16:
+            mask_for_mem = mask_for_mem.half()
+
         maskmem_features, maskmem_pos_enc = self.memory_encoder(
-            tp.Tensor(pix_feat.float().contiguous()), tp.Tensor(mask_for_mem.contiguous())
+            tp.Tensor(pix_feat.contiguous()), tp.Tensor(mask_for_mem.contiguous())
         )  # sigmoid already applied
         maskmem_features = torch.from_dlpack(maskmem_features)
         maskmem_pos_enc = [torch.from_dlpack(maskmem_pos_enc)]
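
The new guard reads the dtype the compiled memory encoder declared for its `masks` input and casts the torch-side tensor to match before it crosses into Tripy; this matters because the binarize path above now yields a bool tensor rather than float32. A standalone sketch of the cast-to-expected-dtype pattern (the helper name is ours, not from the commit):

```python
import torch
import tripy as tp

def match_executable_dtype(t: torch.Tensor, expected) -> torch.Tensor:
    """Cast a torch tensor (e.g. a bool mask) to the dtype the compiled
    executable was built for, mirroring the input_infos check above."""
    if expected == tp.float16 and t.dtype != torch.float16:
        return t.half()
    return t

mask = torch.rand(1, 1, 1024, 1024) > 0           # bool mask, as in the binarize path
mask = match_executable_dtype(mask, tp.float16)   # -> float16 zeros/ones
wrapped = tp.Tensor(mask.contiguous())            # hand-off, as in the call above
```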
