Commit b634fd8

Updates SAMv2 to name dimensions in order to trigger MHA fusions
1 parent 3c757f6 commit b634fd8
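
The idea behind the change: when the same dynamic dimension is described by one shared, named object rather than an anonymous (min, opt, max) tuple, the compiler can see that the attention inputs agree on that dimension at runtime, which lets it fuse the attention ops into a single multi-head attention (MHA) kernel. A minimal sketch of the pattern, reusing the tp.NamedDimension and tp.InputInfo calls from the diff below (the float16 dtype and the specific shapes here are illustrative, not taken from the commit):

import nvtripy as tp

# One NamedDimension object, reused across every input that shares the
# dimension: a name plus (min, opt, max) profile values.
batch = tp.NamedDimension("batch", 1, 2, 8)

# Because q/k/v all reference the same `batch` object, the compiler knows
# these dimensions are equal. With anonymous ranges like (1, 2, 8) it
# cannot assume that, which can block the MHA fusion.
q = tp.InputInfo((4096, batch, 256), tp.float16)
k = tp.InputInfo((4096, batch, 256), tp.float16)
v = tp.InputInfo((4096, batch, 256), tp.float16)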

File tree

3 files changed: +29 −36 lines changed

tripy/examples/segment-anything-model-v2/sam2/build_sam.py

Lines changed: 20 additions & 18 deletions
@@ -54,8 +54,10 @@ def get_component_configs(model, cfg):
     """
     Get configurations for different components, including both compilation and weight loading info.
     """
-    batchsize = (1, 2, 4)
-    num_obj = (1, 2, 4)
+    batch = tp.NamedDimension("batch", 1, 2, 4)
+    num_obj = tp.NamedDimension("num_obj", 1, 2, 4)
+    seq_len = tp.NamedDimension("seq_len", 4100, 16400, 28736)
+    mem_attention_batch = tp.NamedDimension("mem_attention_batch", 1, 2, 8)
     model_precision = getattr(cfg["model"], "model_precision", "float32")
     return {
         "memory_attention": {
@@ -64,19 +66,19 @@ def get_component_configs(model, cfg):
             "dtype": model_precision,
             "compile_args": [
                 tp.InputInfo(
-                    (4096, (1, 2, 8), 256),
+                    (4096, mem_attention_batch, 256),
                     getattr(tp, model_precision),
                 ),
                 tp.InputInfo(
-                    ((4100, 16400, 28736), (1, 2, 8), 64),
+                    (seq_len, mem_attention_batch, 64),
                     getattr(tp, model_precision),
                 ),
                 tp.InputInfo(
-                    (4096, (1, 2, 8), 256),
+                    (4096, mem_attention_batch, 256),
                     getattr(tp, model_precision),
                 ),
                 tp.InputInfo(
-                    ((4100, 16400, 28736), (1, 2, 8), 64),
+                    (seq_len, mem_attention_batch, 64),
                     getattr(tp, model_precision),
                 ),
                 # TODO (#594): Remove this hack once we are able to pass in DimensionSizes directly:
@@ -124,29 +126,29 @@ def get_component_configs(model, cfg):
             "dtype": model_precision,
             "compile_args": [
                 tp.InputInfo(
-                    (batchsize, 256, 64, 64),
+                    (batch, 256, 64, 64),
                     dtype=getattr(tp, model_precision),
                 ),  # image_embeddings
                 tp.InputInfo(
                     (1, 256, 64, 64),
                     dtype=getattr(tp, model_precision),
                 ),  # image_pe
                 tp.InputInfo(
-                    (batchsize, (2, 4, 6), 256),
+                    (batch, (2, 4, 6), 256),
                     dtype=getattr(tp, model_precision),
                 ),  # sparse_prompt_embeddings
                 tp.InputInfo(
-                    (batchsize, 256, 64, 64),
+                    (batch, 256, 64, 64),
                     dtype=getattr(tp, model_precision),
                 ),  # dense_prompt_embeddings
                 True,  # multimask_output
                 False,  # repeat_image
                 tp.InputInfo(
-                    (batchsize, 32, 256, 256),
+                    (batch, 32, 256, 256),
                     dtype=getattr(tp, model_precision),
                 ),  # high_res_features_1
                 tp.InputInfo(
-                    (batchsize, 64, 128, 128),
+                    (batch, 64, 128, 128),
                     dtype=getattr(tp, model_precision),
                 ),  # high_res_features_2
             ],
@@ -159,7 +161,7 @@ def get_component_configs(model, cfg):
             "dtype": model_precision,
             "compile_args": [
                 tp.InputInfo(
-                    (batchsize, 256, 256, 256),
+                    (batch, 256, 256, 256),
                     dtype=getattr(tp, model_precision),
                 )
             ],
@@ -172,7 +174,7 @@ def get_component_configs(model, cfg):
             "dtype": model_precision,
             "compile_args": [
                 tp.InputInfo(
-                    (batchsize, 256, 128, 128),
+                    (batch, 256, 128, 128),
                     dtype=getattr(tp, model_precision),
                 )
             ],
@@ -184,8 +186,8 @@ def get_component_configs(model, cfg):
             "model": model.memory_encoder,
             "dtype": model_precision,
             "compile_args": [
-                tp.InputInfo((batchsize, 256, 64, 64), getattr(tp, model_precision)),
-                tp.InputInfo((batchsize, num_obj, 1024, 1024), getattr(tp, model_precision)),
+                tp.InputInfo((batch, 256, 64, 64), getattr(tp, model_precision)),
+                tp.InputInfo((batch, num_obj, 1024, 1024), getattr(tp, model_precision)),
                 True,
             ],
             "skip_dtype_convert": ["ln", "norm"]
@@ -196,8 +198,8 @@ def get_component_configs(model, cfg):
             "model": model.sam_prompt_encoder,
             "dtype": "float32",
             "compile_args": [
-                tp.InputInfo((batchsize, num_obj, 2), dtype=tp.float32),
-                tp.InputInfo((batchsize, num_obj), dtype=tp.int32),
+                tp.InputInfo((batch, num_obj, 2), dtype=tp.float32),
+                tp.InputInfo((batch, num_obj), dtype=tp.int32),
                 None,
                 None,
             ],
@@ -224,7 +226,7 @@ def get_component_configs(model, cfg):
             "dtype": model_precision,
             "compile_args": [
                 tp.InputInfo(
-                    (batchsize, 3, 1024, 1024),
+                    (batch, 3, 1024, 1024),
                     dtype=getattr(tp, model_precision),
                 ),
             ],
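
These compile_args are what build_sam.py later feeds to the compiler for each component. A rough sketch of that consumption, assuming nvtripy's tp.compile(func, args=...) signature and that non-InputInfo entries (True, None, ...) are accepted as baked-in constants; the loop below is illustrative, not the file's actual code:

# Illustrative only: compile each component with its recorded InputInfos.
# Reusing NamedDimension objects such as `batch` across components keeps
# the dynamic-dimension equality visible to every compiled function.
compiled = {}
for name, config in get_component_configs(model, cfg).items():
    compiled[name] = tp.compile(config["model"], args=config["compile_args"])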

tripy/examples/segment-anything-model-v2/sam2/modeling/memory_attention.py

Lines changed: 3 additions & 8 deletions
@@ -22,16 +22,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, List
-
-from sam2.modeling.sam.transformer import RoPEAttention
-from sam2.modeling.sam2_utils import get_activation_fn
+from typing import List, Optional
 
 import nvtripy as tp
+from sam2.modeling.sam2_utils import get_activation_fn
+from sam2.modeling.sam.transformer import RoPEAttention
 
 
 class MemoryAttentionLayer(tp.Module):
-
     def __init__(
         self,
         activation: str,
@@ -77,8 +75,6 @@ def _forward_sa(self, tgt, query_pos):
         return tgt
 
     def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
-        kwds = {}
-
         # Cross-Attention
         tgt2 = tp.cast(self.norm2(tp.cast(tgt, self.norm2.dtype)), self.dtype)
 
@@ -112,7 +108,6 @@ def forward(
 
 
 class MemoryAttention(tp.Module):
-
     def __init__(
         self,
         d_model: int,

tripy/examples/segment-anything-model-v2/video_demo.py

Lines changed: 6 additions & 10 deletions
@@ -42,19 +42,15 @@
 
 
 def compute_mask_properties(mask):
-    # Ensure we have a boolean array
-    test_mask = np.asarray(mask, dtype=bool)
-
-    # Calculate basic stats
-    volume = np.sum(test_mask)
+    volume = torch.sum(mask)
 
     # Calculate centroid (center of mass)
     if volume > 0:
-        indices = np.where(test_mask)
-        centroid = tuple(float(np.mean(idx)) for idx in indices)
+        indices = torch.where(mask)
+        centroid = tuple((torch.sum(idx) / float(len(idx))).item() for idx in indices)
     else:
         centroid = None
-    return volume, centroid
+    return volume.item(), centroid
 
 
 def main(video_dir: str, save_path: Optional[str] = None):
@@ -161,7 +157,7 @@ def make_tensors_contiguous(d):
     video_segments = {}  # video_segments contains the per-frame segmentation results
     for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
         video_segments[out_frame_idx] = {
-            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)
+            out_obj_id: (out_mask_logits[i] > 0.0) for i, out_obj_id in enumerate(out_obj_ids)
         }
     end = time.perf_counter()
     print(f"Video segmentation took {(end - start)}s")
@@ -175,7 +171,7 @@ def make_tensors_contiguous(d):
         plt.imshow(Image.open(os.path.join(video_dir, frame_names[out_frame_idx])))
         for out_obj_id, out_mask in video_segments[out_frame_idx].items():
             vol, centre = compute_mask_properties(out_mask)
-            show_mask(out_mask, plt.gca(), obj_id=out_obj_id)
+            show_mask(out_mask.cpu().numpy(), plt.gca(), obj_id=out_obj_id)
         plt.savefig(os.path.join(save_path, f"video_final_mask_{out_frame_idx}.png"))
 
     # Print the properties of the mask generated for the final image for integration testing.
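
As a quick sanity check of the rewritten torch-based helper, here is a toy invocation (hypothetical values; a CPU tensor is used for simplicity, whereas the demo keeps masks on the GPU until plotting):

import torch

# 4x4 boolean mask with a 2x2 "object" in the top-left corner.
mask = torch.zeros((4, 4), dtype=torch.bool)
mask[0:2, 0:2] = True

volume, centroid = compute_mask_properties(mask)
print(volume)    # 4
print(centroid)  # (0.5, 0.5): mean row index, mean column index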
