Commit 2cd2104

Resolved merge conflicts, linter issues, added pytest for packed fp6 dims
1 parent 4a3a54b commit 2cd2104

File tree

5 files changed: +27 -11 lines changed


test/prototype/mx_formats/test_mx_linear.py

Lines changed: 6 additions & 7 deletions
@@ -59,7 +59,7 @@ def test_linear_eager(elem_dtype, bias, input_shape):
         nn.Linear(8, 8, bias=bias, device="cuda"),
     )
     m_mx = copy.deepcopy(m)
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_linear(m_mx, *elem_dtype, block_size=block_size)

     x_ref = torch.randn(*input_shape, device="cuda").requires_grad_()
@@ -94,10 +94,10 @@ def test_activation_checkpointing():
     elem_dtype = torch.float8_e4m3fn

     m = nn.Sequential(
-        nn.Linear(4, 6, bias=True, device="cuda"),
-        nn.Linear(6, 6, bias=True, device="cuda"),
+        nn.Linear(4, 8, bias=True, device="cuda"),
+        nn.Linear(8, 8, bias=True, device="cuda"),
     )
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_linear(m, elem_dtype, block_size=block_size)

     x = torch.randn(*input_shape, device="cuda").requires_grad_()
@@ -133,7 +133,7 @@ def test_linear_compile(elem_dtype, bias, use_autocast):
     m_mx = nn.Sequential(
         nn.Linear(K, N, bias=bias, device="cuda"),
     )
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_linear(m_mx, elem_dtype, block_size=block_size)
     m_mx_c = copy.deepcopy(m_mx)
     m_mx_c = torch.compile(m_mx_c, fullgraph=True, backend="inductor")
@@ -188,7 +188,6 @@ def test_inference_linear(elem_dtype, bias, input_shape):
     y_ref = m(x)
     y_mx = m_mx(x)
     sqnr = compute_error(y_ref, y_mx)
-    print(sqnr)
     if elem_dtype is torch.float8_e4m3fn:
         assert sqnr >= 20.0
     else:
@@ -254,4 +253,4 @@ def test_filter_fn():

     swap_linear_with_mx_inference_linear(m2, torch.float8_e4m3fn, 32, filter_fn) # noqa: E501
     assert type(m2[0]) == MXInferenceLinear
-    assert type(m2[1]) == torch.nn.Linear
+    assert type(m2[1]) == torch.nn.Linear
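
Note: the tests above now use block_size = 4 and layer widths that are multiples of 4. The commit message does not say why, but one plausible reading is that packed fp6 stores four 6-bit values in three bytes, so the quantized dimension must be divisible by 4, which block_size = 2 would not guarantee. Below is a minimal sketch of the swap pattern these tests exercise, assuming the prototype API used in the diff (swap_linear_with_mx_linear), a CUDA device, and an import path that is a guess rather than something stated in the commit:

import copy

import torch
import torch.nn as nn

# Import path is an assumption; the tests import this symbol from the
# torchao MX formats prototype package.
from torchao.prototype.mx_formats.mx_linear import swap_linear_with_mx_linear

# Mirrors the test setup: 8-wide layers so that block_size=4 divides them.
m = nn.Sequential(nn.Linear(8, 8, bias=True, device="cuda"))
m_mx = copy.deepcopy(m)
swap_linear_with_mx_linear(m_mx, torch.float8_e4m3fn, block_size=4)

x = torch.randn(2, 8, device="cuda")
y = m_mx(x)  # forward pass through the MX-quantized linear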

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 17 additions & 0 deletions
@@ -219,6 +219,23 @@ def test_view(elem_dtype):
     x_mx_2 = x_mx.view(2, 4) # noqa: F841


+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize("elem_dtype", [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2])
+@pytest.mark.parametrize("do_fp6_packing", [False, True])
+def test_fp6_packing(elem_dtype, do_fp6_packing):
+    config.pack_fp6 = do_fp6_packing
+    x = torch.randn(1, 2, 4, device="cuda")
+    block_size = 4
+    x_mx = MXTensor.to_mx(x, elem_dtype, block_size)
+    if config.pack_fp6:
+        expected_packed_shape = torch.Size([*x.shape[:-1], 3 * x.shape[-1] // 4])
+    else:
+        expected_packed_shape = x.shape
+    config.pack_fp6 = True
+
+    assert x_mx._data.shape == expected_packed_shape
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(
     is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
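
The expected shape in the new test_fp6_packing follows from the fp6 packing ratio: four 6-bit values (24 bits) fit exactly into three bytes, so packing shrinks the last dimension to 3/4 of its original size while the leading dimensions are unchanged. A standalone sketch of just that shape arithmetic (plain Python; packed_fp6_shape is a hypothetical helper, not part of torchao):

def packed_fp6_shape(shape):
    # 4 fp6 values (4 * 6 = 24 bits) pack into 3 bytes, so the last
    # dimension shrinks by a factor of 3/4; leading dims are untouched.
    *leading, last = shape
    assert last % 4 == 0, "last dim must be divisible by 4 to pack fp6"
    return (*leading, 3 * last // 4)

# Matches the expectation in the test above: (1, 2, 4) -> (1, 2, 3)
assert packed_fp6_shape((1, 2, 4)) == (1, 2, 3)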

third_party/cutlass

Submodule cutlass updated 2031 files

torchao/prototype/mx_formats/mx_ops.py

Lines changed: 1 addition & 1 deletion
@@ -25,8 +25,8 @@
 from torchao.prototype.mx_formats.constants import DTYPE_FP4, DTYPE_FP6_E3M2, DTYPE_FP6_E2M3
 from torchao.prototype.mx_formats.mx_tensor import ( # noqa: E501
     MXTensor,
-    tensor_size_hp_to_fp4x2,
     tensor_size_hpx3_to_fp6x4,
+    tensor_size_hp_to_fp4x2,
 )

 aten = torch.ops.aten

torchao/prototype/mx_formats/mx_tensor.py

Lines changed: 2 additions & 2 deletions
@@ -45,14 +45,14 @@
     f4_unpacked_to_f32,
     f6_e2m3_unpacked_to_f32,
     f6_e3m2_unpacked_to_f32,
-    triton_f6_e2m3_to_scaled_bf16,
-    triton_f6_e3m2_to_scaled_bf16,
     f32_to_f4_unpacked,
     f32_to_f6_e2m3_unpacked,
     f32_to_f6_e3m2_unpacked,
     pack_uint4,
     pack_uint6,
     triton_f4_to_scaled_bf16,
+    triton_f6_e2m3_to_scaled_bf16,
+    triton_f6_e3m2_to_scaled_bf16,
     unpack_uint4,
 )

