Commit 555c845

Added MXFP6 packing and fused unpack-dequantise kernels; amended pytests to use appropriate tensor dimensions.
1 parent 867a91f commit 555c845

File tree

7 files changed

+638 −48 lines changed
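
The packing helper and the fused unpack-dequantise kernels the commit message mentions are not part of the hunks shown below; the tests import pack_uint6 from custom_cast.py, and only the updated tests, the cutlass submodule bump, and a new config flag appear here. For orientation, FP6 packing stores four 6-bit codes in three bytes (4 × 6 = 24 bits), which is consistent with the tests now using block sizes and dimensions that are multiples of 4. A minimal plain-PyTorch sketch of such a layout follows; pack_uint6_ref and unpack_uint6_ref are illustrative names, and the exact bit order used by torchao's pack_uint6 may differ.

import torch

def pack_uint6_ref(x: torch.Tensor) -> torch.Tensor:
    # x: uint8 tensor of 6-bit codes, last dim a multiple of 4
    # (the tests reshape to (-1, block_size) with block_size % 4 == 0).
    a, b, c, d = x[..., 0::4], x[..., 1::4], x[..., 2::4], x[..., 3::4]
    byte0 = (a << 2) | (b >> 4)
    byte1 = ((b & 0xF) << 4) | (c >> 2)
    byte2 = ((c & 0x3) << 6) | d
    # interleave so each group of 4 codes becomes 3 consecutive bytes
    return torch.stack([byte0, byte1, byte2], dim=-1).flatten(start_dim=-2)

def unpack_uint6_ref(packed: torch.Tensor) -> torch.Tensor:
    # inverse of pack_uint6_ref: 3 bytes -> 4 six-bit codes
    byte0, byte1, byte2 = packed[..., 0::3], packed[..., 1::3], packed[..., 2::3]
    a = byte0 >> 2
    b = ((byte0 & 0x3) << 4) | (byte1 >> 4)
    c = ((byte1 & 0xF) << 2) | (byte2 >> 6)
    d = byte2 & 0x3F
    return torch.stack([a, b, c, d], dim=-1).flatten(start_dim=-2)

codes = torch.randint(0, 64, (2, 8), dtype=torch.uint8)
assert torch.equal(unpack_uint6_ref(pack_uint6_ref(codes)), codes)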

test/prototype/mx_formats/test_mx_linear.py

Lines changed: 14 additions & 14 deletions
@@ -49,13 +49,13 @@ def test_linear_eager(elem_dtype, bias, input_shape):
     Smoke test for training linear module with mx weight
     """
     grad_shape = list(input_shape)
-    grad_shape[-1] = 6
+    grad_shape[-1] = 8

     m = nn.Sequential(
-        nn.Linear(8, 6, bias=bias, device="cuda"),
+        nn.Linear(8, 8, bias=bias, device="cuda"),
     )
     m_mx = copy.deepcopy(m)
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_linear(m_mx, elem_dtype, block_size)

     x_ref = torch.randn(*input_shape, device="cuda").requires_grad_()
@@ -86,14 +86,13 @@ def test_linear_eager(elem_dtype, bias, input_shape):
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_activation_checkpointing():
     input_shape = (2, 4)
-    grad_shape = (2, 6)
+    grad_shape = (2, 8)
     elem_dtype = torch.float8_e4m3fn

     m = nn.Sequential(
-        nn.Linear(4, 6, bias=True, device="cuda"),
-        nn.Linear(6, 6, bias=True, device="cuda"),
-    )
-    block_size = 2
+        nn.Linear(4, 8, bias=True, device="cuda"),
+        nn.Linear(8, 8, bias=True, device="cuda"), )
+    block_size = 4
     swap_linear_with_mx_linear(m, elem_dtype, block_size)

     x = torch.randn(*input_shape, device="cuda").requires_grad_()
@@ -123,13 +122,13 @@ def test_linear_compile(elem_dtype, bias, use_autocast):
     if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
         if not is_sm_at_least_89():
             pytest.skip("CUDA capability >= 8.9 required for float8 in triton")
-    M, K, N = 4, 8, 6
+    M, K, N = 4, 8, 8
     input_shape = (M, K)
     grad_shape = (M, N)
     m_mx = nn.Sequential(
         nn.Linear(K, N, bias=bias, device="cuda"),
     )
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_linear(m_mx, elem_dtype, block_size)
     m_mx_c = copy.deepcopy(m_mx)
     m_mx_c = torch.compile(m_mx_c, fullgraph=True, backend="inductor")
@@ -174,16 +173,17 @@ def test_inference_linear(elem_dtype, bias, input_shape):
     """
     Smoke test for inference linear module with mx weight
     """
-    m = nn.Sequential(nn.Linear(4, 6, bias=bias, dtype=torch.bfloat16))
+    m = nn.Sequential(nn.Linear(4, 8, bias=bias, dtype=torch.bfloat16))
     m = m.cuda()
     m_mx = copy.deepcopy(m)
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_inference_linear(m_mx, elem_dtype, block_size)

     x = torch.randn(*input_shape, device="cuda", dtype=torch.bfloat16)
     y_ref = m(x)
     y_mx = m_mx(x)
     sqnr = compute_error(y_ref, y_mx)
+    print(sqnr)
     if elem_dtype is torch.float8_e4m3fn:
         assert sqnr >= 20.0
     else:
@@ -202,10 +202,10 @@ def test_inference_compile_simple(elem_dtype):
     if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
         if not is_sm_at_least_89():
             pytest.skip("CUDA capability >= 8.9 required for float8 in triton")
-    m = nn.Sequential(nn.Linear(4, 6, bias=False, dtype=torch.bfloat16))
+    m = nn.Sequential(nn.Linear(4, 8, bias=False, dtype=torch.bfloat16))
     m = m.cuda()
     m_mx = copy.deepcopy(m)
-    block_size = 2
+    block_size = 4
     swap_linear_with_mx_inference_linear(m_mx, elem_dtype, block_size)
     m_mx = torch.compile(m_mx, fullgraph="true")
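
The pattern across this file: every linear feature dimension and the MX block size becomes a multiple of 4, so a block of FP6 elements packs cleanly into whole bytes. Condensed, the updated setup looks roughly like this sketch; the import paths are assumed from the repository layout (the test file's own imports are not shown in this diff), and CUDA plus the torchao prototype are required to run it.

import copy
import torch
import torch.nn as nn
# module paths assumed, not shown in this diff
from torchao.prototype.mx_formats.constants import DTYPE_FP6_E2M3
from torchao.prototype.mx_formats.mx_linear import swap_linear_with_mx_linear

m = nn.Sequential(nn.Linear(8, 8, bias=False, device="cuda"))
m_mx = copy.deepcopy(m)
block_size = 4  # multiple of 4, so FP6 blocks pack into whole bytes
swap_linear_with_mx_linear(m_mx, DTYPE_FP6_E2M3, block_size)

x = torch.randn(2, 8, device="cuda").requires_grad_()
y = m_mx(x)
y.backward(torch.randn_like(y))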

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 33 additions & 24 deletions
@@ -14,7 +14,7 @@
     DTYPE_FP6_E3M2,
     SUPPORTED_ELEM_DTYPES,
 )
-from torchao.prototype.mx_formats.custom_cast import pack_uint4
+from torchao.prototype.mx_formats.custom_cast import pack_uint4, pack_uint6
 from torchao.prototype.mx_formats.mx_tensor import (
     E8M0_EXPONENT_NAN_VAL,
     MXTensor,
@@ -70,15 +70,15 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 def test_hello_world(elem_dtype):
     data = torch.randn(4, 4, device="cuda", dtype=torch.bfloat16)
-    block_size = 2
+    block_size = 4
     _test_mx(data, elem_dtype, block_size)


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 def test_all_zeros(elem_dtype):
     data = torch.zeros(4, 4, device="cuda", dtype=torch.bfloat16)
-    block_size = 2
+    block_size = 4
     _test_mx(data, elem_dtype, block_size)


@@ -88,7 +88,7 @@ def test_some_zeros(elem_dtype):
     data = torch.randn(4, 4, device="cuda", dtype=torch.bfloat16)
     data[0, :] = 0.0
     data[:, 2] = 0.0
-    block_size = 2
+    block_size = 4
     _test_mx(data, elem_dtype, block_size)


@@ -100,9 +100,9 @@ def test_exponent_nan_in(elem_dtype):
     value is set to is NaN
     """
     tensor_hp = torch.tensor(
-        [float("nan"), 1, 2, 3, 4, 5], device="cuda", dtype=torch.bfloat16
+        [float("nan"), 1, 2, 3, 4, 5, 6, 7], device="cuda", dtype=torch.bfloat16
     )
-    block_size = 2
+    block_size = 4
     tensor_mx = MXTensor.to_mx(tensor_hp, elem_dtype, block_size)
     assert torch.all(tensor_mx._scale_e8m0[0] == E8M0_EXPONENT_NAN_VAL)
     assert not torch.any(tensor_mx._scale_e8m0[1:] == E8M0_EXPONENT_NAN_VAL)
@@ -115,24 +115,30 @@ def test_exponent_nan_out(elem_dtype):
     If block exponent value is NaN, the MX tensor block value is NaN
     """
     scale_e8m0_bits = torch.tensor(
-        [E8M0_EXPONENT_NAN_VAL, 23, 42], dtype=torch.uint8, device="cuda"
+        [E8M0_EXPONENT_NAN_VAL, 23], dtype=torch.uint8, device="cuda"
     )
+
+    block_size = 4
+
     if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
-        data_bits = torch.tensor([0, 1, 2, 3, 4, 5], dtype=elem_dtype, device="cuda")  # noqa: E501
+        data_bits = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7], dtype=elem_dtype, device="cuda")  # noqa: E501
     elif elem_dtype in (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2):
-        data_bits = torch.tensor([0, 1, 2, 3, 4, 5], dtype=torch.uint8, device="cuda")  # noqa: E501
+        data_bits = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7], dtype=torch.uint8, device="cuda")  # noqa: E501
+        if config.pack_fp6:
+            data_bits = data_bits.reshape(-1, block_size)
+            data_bits = pack_uint6(data_bits)
     elif elem_dtype == DTYPE_FP4:
-        data_bits = torch.tensor([0, 1, 2, 3, 4, 5], dtype=torch.uint8, device="cuda")  # noqa: E501
+        data_bits = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7], dtype=torch.uint8, device="cuda")  # noqa: E501
         data_bits = pack_uint4(data_bits)
     else:
         raise AssertionError("unsupported")
-    block_size = 2
+
     tensor_mx = MXTensor(
         scale_e8m0_bits, data_bits, elem_dtype, block_size, torch.float
     )
     tensor_hp = tensor_mx.to_dtype(torch.float)
-    assert torch.all(torch.isnan(tensor_hp[0:1]))
-    assert not torch.any(torch.isnan(tensor_hp[2:]))
+    assert torch.all(torch.isnan(tensor_hp.flatten()[0:4]))
+    assert not torch.any(torch.isnan(tensor_hp.flatten()[4:]))


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -141,24 +147,26 @@ def test_ranks(elem_dtype):
     """
     The reshaping logic works for various ranks
     """
-    B = 2
-    shapes = ((B * 4,), (B * 4, 2), (B * 4, 2, 2), (B * 4, 2, 2, 2))
+    B = 4
+    shapes = ((B * 4,), (B * 4, 4), (B * 4, 4, 4), (B * 4, 4, 4, 4))
     for s in shapes:
         tensor_hp = torch.randn(*s, device="cuda", dtype=torch.bfloat16)
         _test_mx(tensor_hp, elem_dtype, B)


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
-def test_block_sizes(elem_dtype):
+@pytest.mark.parametrize("B", [1, 4, 32])
+def test_block_sizes(elem_dtype, B):
     """
     Smoke test for various block sizes
     """
-    for B in (1, 2, 32):
-        if B == 1 and elem_dtype == DTYPE_FP4:
-            pytest.skip("unsupported configuration")
-        tensor_hp = torch.randn(B, device="cuda", dtype=torch.bfloat16)
-        _test_mx(tensor_hp, elem_dtype, B)
+    if B == 1 and elem_dtype == DTYPE_FP4:
+        pytest.skip("unsupported configuration")
+    elif B % 4 != 0 and elem_dtype in [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2]:
+        pytest.skip("unsupported configuration")
+    tensor_hp = torch.randn(B, device="cuda", dtype=torch.bfloat16)
+    _test_mx(tensor_hp, elem_dtype, B)


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -202,10 +210,11 @@ def test_cast_autograd(elem_dtype):
     torch.testing.assert_close(grad, x.grad, atol=0, rtol=0)


+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 def test_view(elem_dtype):
-    x = torch.randn(1, 2, 4)
-    block_size = 2
+    x = torch.randn(1, 2, 4, device="cuda")
+    block_size = 4
     x_mx = MXTensor.to_mx(x, elem_dtype, block_size)
     x_mx_2 = x_mx.view(2, 4)  # noqa: F841

@@ -231,7 +240,7 @@ def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
         x = torch.randn(*shape, dtype=hp_dtype, device="cuda")
     else:
         x = torch.zeros(*shape, dtype=hp_dtype, device="cuda")
-    block_size = 2
+    block_size = 4
     to_mx_c = torch.compile(MXTensor.to_mx, fullgraph=True)

     x_mx = MXTensor.to_mx(x, elem_dtype, block_size)

third_party/cutlass

Submodule cutlass updated 2031 files
Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 # If True, uses a custom triton kernel for fp4 dequantize
 use_fp4_custom_triton_dequant_kernel = False
+pack_fp6 = True
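
The new pack_fp6 flag is consumed the way the test_mx_tensor.py hunk above does it: when the flag is set, raw FP6 codes are reshaped into blocks and packed before the MXTensor is constructed. A minimal sketch of that call pattern follows; the config import path is assumed, as the config file's path is not shown in this excerpt.

import torch
from torchao.prototype.mx_formats import config  # import path assumed
from torchao.prototype.mx_formats.custom_cast import pack_uint6

block_size = 4
data_bits = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7], dtype=torch.uint8, device="cuda")
if config.pack_fp6:
    # four 6-bit codes per group are packed into three bytes
    data_bits = data_bits.reshape(-1, block_size)
    data_bits = pack_uint6(data_bits)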

0 commit comments
