Add NPU support alongside CUDA in the test cases that previously supported only CUDA.

chenhao388 · chenhao388 · commit 76ef99ffa1b4 · 2025-10-24T15:52:04.000+08:00
diff --git a/test/test_compile.py b/test/test_compile.py
@@ -38,6 +38,8 @@
 
 from tensordict.tensorclass import TensorClass
 
+from _utils_internal import is_npu_available
+
 from torch.utils._pytree import SUPPORTED_NODES, tree_map
 
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
@@ -50,6 +52,13 @@
 
 _IS_OSX = platform.system() == "Darwin"
 
+cur_device, npu_device_count = "cpu", 0  # CPU fallback keeps cur_device always bound
+if torch.cuda.is_available():
+    cur_device = "cuda"  # CUDA takes precedence when both backends are present
+elif is_npu_available():
+    cur_device = "npu"
+    npu_device_count = torch.npu.device_count()
+
 
 def test_vmap_compile():
     # Since we monkey patch vmap we need to make sure compile is happy with it
@@ -262,11 +271,11 @@ def make_td_with_names(data):
         make_td_with_names_c(data_dict)
 
     @pytest.mark.skipif(
-        not torch.cuda.is_available(), reason="cuda required to test device casting"
+        not torch.cuda.is_available() and not is_npu_available(), reason="cuda or npu required to test device casting"
     )
     @pytest.mark.parametrize("has_device", [True, False])
     def test_to(self, has_device, mode):
-        device = "cuda:0"
+        device = f"{cur_device}:0"
 
         def test_to_device(td):
             return td.to(device)
@@ -549,11 +558,11 @@ def clone(td: TensorDict):
             assert clone_c(data).a.b is data.a.b
 
     @pytest.mark.skipif(
-        not torch.cuda.is_available(), reason="cuda required to test device casting"
+        not torch.cuda.is_available() and not is_npu_available(), reason="cuda or npu required to test device casting"
     )
     @pytest.mark.parametrize("has_device", [True, False])
     def test_tc_to(self, has_device, mode):
-        device = "cuda:0"
+        device = f"{cur_device}:0"
 
         def test_to_device(tc):
             return tc.to(device)
@@ -1242,12 +1251,12 @@ def test_state_dict(self, compiled):
         torch.testing.assert_close(y1, y2)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
+@pytest.mark.skipif(not torch.cuda.is_available() and not is_npu_available(), reason="cuda or npu is not available")
 class TestCompileNontensor:
     # Same issue with the decorator @tensorclass version
     @pytest.fixture(scope="class")
     def data(self):
-        return torch.zeros((4, 3), device="cuda")
+        return torch.zeros((4, 3), device=cur_device)
 
     class TensorClassWithNonTensorData(TensorClass["nocast"]):
         tensor: torch.Tensor
@@ -1265,13 +1274,13 @@ def fn_no_device(self, data):
 
     def fn_with_device(self, data):
         a = self.TensorClassWithNonTensorData(
-            tensor=data, non_tensor_data=1, batch_size=[4], device="cuda"
+            tensor=data, non_tensor_data=1, batch_size=[4], device=cur_device
         )
         return a.tensor
 
     def fn_with_device_without_batch_size(self, data):
         a = self.TensorClassWithNonTensorData(
-            tensor=data, non_tensor_data=1, device="cuda"
+            tensor=data, non_tensor_data=1, device=cur_device
         )
         return a.tensor
 
diff --git a/test/test_distributed.py b/test/test_distributed.py
@@ -17,6 +17,7 @@
 
 from tensordict import LazyStackedTensorDict, MemoryMappedTensor, TensorDict
 from tensordict.utils import logger as tdlogger
+from _utils_internal import is_npu_available
 from torch import distributed as dist, multiprocessing as mp, nn
 from torch.distributed._tensor import (
     DeviceMesh,
@@ -107,6 +108,71 @@ def test_fsdp_module(self, tmpdir):
         assert (TensorDict.load_memmap(tmpdir) == 1).all()
 
 
+@pytest.mark.skipif(
+    not is_npu_available() or torch.npu.device_count() < 2,
+    reason="not enough npu devices",
+)
+class TestNPUFSDP:
+    """FSDP + TensorDict state-dict test on NPU; uses one device per worker rank."""
+
+    class MyDModule(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc1 = nn.Linear(8, 8, bias=False)
+            self.fc2 = nn.Linear(8, 8, bias=False)
+            self.relu = nn.ReLU()
+            for p in self.parameters():
+                p.data.fill_(1.0)  # all-ones weights: the test expects state == 1
+
+        def forward(self, input):
+            return self.relu(self.fc1(input) + self.fc2(input))
+
+    @classmethod
+    def make_module(cls, device=None):
+        """Build ``MyDModule`` on the given NPU index and wrap it in FSDP."""
+        target = "npu" if device is None else f"npu:{device}"
+        with torch.device(target):
+            my_module = cls.MyDModule()
+            my_sharded_module = FSDP(my_module, device_id=device)
+        return my_sharded_module
+
+    @classmethod
+    def worker(cls, rank, path):
+        """Join the 2-rank HCCL group, build the FSDP module, memmap its state dict.
+
+        ``rank`` doubles as the NPU device index; only rank 0 writes to ``path``.
+        """
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "10017"
+
+        torch.distributed.init_process_group(
+            "hccl",
+            rank=rank,
+            world_size=2,
+            init_method="tcp://localhost:10017",
+        )
+        torch.npu.set_device(rank)
+        module = cls.make_module(rank)
+        dist.barrier()
+        td = TensorDict.from_module(module, use_state_dict=True)
+        if rank == 0:
+            td.memmap(path)
+        dist.destroy_process_group()
+
+    def test_fsdp_module(self, tmpdir):
+        """Start two workers, then check that rank 0 saved an all-ones state dict."""
+        try:
+            mp.set_start_method("spawn")
+        except RuntimeError:
+            tdlogger.info("start method already set to %s", mp.get_start_method())
+        procs = [mp.Process(target=self.worker, args=(r, tmpdir)) for r in range(2)]
+        for proc in procs:
+            proc.start()
+        for proc in procs:
+            proc.join(timeout=TIMEOUT)
+        assert (TensorDict.load_memmap(tmpdir) == 1).all()
+
+
 # not using TorchVersion to make the comparison work with dev
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
 
@@ -241,8 +307,8 @@ def server(queue):
                 },
                 [2],
             )
-            .expand(1, 2)
-            .contiguous()
+                .expand(1, 2)
+                .contiguous()
         )
         td.gather_and_stack(0)
         assert (td != 0).all()
@@ -314,8 +380,8 @@ def server(queue, op, async_op, return_premature):
                 },
                 [2],
             )
-            .expand(1, 2)
-            .contiguous()
+                .expand(1, 2)
+                .contiguous()
         )
         out = td.reduce(0, op=op, async_op=async_op, return_premature=return_premature)
         if not async_op:
@@ -798,8 +864,8 @@ def server(cls, queue):
                 },
                 [2],
             )
-            .expand(1, 2)
-            .contiguous()
+                .expand(1, 2)
+                .contiguous()
         )
         td.init_remote(dst=1)
 
diff --git a/test/test_nn.py b/test/test_nn.py
@@ -57,6 +57,8 @@
     skip_existing,
 )
 from tensordict.tensorclass import TensorClass
+from _utils_internal import is_npu_available
+
 
 from torch import distributions, nn
 from torch.distributions import Categorical, Normal
@@ -111,6 +113,15 @@
         pytest.mark.filterwarnings("ignore:inplace"),
     )
 
+def get_device():
+    """Return the first available of cuda:0, npu:0, mps:0; otherwise the CPU."""
+    if torch.cuda.is_available():
+        return torch.device("cuda:0")
+    if is_npu_available():
+        return torch.device("npu:0")
+    if torch.mps.is_available():
+        return torch.device("mps:0")
+    return torch.device("cpu")
 
 class TestInteractionType:
     def test_base(self):
@@ -2149,37 +2160,24 @@ def test_module_buffer():
     if torch.cuda.device_count():
         module.cuda()
         assert module.td.device.type == "cuda"
+    elif is_npu_available():
+        module.npu()
+        assert module.td.device.type == "npu"
 
 
 @pytest.mark.parametrize(
     "original_device",
     [
         None,
         torch.device("cpu"),
-        (
-            torch.device("cuda:0")
-            if torch.cuda.is_available()
-            else (
-                torch.device("mps:0")
-                if torch.mps.is_available()
-                else torch.device("cpu")
-            )
-        ),
+        get_device(),
     ],
 )
 @pytest.mark.parametrize(
     "new_device",
     [
         torch.device("cpu"),
-        (
-            torch.device("cuda:0")
-            if torch.cuda.is_available()
-            else (
-                torch.device("mps:0")
-                if torch.mps.is_available()
-                else torch.device("cpu")
-            )
-        ),
+        get_device(),
     ],
 )
 @pytest.mark.parametrize("tc", [True, False], ids=["tc", "td"])
diff --git a/test/test_tensorclass.py b/test/test_tensorclass.py
@@ -45,6 +45,8 @@
 from tensordict._td import lazy_stack
 from tensordict.base import _GENERIC_NESTED_ERR
 from tensordict.tensorclass import from_dataclass
+from _utils_internal import is_npu_available
+
 from torch import Tensor
 
 _has_streaming = importlib.util.find_spec("streaming", None) is not None
@@ -2566,6 +2568,8 @@ def test_to(self):
         td = self.get_nested()
         if torch.cuda.is_available():
             device = torch.device("cuda:0")
+        elif is_npu_available():
+            device = torch.device("npu:0")
         else:
             device = torch.device("cpu:1")
         td_device = td.to(device)