
Commit 9241930

Revert "Support cpu tensor transfer with NIXL in GPU Objects" (#56026)

Reverts #55793, fixing the lint check.

1 parent: a5d032b

8 files changed, +29 -79 lines

python/ray/experimental/channel/serialization_context.py

Lines changed: 1 addition & 3 deletions
@@ -97,9 +97,7 @@ def serialize_tensor(
         from ray.experimental.channel import ChannelContext

         ctx = ChannelContext.get_current()
-        if self._use_external_transport and (
-            ctx._torch_device is None or ctx._torch_device == tensor.device
-        ):
+        if self._use_external_transport and tensor.device == ctx.torch_device:
             # External transport is enabled and we found a tensor that matches
             # our device. Add the actual tensor to a buffer. The buffer of
             # tensors should later be popped by the caller and sent via
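The net effect of this hunk: a tensor is handed to the external transport only when its device exactly matches the channel context's device; the reverted "ctx._torch_device is None" escape hatch had let any device (including CPU) qualify. A minimal standalone sketch of the restored check, with a plain ctx_device argument standing in for ctx.torch_device (the function name is hypothetical, for illustration only):

import torch

def should_use_external_transport(
    use_external_transport: bool,
    ctx_device: "torch.device",
    tensor: "torch.Tensor",
) -> bool:
    # Restored behavior: require an exact device match. A CPU tensor no
    # longer qualifies when the context device is a GPU.
    return use_external_transport and tensor.device == ctx_device

# A CPU tensor against a CUDA context device is serialized inline again:
assert not should_use_external_transport(
    True, torch.device("cuda"), torch.tensor([1, 2, 3])
)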

python/ray/experimental/collective/collective_tensor_transport.py

Lines changed: 2 additions & 14 deletions
@@ -40,19 +40,8 @@ def __ray_get_tensor_transport_metadata__(
         # it could take arbitrarily long and we don't want to trigger a spurious
         # timeout.
         gpu_object = gpu_object_store.wait_and_get_object(obj_id)
-        tensor_meta = []
-        device = None
-        if gpu_object:
-            device = gpu_object[0].device
-        for t in gpu_object:
-            if t.device.type != device.type:
-                raise ValueError(
-                    "All tensors in one GPU object must be the same device type."
-                )
-            tensor_meta.append((t.shape, t.dtype))
         return CollectiveTransportMetadata(
-            tensor_meta=tensor_meta,
-            tensor_device=device,
+            tensor_meta=[(t.shape, t.dtype) for t in gpu_object],
         )

     # Submit a Ray actor task to the source actor to get the tensor metadata.

@@ -141,11 +130,10 @@ def recv_multiple_tensors(
     def send_multiple_tensors(
         tensors: List["torch.Tensor"],
         communicator_metadata: CollectiveCommunicatorMetadata,
+        device: "torch.device",
     ):
         import ray.util.collective as collective

-        device = tensor_transport_metadata.tensor_device
-
         for tensor in tensors:
             if tensor.device.type != device.type:
                 # TODO(swang): Right now there is no way to catch this error
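Both this file and nixl_tensor_transport.py below replace the device-tracking loop with a plain list comprehension; when gpu_object is empty the comprehension simply yields an empty metadata list, so the old "device = None" bookkeeping is unnecessary. A small runnable sketch of the restored construction (build_tensor_meta is a hypothetical name, not from the diff):

import torch

def build_tensor_meta(gpu_object):
    # Restored construction: record only (shape, dtype) per tensor; no
    # device field and no same-device-type validation.
    return [(t.shape, t.dtype) for t in gpu_object]

assert build_tensor_meta([]) == []
assert build_tensor_meta([torch.zeros(2, 3)]) == [(torch.Size([2, 3]), torch.float32)]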

python/ray/experimental/collective/nixl_tensor_transport.py

Lines changed: 1 addition & 12 deletions
@@ -45,25 +45,14 @@ def __ray_get_tensor_transport_metadata__(
         from ray.util.collective.collective import get_group_handle

         nixl_backend: NixlBackend = get_group_handle(NIXL_GROUP_NAME)
-        device = None
-        tensor_meta = []
         if gpu_object:
             serialized_descs, agent_meta = nixl_backend.get_nixl_metadata(
                 gpu_object
             )
-            # We assume all tensors in one GPU object have the same device type.
-            device = gpu_object[0].device
-            for t in gpu_object:
-                if t.device.type != device.type:
-                    raise ValueError(
-                        "All tensors in one GPU object must be the same device type."
-                    )
-                tensor_meta.append((t.shape, t.dtype))
         else:
             serialized_descs, agent_meta = None, None
         return NixlTransportMetadata(
-            tensor_meta=tensor_meta,
-            tensor_device=device,
+            tensor_meta=[(t.shape, t.dtype) for t in gpu_object],
             nixl_serialized_descs=serialized_descs,
             nixl_agent_meta=agent_meta,
         )

python/ray/experimental/collective/tensor_transport_manager.py

Lines changed: 2 additions & 0 deletions
@@ -143,11 +143,13 @@ def recv_multiple_tensors(
     def send_multiple_tensors(
         tensors: List["torch.Tensor"],
         communicator_metadata: CommunicatorMetadata,
+        device: "torch.device",
     ):
         """
         Send multiple tensors to the destination actor.

         Args:
             tensors: The tensors to send.
             communicator_metadata: The communicator metadata for the send/recv operation.
+            device: The device to send the tensors to.
         """

python/ray/experimental/collective/util.py

Lines changed: 1 addition & 16 deletions
@@ -1,4 +1,4 @@
-from typing import Tuple, TYPE_CHECKING
+from typing import Tuple
 from contextlib import closing
 import socket

@@ -11,9 +11,6 @@
     CollectiveTensorTransport,
 )

-if TYPE_CHECKING:
-    import torch
-
 # Singleton instances for tensor transport managers
 _nixl_tensor_transport_manager = None
 _collective_tensor_transport_manager = None

@@ -44,18 +41,6 @@ def get_tensor_transport_manager(
     raise ValueError(f"Unsupported tensor transport protocol: {tensor_transport}")


-def device_match_transport(device: "torch.device", tensor_transport: Backend) -> bool:
-    """Check if the device matches the transport."""
-    if tensor_transport == Backend.NIXL:
-        return device.type == "cuda" or device.type == "cpu"
-    elif tensor_transport == Backend.TORCH_GLOO:
-        return device.type == "cpu"
-    elif tensor_transport == Backend.NCCL:
-        return device.type == "cuda"
-    else:
-        raise ValueError(f"Unsupported tensor transport protocol: {tensor_transport}")
-
-
 def find_free_port() -> int:
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
         s.bind(("", 0))

python/ray/experimental/gpu_object_manager/gpu_object_store.py

Lines changed: 15 additions & 14 deletions
@@ -11,9 +11,6 @@
     TensorTransportMetadata,
 )

-from ray.experimental.collective import get_tensor_transport_manager
-from ray.experimental.collective.util import device_match_transport
-
 try:
     import torch
 except ImportError:

@@ -28,6 +25,14 @@
     TensorTransportEnum.NIXL: Backend.NIXL,
 }

+COLLECTIVE_BACKEND_TO_TORCH_DEVICE = {
+    Backend.NCCL: torch.device("cuda"),
+    Backend.TORCH_GLOO: torch.device("cpu"),
+    # TODO(Qiaolin-Yu): NIXL could also transfer tensors from CPU to CPU.
+    # More details in https://github.com/ray-project/ray/issues/55587.
+    Backend.NIXL: torch.device("cuda"),
+}
+

 def _tensor_transport_to_collective_backend(
     tensor_transport: TensorTransportEnum,

@@ -56,17 +61,15 @@ def __ray_send__(
     tensors = gpu_object_store.get_object(obj_id)

     backend = collective.get_group_handle(communicator_meta.communicator_name).backend()
+    device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend]
+
+    from ray.experimental.collective import get_tensor_transport_manager

     tensor_transport_manager = get_tensor_transport_manager(backend)
-    if tensors and not device_match_transport(
-        tensor_transport_meta.tensor_device, backend
-    ):
-        raise ValueError(
-            f"Tensor transport backend {backend} does not support tensor transfer on device {tensor_transport_meta.tensor_device}."
-        )
     tensor_transport_manager.send_multiple_tensors(
         tensors,
         communicator_meta,
+        device=device,
     )

@@ -79,16 +82,14 @@ def __ray_recv__(
     """Helper function that runs on the dst actor to receive tensors from the src actor."""
     from ray._private.worker import global_worker

+    from ray.experimental.collective import get_tensor_transport_manager
+
     backend = collective.get_group_handle(communicator_meta.communicator_name).backend()

-    device = tensor_transport_meta.tensor_device
+    device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend]
     tensor_meta = tensor_transport_meta.tensor_meta

     gpu_object_store = global_worker.gpu_object_manager.gpu_object_store
-    if tensor_meta and not device_match_transport(device, backend):
-        raise ValueError(
-            f"Tensor transport backend {backend} does not support tensor transfer on device {device}."
-        )
     tensors = []
     for meta in tensor_meta:
         shape, dtype = meta
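With this revert, the target device is a static property of the collective backend rather than per-object metadata, so an unsupported backend/device combination now surfaces as a KeyError on the mapping instead of the removed device_match_transport check. A minimal sketch of the lookup, using string keys as stand-ins for the Backend enum members (the helper name is hypothetical):

import torch

# String keys stand in for ray.util.collective.types.Backend members.
BACKEND_TO_DEVICE = {
    "nccl": torch.device("cuda"),
    "torch_gloo": torch.device("cpu"),
    "nixl": torch.device("cuda"),  # CPU-to-CPU NIXL is the feature this commit reverts
}

def device_for_backend(backend: str) -> "torch.device":
    # Plain dict lookup: an unsupported backend fails fast with KeyError.
    return BACKEND_TO_DEVICE[backend]

assert device_for_backend("torch_gloo") == torch.device("cpu")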

python/ray/tests/test_gpu_objects_nixl.py

Lines changed: 7 additions & 17 deletions
@@ -7,11 +7,10 @@
 @ray.remote(num_gpus=1, num_cpus=0, enable_tensor_transport=True)
 class GPUTestActor:
     @ray.method(tensor_transport="nixl")
-    def echo(self, data, device):
-        return data.to(device)
+    def echo(self, data):
+        return data.to("cuda")

-    def sum(self, data, device):
-        assert data.device.type == device
+    def sum(self, data):
         return data.sum().item()

@@ -24,21 +23,12 @@ def test_p2p(ray_start_regular):

     # Create test tensor
     tensor = torch.tensor([1, 2, 3])
-
-    tensor1 = torch.tensor([4, 5, 6])
-
-    # Test GPU to GPU transfer
-    ref = src_actor.echo.remote(tensor, "cuda")
+    ref = src_actor.echo.remote(tensor)

     # Trigger tensor transfer from src to dst actor
-    result = dst_actor.sum.remote(ref, "cuda")
+    result = dst_actor.sum.remote(ref)
     assert tensor.sum().item() == ray.get(result)

-    # Test CPU to CPU transfer
-    ref1 = src_actor.echo.remote(tensor1, "cpu")
-    result1 = dst_actor.sum.remote(ref1, "cpu")
-    assert tensor1.sum().item() == ray.get(result1)
-

 @pytest.mark.parametrize("ray_start_regular", [{"num_gpus": 1}], indirect=True)
 def test_intra_gpu_tensor_transfer(ray_start_regular):

@@ -47,8 +37,8 @@ def test_intra_gpu_tensor_transfer(ray_start_regular):
     tensor = torch.tensor([1, 2, 3])

     # Intra-actor communication for pure GPU tensors
-    ref = actor.echo.remote(tensor, "cuda")
-    result = actor.sum.remote(ref, "cuda")
+    ref = actor.echo.remote(tensor)
+    result = actor.sum.remote(ref)
     assert tensor.sum().item() == ray.get(result)

python/ray/util/collective/types.py

Lines changed: 0 additions & 3 deletions
@@ -61,12 +61,9 @@ class TensorTransportMetadata:

     Args:
         tensor_meta: A list of tuples, each containing the shape and dtype of a tensor.
-        tensor_device: The device of the tensor. Currently, we require all tensors in the
-            list have the same device type.
     """

     tensor_meta: List[Tuple["torch.Size", "torch.dtype"]]
-    tensor_device: Optional["torch.device"] = None


 @dataclass
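For reference, after this deletion the dataclass carries only per-tensor shape/dtype metadata. A simplified sketch of the resulting shape (the class name and default_factory default are assumptions for the sketch, not taken from the diff):

from dataclasses import dataclass, field
from typing import List, Tuple
import torch

@dataclass
class TensorTransportMetadataSketch:
    # Post-revert: one (shape, dtype) pair per tensor; no tensor_device field.
    tensor_meta: List[Tuple["torch.Size", "torch.dtype"]] = field(default_factory=list)

meta = TensorTransportMetadataSketch([(torch.Size([2]), torch.int64)])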
