From 970e4cf285efdb4a787115aba24e7625d5a1525b Mon Sep 17 00:00:00 2001
From: Kristian Hartikainen
Date: Mon, 11 Aug 2025 12:09:55 -0400
Subject: [PATCH] Add L4 support

---
 xmanager/cloud/vertex.py      | 60 ++++++++++++++++++++++++++++++++++-
 xmanager/cloud/vertex_test.py | 58 ++++++++++++++++++++++++++++++++++-
 xmanager/xm/resources.py      |  2 ++
 3 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/xmanager/cloud/vertex.py b/xmanager/cloud/vertex.py
index f93bf73..f3b2441 100644
--- a/xmanager/cloud/vertex.py
+++ b/xmanager/cloud/vertex.py
@@ -64,6 +64,16 @@
     8: 'a2-highgpu-8g',
     16: 'a2-megagpu-16g',
 }
+_L4_GPUS_TO_MACHINE_TYPE = {
+    (1, 4): 'g2-standard-4',
+    (1, 8): 'g2-standard-8',
+    (1, 12): 'g2-standard-12',
+    (1, 16): 'g2-standard-16',
+    (1, 32): 'g2-standard-32',
+    (2, 24): 'g2-standard-24',
+    (4, 48): 'g2-standard-48',
+    (8, 96): 'g2-standard-96',
+}
 
 _CLOUD_TPU_ACCELERATOR_TYPES = {
     xm.ResourceType.TPU_V2: 'TPU_V2',
@@ -97,6 +107,28 @@
     ),
 }
 
+def aip_v1_gpu_accelerator_type_str(gpu_type: xm.GpuType) -> str:
+  tesla_architectures = {xm.ResourceType.P4, xm.ResourceType.T4, xm.ResourceType.P100, xm.ResourceType.V100, xm.ResourceType.A100}
+  match gpu_type:
+    case _ if gpu_type in tesla_architectures:
+      return f"NVIDIA_TESLA_{gpu_type.name.upper()}"
+    case xm.ResourceType.L4:
+      return 'NVIDIA_L4'
+    case xm.ResourceType.L4_24TH:
+      return 'NVIDIA_L4'
+    case xm.ResourceType.A100_80GIB:
+      return 'NVIDIA_A100_80GB'
+    case xm.ResourceType.H100:
+      return 'NVIDIA_H100_80GB'
+    case xm.ResourceType.H200:
+      return 'NVIDIA_H200_141GB'
+    case xm.ResourceType.B200:
+      return 'NVIDIA_B200'
+    case _:
+      raise ValueError(
+          f'Unsupported GPU type {gpu_type}. Supported types are: {GpuType}'
+      )
+
 # Hide noisy warning regarding:
 # `file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth`
 logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
@@ -298,7 +330,7 @@ def get_machine_spec(job: xm.Job) -> Dict[str, Any]:
   for resource, value in requirements.task_requirements.items():
     accelerator_type = None
     if resource in xm.GpuType:
-      accelerator_type = 'NVIDIA_TESLA_' + str(resource).upper()
+      accelerator_type = aip_v1_gpu_accelerator_type_str(resource)
     elif resource in xm.TpuType:
       accelerator_type = _CLOUD_TPU_ACCELERATOR_TYPES[resource]
     if accelerator_type:
@@ -306,6 +338,8 @@ def get_machine_spec(job: xm.Job) -> Dict[str, Any]:
       spec['accelerator_count'] = int(value)
   accelerator = spec.get('accelerator_type', None)
   if accelerator and accelerator == aip_v1.AcceleratorType.NVIDIA_TESLA_A100:
+    print(f'Available A100 machine types (gpus: machine_type): {_A100_GPUS_TO_MACHINE_TYPE}')
+
     for gpus, machine_type in sorted(_A100_GPUS_TO_MACHINE_TYPE.items()):
       if spec['accelerator_count'] <= gpus:
         spec['machine_type'] = machine_type
@@ -316,6 +350,30 @@ def get_machine_spec(job: xm.Job) -> Dict[str, Any]:
             spec['accelerator_count']
         )
     )
+  elif accelerator and accelerator == aip_v1.AcceleratorType.NVIDIA_L4:
+    print(f'Available L4 machine types ((gpus, cpus): machine_type): {_L4_GPUS_TO_MACHINE_TYPE}')
+
+    required_gpus = spec['accelerator_count']
+    required_cpus = requirements.task_requirements.get(xm.ResourceType.CPU, None)
+    gpus_matches = lambda gpus: spec['accelerator_count'] <= gpus
+    cpus_matches = lambda cpus: required_cpus is None or cpus == required_cpus
+
+    l4_candidates = [
+        (machine_type, (gpus, cpus))
+        for (gpus, cpus), machine_type in _L4_GPUS_TO_MACHINE_TYPE.items()
+        if gpus_matches(gpus) and cpus_matches(cpus)
+    ]
+
+    if not l4_candidates:
+      cpu_str = f' with {required_cpus} CPUs' if required_cpus else ''
+      raise ValueError(
+          f'l4={required_gpus}{cpu_str} does not fit in any valid machine type.'
+      )
+
+    # Find the best fit (smallest machine that satisfies the requirements).
+    # The key for sorting is (gpus, cpus).
+    best_fit_machine_type, _ = min(l4_candidates, key=lambda item: item[1])
+    spec['machine_type'] = best_fit_machine_type
   elif (
       accelerator == aip_v1.AcceleratorType.TPU_V2
       or accelerator == aip_v1.AcceleratorType.TPU_V3
diff --git a/xmanager/cloud/vertex_test.py b/xmanager/cloud/vertex_test.py
index c2c1573..79f1fd1 100644
--- a/xmanager/cloud/vertex_test.py
+++ b/xmanager/cloud/vertex_test.py
@@ -16,7 +16,10 @@
 import os
 import unittest
 from unittest import mock
+import io
+from contextlib import redirect_stdout
 
+from absl.testing import parameterized
 from google import auth
 from google.auth import credentials
 from google.cloud import aiplatform
@@ -31,7 +34,7 @@
 from xmanager.cloud import vertex  # pylint: disable=g-bad-import-order
 
 
-class VertexTest(unittest.TestCase):
+class VertexTest(parameterized.TestCase):
 
   @mock.patch.object(xm_auth, 'get_service_account')
   @mock.patch.object(auth, 'default')
@@ -155,6 +158,59 @@ def test_get_machine_spec_a100(self):
         },
     )
 
+  @parameterized.parameters(
+      {'cpus': 4, 'gpus': 1, 'expected': 'g2-standard-4'},
+      {'cpus': 8, 'gpus': 1, 'expected': 'g2-standard-8'},
+      {'cpus': 12, 'gpus': 1, 'expected': 'g2-standard-12'},
+      {'cpus': 16, 'gpus': 1, 'expected': 'g2-standard-16'},
+      {'cpus': 32, 'gpus': 1, 'expected': 'g2-standard-32'},
+      {'cpus': 24, 'gpus': 2, 'expected': 'g2-standard-24'},
+      {'cpus': 48, 'gpus': 4, 'expected': 'g2-standard-48'},
+      {'cpus': 96, 'gpus': 8, 'expected': 'g2-standard-96'},
+  )
+  def test_get_machine_spec_l4(self, cpus, gpus, expected):
+    job = xm.Job(
+        executable=local_executables.GoogleContainerRegistryImage('name', ''),
+        executor=local_executors.Vertex(
+            requirements=xm.JobRequirements(l4=gpus, cpu=cpus)
+        ),
+        args={},
+    )
+    machine_spec = vertex.get_machine_spec(job)
+    self.assertDictEqual(
+        machine_spec,
+        {
+            'machine_type': expected,
+            'accelerator_type': vertex.aip_v1.AcceleratorType.NVIDIA_L4,
+            'accelerator_count': gpus,
+        },
+    )
+
+  @parameterized.parameters(
+      {'cpus': 3, 'gpus': 1},
+      {'cpus': 4, 'gpus': 2},
+      {'cpus': 25, 'gpus': 2},
+      {'cpus': 41, 'gpus': 4},
+      {'cpus': 48, 'gpus': 8},
+  )
+  def test_get_machine_spec_l4_failure(self, cpus, gpus):
+    job = xm.Job(
+        executable=local_executables.GoogleContainerRegistryImage('name', ''),
+        executor=local_executors.Vertex(
+            requirements=xm.JobRequirements(l4=gpus, cpu=cpus)
+        ),
+        args={},
+    )
+    f = io.StringIO()
+    with redirect_stdout(f), self.assertRaises(ValueError) as cm:
+      vertex.get_machine_spec(job)
+
+    self.assertIn('Available L4 machine types', f.getvalue())
+    self.assertIn(
+        f'l4={gpus} with {cpus}.0 CPUs does not fit in any valid machine type.',
+        str(cm.exception),
+    )
+
   def test_get_machine_spec_tpu(self):
     job = xm.Job(
         executable=local_executables.GoogleContainerRegistryImage('name', ''),
diff --git a/xmanager/xm/resources.py b/xmanager/xm/resources.py
index d9f6f2f..19c0fe3 100644
--- a/xmanager/xm/resources.py
+++ b/xmanager/xm/resources.py
@@ -69,6 +69,7 @@ class ResourceType(enum.Enum, metaclass=_CaseInsensitiveResourceTypeMeta):
   LOCAL_GPU = 100006
   P4 = 21
   T4 = 22
+  L4 = 11
   L4_24TH = 68
   P100 = 14
   V100 = 17
@@ -194,6 +195,7 @@ def __new__(cls, value: int) -> ResourceType:
     # LOCAL_GPU is missing as only specific GPU types should be added.
     ResourceType.P4,
     ResourceType.T4,
+    ResourceType.L4,
     ResourceType.L4_24TH,
     ResourceType.P100,
     ResourceType.V100,