Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions dags/common/test_owner.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,18 @@ class Team(enum.Enum):
# Default test owner
AIRFLOW = "airflow"
# XLML - TensorFlow
ERIC_L = "Eric L."
CHANDRA_D = "chandrasekhard2"
GAGIK_A = "gagika"

# PYTORCH
PEI_Z = "Pei Z."
MANFEI_B = "manfeiBai"

# MaxText
TONY_C = "tonyjohnchen"
JON_B = "Jon B."
RAYMOND_Z = "Raymond Z."
MATT_D = "gobbleturk"
PRIYANKA_G = "Priyanka G."
SURBHI_J = "SurbhiJainUSC"
ZHIYU_L = "Zhiyu L."
MOHIT_K = "khatwanimohit"
ANISHA_M = "A9isha"
YUWEI_Y = "Yuwei Y."
RISHABH_B = "notabee"
NUOJIN_C = "NuojCheng"
BRANDEN_V = "bvandermoon"
Expand All @@ -78,9 +71,7 @@ class Team(enum.Enum):
ROHAN_B = "Rohan-Bierneni"

# Inference
ANDY_Y = "Andy Y."
XIANG_S = "sixiang-google"
MORGAN_D = "Morgan D."
YIJIA_J = "jyj0w0"
PATE_M = "patemotter"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
}

shared_task_config = {
"test_owner": test_owner.ZHIYU_L,
"test_owner": test_owner.AIRFLOW,
"cluster": XpkClusters.TPU_V5E_256_CLUSTER,
"time_out_in_min": 60,
"base_output_directory": base_output_directory,
Expand Down
2 changes: 1 addition & 1 deletion dags/examples/maxtext_sweep_gke_example_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# Get list of MaxText GKE XPK jobs
maxtext_sweep_gke_test = (
maxtext_sweep_gke_config.get_maxtext_sweep_gke_config(
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
cluster=XpkClusters.TPU_V4_128_CLUSTER,
time_out_in_min=60,
base_output_directory=base_output_directory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def get_config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_model_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.ANDY_Y,
task_owner=test_owner.AIRFLOW,
num_slices=num_slices,
gcs_subfolder=f"{GCS_SUBFOLDER_PREFIX}/maxtext",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_model_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.MORGAN_D,
task_owner=test_owner.AIRFLOW,
num_slices=num_slices,
gcs_subfolder=f"{GCS_SUBFOLDER_PREFIX}/maxtext",
)
Expand Down
2 changes: 1 addition & 1 deletion dags/multipod/configs/maxtext_gce_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def get_maxtext_end_to_end_test_config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_model_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.JON_B,
task_owner=test_owner.AIRFLOW,
num_slices=num_slices,
)

Expand Down
10 changes: 5 additions & 5 deletions dags/multipod/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
time_out_in_min=60,
test_name=f"gpt1-like-{test_mode.value}",
docker_image=DOCKER_IMAGE[test_mode].value,
test_owner=test_owner.JON_B,
test_owner=test_owner.AIRFLOW,
num_slices=n_slice,
cluster=XpkClusters.TPU_V4_16_CLUSTER,
).run()
Expand All @@ -87,7 +87,7 @@
f"bash end_to_end/tpu/test_decode.sh 10 gs://maxtext-xlml gs://maxtext-xlml/dataset xlml-decode-v4-8-1slice-{test_mode.value}",
),
docker_image=DOCKER_IMAGE[test_mode].value,
test_owner=test_owner.JON_B,
test_owner=test_owner.AIRFLOW,
).run()

# v4-8 1 slice TFLOPS test
Expand All @@ -98,7 +98,7 @@
f"bash end_to_end/tpu/test_tflops.sh xlml {tflop_thresholds['v4-8']['1']} gs://maxtext-xlml gs://maxtext-xlml/dataset xlml-tflops-v4-8-1slice-{test_mode.value}",
),
docker_image=DOCKER_IMAGE[test_mode].value,
test_owner=test_owner.PRIYANKA_G,
test_owner=test_owner.AIRFLOW,
).run()

# v4-16 1 and 2 slice TFLOPS test
Expand All @@ -112,7 +112,7 @@
),
cluster=XpkClusters.TPU_V4_16_CLUSTER,
docker_image=DOCKER_IMAGE[test_mode].value,
test_owner=test_owner.PRIYANKA_G,
test_owner=test_owner.AIRFLOW,
).run()

# v4-16 two slices determinism test
Expand Down Expand Up @@ -164,5 +164,5 @@
f"bash end_to_end/tpu/test_checkpoint_resharding.sh xlml-checkpoint-resharding-v4-8-2slice-{SetupMode.STABLE.value} gs://maxtext-xlml gs://maxtext-xlml/dataset",
),
docker_image=DOCKER_IMAGE[SetupMode.STABLE].value,
test_owner=test_owner.PRIYANKA_G,
test_owner=test_owner.AIRFLOW,
).run()
4 changes: 2 additions & 2 deletions dags/multipod/maxtext_configs_aot_hybridsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def hybridsim_compile_and_run(test_group_id):
test_name=f"maxtext-{model_size}-{n}xv{tpu.value}-{num_cores}-aot",
run_model_cmds=aot_cmd,
docker_image=DockerImage.MAXTEXT_TPU_JAX_NIGHTLY.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
).run(gcs_location=shared_gcs_location)

# Run HybridSim workload: read HLO from GCS, generate estimated step time
Expand All @@ -75,7 +75,7 @@ def hybridsim_compile_and_run(test_group_id):
test_name=f"maxtext-{model_size}-{n}xv{tpu.value}-{num_cores}-hybridsim",
run_model_cmds=hybridsim_cmd,
docker_image=DockerImage.CLOUD_HYBRIDSIM_NIGHTLY.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
user_specified_job_metric_config=job_metric_config,
).run(gcs_location=shared_gcs_location)

Expand Down
4 changes: 2 additions & 2 deletions dags/multipod/maxtext_gpu_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def run_maxtext_tests(dag: models.DAG):
num_slices=nnodes,
cluster=XpkClusters.GPU_A3_CLUSTER,
docker_image=DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK.value,
test_owner=test_owner.YUWEI_Y,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)
stable_a3plus_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config(
time_out_in_min=300,
Expand All @@ -216,7 +216,7 @@ def run_maxtext_tests(dag: models.DAG):
num_slices=nnodes,
cluster=XpkClusters.GPU_A3PLUS_CLUSTER,
docker_image=DockerImage.MAXTEXT_GPU_JAX_STABLE_STACK.value,
test_owner=test_owner.YUWEI_Y,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)
stable_a3_gpu >> stable_a3plus_gpu

Expand Down
2 changes: 1 addition & 1 deletion dags/multipod/maxtext_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@
test_name=f"maxtext-profiling-{mode.value}",
run_model_cmds=profiling_cmds,
docker_image=image.value,
test_owner=test_owner.SURBHI_J,
test_owner=test_owner.BRANDEN_V,
).run()
2 changes: 1 addition & 1 deletion dags/multipod/maxtext_trillium_configs_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@

maxtext_sweep_gke_test = (
maxtext_sweep_gke_config.get_maxtext_sweep_gke_config(
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
dataset_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
composer_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
dataset_name=metric_config.DatasetOption.XLML_DATASET,
Expand Down
4 changes: 2 additions & 2 deletions dags/multipod/maxtext_v5e_configs_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
]
maxtext_sweep_gke_test = (
maxtext_sweep_gke_config.get_maxtext_sweep_gke_config(
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
dataset_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
composer_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
dataset_name=metric_config.DatasetOption.XLML_DATASET,
Expand Down Expand Up @@ -107,7 +107,7 @@
]
maxtext_sweep_gke_test = (
maxtext_sweep_gke_config.get_maxtext_sweep_gke_config(
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
dataset_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
composer_project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
dataset_name=metric_config.DatasetOption.XLML_DATASET,
Expand Down
8 changes: 4 additions & 4 deletions dags/multipod/mxla_maxtext_nightly_gke.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
time_out_in_min=60,
test_name=default_test_name,
docker_image=jax_nightly_image.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)

maxtext_nightly_2slice_v5p_8 = gke_config.get_gke_maxtext_nightly_config(
Expand All @@ -61,7 +61,7 @@
time_out_in_min=60,
test_name=default_test_name,
docker_image=jax_nightly_image.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)

maxtext_nightly_4slice_v5p_8 = gke_config.get_gke_maxtext_nightly_config(
Expand All @@ -70,7 +70,7 @@
time_out_in_min=60,
test_name=default_test_name,
docker_image=jax_nightly_image.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)

maxtext_nightly_8slice_v5p_8 = gke_config.get_gke_maxtext_nightly_config(
Expand All @@ -79,7 +79,7 @@
time_out_in_min=60,
test_name=default_test_name,
docker_image=jax_nightly_image.value,
test_owner=test_owner.RAYMOND_Z,
test_owner=test_owner.AIRFLOW,
).run_with_quarantine(quarantine_task_group)

# v6e tests
Expand Down
4 changes: 2 additions & 2 deletions dags/multipod/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@
)
pytorch_config.get_nightly_pytorch_config(
test_name="shardings",
test_owner=test_owner.JON_B,
test_owner=test_owner.AIRFLOW,
run_commands=run_cmds,
cluster=cluster,
num_slices=num_slices,
).run()

pytorch_config.get_nightly_pytorch_config(
test_name="checkpoint",
test_owner=test_owner.JON_B,
test_owner=test_owner.AIRFLOW,
run_commands=(
f"export CHKPT_PATH={metric_config.SshEnvVars.GCS_OUTPUT.value}",
"pip install gcsfs",
Expand Down
4 changes: 2 additions & 2 deletions dags/pytorch_xla/configs/pytorchxla_torchbench_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def get_torchbench_tpu_config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_script_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.PEI_Z,
task_owner=test_owner.AIRFLOW,
gcs_subfolder=f"{GCS_SUBFOLDER_PREFIX}/torchbench",
)

Expand Down Expand Up @@ -523,7 +523,7 @@ def get_torchbench_gpu_config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_script_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.PEI_Z,
task_owner=test_owner.AIRFLOW,
gcs_subfolder=f"{GCS_SUBFOLDER_PREFIX}/torchbench",
use_existing_instance=False,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get_tf_keras_config(
set_up_cmds=set_up_cmds,
run_model_cmds=run_model_cmds,
timeout=datetime.timedelta(minutes=time_out_in_min),
task_owner=test_owner.ERIC_L,
task_owner=test_owner.AIRFLOW,
)

return task.run_queued_resource_test(
Expand Down