Skip to content

Commit 8fa9182

Browse files
committed
Merge branch 'sbosisio/axlearn_improvements' of github.com:NVIDIA/JAX-Toolbox into sbosisio/axlearn_improvements
2 parents c47a318 + 4fc790d commit 8fa9182

File tree

78 files changed

+334
-2262
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+334
-2262
lines changed

.github/container/Dockerfile.jax

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,12 @@ ADD build-jax.sh local_cuda_arch pytest-xdist.sh test-jax.sh /usr/local/bin/
9898
RUN mkdir -p /opt/pip-tools.d
9999

100100
## Editable installations of jax and jaxlib
101+
# Note that jax now is an independent wheel, extra [k8s] needs to be from build path also
101102
RUN <<"EOF" bash -ex
102103
for component in $(ls ${BUILD_PATH_JAXLIB}); do
103104
echo "-e file://${BUILD_PATH_JAXLIB}/${component}" >> /opt/pip-tools.d/requirements-jax.in;
104105
done
105-
echo "-e file://${SRC_PATH_JAX}[k8s]" >> /opt/pip-tools.d/requirements-jax.in
106+
echo "-e file://${BUILD_PATH_JAXLIB}/jax[k8s]" >> /opt/pip-tools.d/requirements-jax.in
106107
EOF
107108

108109
## Flax

.github/container/Dockerfile.mjx

Lines changed: 0 additions & 54 deletions
This file was deleted.

.github/container/build-jax.sh

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ pushd ${SRC_PATH_JAX}
288288
time python "${SRC_PATH_JAX}/build/build.py" build \
289289
--editable \
290290
--use_clang \
291-
--wheels=jaxlib,jax-cuda-plugin,jax-cuda-pjrt \
291+
--use_new_wheel_build_rule \
292+
--wheels=jax,jaxlib,jax-cuda-plugin,jax-cuda-pjrt \
292293
--cuda_compute_capabilities=$TF_CUDA_COMPUTE_CAPABILITIES \
293294
--bazel_options=--linkopt=-fuse-ld=lld \
294295
--local_xla_path=$SRC_PATH_XLA \
@@ -298,12 +299,13 @@ popd
298299

299300
# Make sure that JAX depends on the local jaxlib installation
300301
# https://jax.readthedocs.io/en/latest/developer.html#specifying-dependencies-on-local-wheels
301-
line="jaxlib @ file://${BUILD_PATH_JAXLIB}/jaxlib"
302+
line="jax @ file://${BUILD_PATH_JAXLIB}/jax"
302303
if ! grep -xF "${line}" "${SRC_PATH_JAX}/build/requirements.in"; then
303304
pushd "${SRC_PATH_JAX}"
304305
echo "${line}" >> build/requirements.in
305-
echo "jax-cuda${TF_CUDA_MAJOR_VERSION}-pjrt @ file://${BUILD_PATH_JAXLIB}/jax-cuda-pjrt" >> build/requirements.in
306-
echo "jax-cuda${TF_CUDA_MAJOR_VERSION}-plugin @ file://${BUILD_PATH_JAXLIB}/jax-cuda-plugin" >> build/requirements.in
306+
echo "jaxlib @ file://${BUILD_PATH_JAXLIB}/jaxlib" >> build/requirements.in
307+
echo "jax-cuda${TF_CUDA_MAJOR_VERSION}-pjrt @ file://${BUILD_PATH_JAXLIB}/jax_cuda${TF_CUDA_MAJOR_VERSION}_pjrt" >> build/requirements.in
308+
echo "jax-cuda${TF_CUDA_MAJOR_VERSION}-plugin @ file://${BUILD_PATH_JAXLIB}/jax_cuda${TF_CUDA_MAJOR_VERSION}_plugin" >> build/requirements.in
307309
PYTHON_VERSION=$(python -c 'import sys; print("{}.{}".format(*sys.version_info[:2]))')
308310
bazel run --verbose_failures=true //build:requirements.update --repo_env=HERMETIC_PYTHON_VERSION="${PYTHON_VERSION}"
309311
popd
@@ -318,13 +320,13 @@ else
318320
fi
319321

320322
# install jax and jaxlib
321-
pip --disable-pip-version-check install -e ${BUILD_PATH_JAXLIB}/jaxlib -e ${BUILD_PATH_JAXLIB}/jax-cuda-pjrt -e ${BUILD_PATH_JAXLIB}/jax-cuda-plugin -e "${SRC_PATH_JAX}"
323+
pip --disable-pip-version-check install -e ${BUILD_PATH_JAXLIB}/jaxlib -e ${BUILD_PATH_JAXLIB}/jax_cuda${TF_CUDA_MAJOR_VERSION}_pjrt -e ${BUILD_PATH_JAXLIB}/jax_cuda${TF_CUDA_MAJOR_VERSION}_plugin -e ${BUILD_PATH_JAXLIB}/jax
322324

323325
## after installation (example)
324-
# jax 0.4.36.dev20241125+f828f2d7d /opt/jax
325-
# jax-cuda12-pjrt 0.4.36.dev20241125 /opt/jaxlibs/jax-cuda-pjrt
326-
# jax-cuda12-plugin 0.4.36.dev20241125 /opt/jaxlibs/jax-cuda-plugin
327-
# jaxlib 0.4.36.dev20241125 /opt/jaxlibs/jaxlib
326+
# jax 0.5.4.dev20250325 /opt/jaxlibs/jax
327+
# jax-cuda12-pjrt 0.5.4.dev20250325 /opt/jaxlibs/jax_cuda12_pjrt
328+
# jax-cuda12-plugin 0.5.4.dev20250325 /opt/jaxlibs/jax_cuda12_plugin
329+
# jaxlib 0.5.4.dev20250325 /opt/jaxlibs/jaxlib
328330
pip list | grep jax
329331

330332
# Ensure directories are readable by all for non-root users

.github/container/manifest.yaml

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ seqio:
7171
tracking_ref: main
7272
latest_verified_commit: 11706e4a1e01a81ea6b3e02c5ad147028d5b94bb
7373
mode: pip-vcs
74+
google-jetstream:
75+
url: https://github.com/AI-Hypercomputer/JetStream.git
76+
tracking_ref: main
77+
latest_verified_commit: b8b9cb2ea4668da2c5012fc4c7ba958424d82ac9
78+
mode: pip-vcs
7479
maxtext:
7580
url: https://github.com/google/maxtext.git
7681
tracking_ref: main
@@ -86,21 +91,6 @@ haliax:
8691
tracking_ref: main
8792
latest_verified_commit: 2a696a0c971901ff93afdaa965959d8e3b982ba9
8893
mode: git-clone
89-
mujoco:
90-
url: https://github.com/google-deepmind/mujoco.git
91-
tracking_ref: main
92-
latest_verified_commit: e95159b4f6d48d114b16a8dc13ad26b3e44bc3e2
93-
mode: git-clone
94-
mujoco-mpc:
95-
url: https://github.com/google-deepmind/mujoco_mpc.git
96-
tracking_ref: main
97-
latest_verified_commit: 4700f4a13be18398f5aaf6a33ed42e531967e3ae
98-
mode: git-clone
99-
language-to-reward-2023:
100-
url: https://github.com/google-deepmind/language_to_reward_2023.git
101-
tracking_ref: main
102-
latest_verified_commit: abb8e5125e4ecd0da378490b73448c05a694def5
103-
mode: git-clone
10494
mlperf-logging:
10595
url: https://github.com/mlcommons/logging.git
10696
tracking_ref: master

.github/container/nsys_jax/nsys_jax/analyses/Analysis.ipynb

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
" xla_module_metadata,\n",
2222
")\n",
2323
"import matplotlib.pyplot as plt\n",
24-
"import numpy as np"
24+
"import numpy as np\n",
25+
"import pathlib"
2526
]
2627
},
2728
{
@@ -33,6 +34,7 @@
3334
"source": [
3435
"# Set the input data to use. default_data_prefix() checks the NSYS_JAX_DEFAULT_PREFIX environment variable, and if that is\n",
3536
"# not set then the current working directory is used. Use pathlib.Path if setting this explicitly.\n",
37+
"prefix = pathlib.Path(\".\") # modify this and comment out the next line\n",
3638
"prefix = default_data_prefix()"
3739
]
3840
},
@@ -128,15 +130,14 @@
128130
"id": "7727d800-13d3-4505-89e8-80a5fed63512",
129131
"metadata": {},
130132
"source": [
131-
"Here the index has four levels. `ProgramId`, `ProgramExecution` and `Device` have the same meanings as in `steady_state.module`.\n",
132-
"The fourth level (in the 3rd position) shows that this row is the `ThunkIndex`-th thunk within the `ProgramExecution`-th execution of XLA module `ProgramId`.\n",
133-
"Note that a given thunk can be executed multiple times within the same module, so indexing on the thunk name would not be unique.\n",
133+
"Here the index has five levels. `ProgramId`, `ProgramExecution` and `Device` have the same meanings as in `steady_state.module`.\n",
134+
"The two new levels, `Name` and `ThunkExecution`, show that a given row is the `ThunkExecution`-th execution within the `ProgramExecution`-th execution of XLA module `ProgramId` of thunk `Name`.\n",
135+
"The `ThunkExecution` value is needed because a given thunk can be executed multiple times within the same module.\n",
136+
"The `Name` of a thunk can be used, along with a `ProgramId`, to look up XLA metadata.\n",
134137
"\n",
135138
"The columns are as follows:\n",
136-
"- `Name`: the name of the thunk; this should be unique within a given `ProgramId` and can be used as a key to look up XLA metadata\n",
137139
"- `ProjStartMs`: see above, same meaning as in `steady_state.module`.\n",
138140
"- `Communication`: does this thunk represent communication between GPUs (*i.e.* a NCCL collective)? XLA overlaps communication and computation kernels, and `load_profiler_data` triggers an overlap calculation. `ProjDurMs` for a communication kernel shows only the duration that was **not** overlapped with computation kernels, while `ProjDurHiddenMs` shows the duration that **was** overlapped.\n",
139-
"- This is the `ThunkExecution`-th execution of this thunk for this `(ProgramId, ProgramExecution, Device)`\n",
140141
"\n",
141142
"The third data frame does not show any GPU execution, but is rather a host-side trace:"
142143
]
@@ -178,7 +179,7 @@
178179
"id": "2e82c357-4e9d-48e4-b758-fa5357b2c8bd",
179180
"metadata": {},
180181
"source": [
181-
"The index structure, and many of the columns, are equivalent to `thunk_df`. Additional columns are:\n",
182+
"The index structure, and many of the columns, are equivalent to the `.thunk` data frame. Additional columns are:\n",
182183
"\n",
183184
"- `MessageSize`: the message size of the collective in bytes; this aims to follow the same conventions as the NCCL tests\n",
184185
"- `Collective`: the type of collective communication\n",
@@ -524,7 +525,9 @@
524525
" # program, there may be different sub-groupings that are participating in smaller\n",
525526
" # collectives in the strict/NCCL sense. TODO: it would be better to identify those\n",
526527
" # sub-groupings and group them, but we currently lack the relevant information.\n",
527-
" collective_df = df.groupby([\"ProgramId\", \"ProgramExecution\", \"ThunkIndex\"])\n",
528+
" collective_df = df.groupby(\n",
529+
" [\"ProgramId\", \"ProgramExecution\", \"Name\", \"ThunkExecution\"]\n",
530+
" )\n",
528531
" # Take the fastest device kernel as a proxy for the actual bandwidth of the\n",
529532
" # collective.\n",
530533
" bandwidth_df = collective_df.agg(\n",
@@ -534,7 +537,6 @@
534537
" \"ProjStartMs\": \"min\",\n",
535538
" \"ProjDurFullMs\": \"min\",\n",
536539
" \"ProjEndMs\": \"max\",\n",
537-
" \"Name\": \"count\",\n",
538540
" }\n",
539541
" )\n",
540542
" axs[0].plot(\n",
@@ -582,9 +584,9 @@
582584
"\n",
583585
"# Calculate statistics over different devices and different executions of each thunk, including multiple executions of the same thunk within the same module\n",
584586
"compute_durations = steady_state.thunk.loc[\n",
585-
" ~steady_state.thunk[\"Communication\"], (\"Name\", \"ProjDurMs\")\n",
587+
" ~steady_state.thunk[\"Communication\"], \"ProjDurMs\"\n",
586588
"].groupby([\"ProgramId\", \"Name\"])\n",
587-
"compute_duration_stats = compute_durations[\"ProjDurMs\"].agg((\"mean\", \"std\"))\n",
589+
"compute_duration_stats = compute_durations.agg((\"mean\", \"std\"))\n",
588590
"compute_duration_means = compute_duration_stats[\"mean\"]\n",
589591
"compute_duration_rel_stds = compute_duration_stats[\"std\"] / compute_duration_means\n",
590592
"\n",
@@ -634,8 +636,7 @@
634636
"\n",
635637
"def durations_ms(idx):\n",
636638
" program_id, thunk_name = idx\n",
637-
" tmp = steady_state.thunk.loc[program_id, (\"Name\", \"ProjDurMs\")]\n",
638-
" return tmp.loc[tmp[\"Name\"] == thunk_name, \"ProjDurMs\"]\n",
639+
" return steady_state.thunk.loc[(program_id, slice(None), thunk_name), \"ProjDurMs\"]\n",
639640
"\n",
640641
"\n",
641642
"detailed_index = high_variance_means[high_variance_means > mean_threshold].index\n",
@@ -666,6 +667,7 @@
666667
" squeeze=False,\n",
667668
" tight_layout=True,\n",
668669
" )\n",
670+
" # Compute (non-comm) kernel timings\n",
669671
" time_df = steady_state.thunk.loc[\n",
670672
" ~steady_state.thunk[\"Communication\"], (\"ProjStartMs\", \"ProjDurMs\")\n",
671673
" ]\n",
@@ -688,14 +690,17 @@
688690
" ):\n",
689691
" # Mean over devices to get a single [thunk0_start, thunk0_end, thunk1_start, ...]\n",
690692
" # array for this execution of this module\n",
691-
" mean_times = interleave(exec_df.groupby(\"ThunkIndex\").agg(\"mean\"))\n",
693+
" mean_times = interleave(\n",
694+
" exec_df.groupby([\"Name\", \"ThunkExecution\"], sort=False).agg(\"mean\")\n",
695+
" )\n",
692696
" # x axis of the plot will be the average over executions of the module\n",
693697
" x_values.append(mean_times - mean_times[0])\n",
694698
" for device, device_values in exec_df.groupby(\"Device\"):\n",
695699
" # [thunk0_start, thunk0_end, ...] array for one device within one module exec\n",
696700
" # with the average over devices subtracted\n",
697701
" y_values[device].append(interleave(device_values) - mean_times)\n",
698702
" mean_start_time_ms = np.mean(x_values, axis=0)\n",
703+
" # all_values: (num_devices, num_module_executions, thunks_per_module)\n",
699704
" all_values = np.array(list(y_values.values()))\n",
700705
" ax.plot(\n",
701706
" mean_start_time_ms,\n",
@@ -728,18 +733,17 @@
728733
" exec_df[\"ProjEndMs\"]\n",
729734
" - steady_state.module.loc[(program_id, module_execution), \"ProjStartMs\"]\n",
730735
" )\n",
731-
" tmp = exec_df.groupby(\"ThunkIndex\").agg(\n",
736+
" tmp = exec_df.groupby([\"Name\", \"ThunkExecution\"]).agg(\n",
732737
" {\n",
733-
" \"Name\": \"first\",\n",
734738
" \"Collective\": \"first\",\n",
735739
" \"CollectiveSize\": \"first\",\n",
736740
" \"EndInModuleMs\": \"mean\",\n",
737741
" }\n",
738742
" )\n",
739743
" for coll_size, values in tmp.groupby(\"CollectiveSize\"):\n",
740744
" comm_x_values[coll_size].append(values[\"EndInModuleMs\"])\n",
741-
" (_, xmax), (ymin, ymax) = ax.get_xlim(), ax.get_ylim()\n",
742-
" ax.set_xlim(0, xmax)\n",
745+
" ymin, ymax = ax.get_ylim()\n",
746+
" ax.set_xlim(mean_start_time_ms[0], mean_start_time_ms[-1])\n",
743747
" ax.set_ylim(ymin, ymax)\n",
744748
" largest_collective = max(comm_x_values.keys())\n",
745749
" for n_color, (coll_size, values) in enumerate(comm_x_values.items()):\n",
@@ -748,10 +752,10 @@
748752
" collective_times,\n",
749753
" ymin,\n",
750754
" # Draw taller vertical lines for collectives involving more devices\n",
751-
" ymin * (1 - coll_size / largest_collective),\n",
755+
" ymin * (1 - 0.75 * coll_size / largest_collective),\n",
752756
" color=f\"C{n_color}\",\n",
753757
" label=f\"{coll_size}-device collective\",\n",
754-
" linestyle=\"--\",\n",
758+
" linestyle=\"-\",\n",
755759
" )\n",
756760
"\n",
757761
" ax.set_title(\n",
@@ -836,7 +840,9 @@
836840
"outputs": [],
837841
"source": [
838842
"num_traces = {\n",
839-
" module_id: xla_module_metadata(module_id, policy=\"all\").unique_result(\n",
843+
" module_id: xla_module_metadata(\n",
844+
" module_id, policy=\"all\", prefix=prefix\n",
845+
" ).unique_result(\n",
840846
" lambda hlo_module: len(\n",
841847
" hlo_module.proto().buffer_assignment.heap_simulator_traces\n",
842848
" )\n",
@@ -855,7 +861,7 @@
855861
" squeeze=False,\n",
856862
")\n",
857863
"for n_module, module_id in enumerate(module_ids_with_traces):\n",
858-
" protos = xla_module_metadata(module_id, policy=\"all\")\n",
864+
" protos = xla_module_metadata(module_id, policy=\"all\", prefix=prefix)\n",
859865
" sizes_by_logical_id = protos.unique_result(\n",
860866
" lambda proto: {\n",
861867
" buffer.id: buffer.size\n",

.github/container/nsys_jax/nsys_jax/analyses/communication.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ def process_communication_data(steady_state):
3838
collective_types.add(collective)
3939
# This grouped data frame will have a row for each device that is participating
4040
# in this instance of the collective.
41-
devices = df.groupby(["ProgramId", "ProgramExecution", "ThunkIndex"])
41+
devices = df.groupby(
42+
["ProgramId", "ProgramExecution", "Name", "ThunkExecution"]
43+
)
4244
# Take the fastest device bandwidth. Rationale: the slower devices appear
4345
# slower because they spend some time waiting for the last device, and then all
4446
# devices complete the collective at the same time. The fastest device is
@@ -134,8 +136,7 @@ def process_hidden_ms_to_total_ms(steady_state):
134136
for collective, df in grouped_data:
135137
collective_types.add(collective)
136138
total_ms = df["ProjDurMs"] + df["ProjDurHiddenMs"]
137-
mean_dur_hidden_ms_to_total_ms = (df["ProjDurHiddenMs"] / total_ms).mean()
138-
summary_data[collective] = mean_dur_hidden_ms_to_total_ms
139+
summary_data[collective] = df["ProjDurHiddenMs"].sum() / total_ms.sum()
139140

140141
return collective_types, summary_data
141142

@@ -253,8 +254,7 @@ def main():
253254
# Load the profiler data; the compilation part is needed for the warmup heuristics
254255
all_data = load_profiler_data(args.prefix, frames={"communication", "compile"})
255256
# Align timestamps
256-
all_data, alignment_metadata = align_profiler_data_timestamps(all_data)
257-
print(f"Alignment metadata: {alignment_metadata}")
257+
all_data, _ = align_profiler_data_timestamps(all_data)
258258
# Partition the profile data into initialisation and steady-state running
259259
_, steady_state = apply_warmup_heuristics(all_data)
260260

0 commit comments

Comments (0)